In [None]:
!pip install catboost

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn below.
sns.set_style('whitegrid')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used below lives under it.
drive.mount('/content/drive')

In [None]:
# Location of the symptoms workbook on the mounted Google Drive.
Mental_disorders_path = "/content/drive/MyDrive/senior project/Mental disorder symptoms.xlsx"
# Load the workbook into the DataFrame that the rest of the notebook works on.
df = pd.read_excel(Mental_disorders_path)

In [None]:
# Summary statistics of the loaded data.
df.describe()

In [None]:
# Distinct values of the Disorder column, as found in the raw data.
a = df['Disorder'].unique()
a

In [None]:
# Give the awkwardly named columns shorter names in a single rename call.
df = df.rename(columns={
    'ag+1:629e': 'age',
    'having.trouble.in.sleeping': 'trouble.sleeping',
    'having.trouble.with.work': 'trouble.with.work',
    'having.nightmares': 'nightmares',
})

# Display only: the result is not assigned, so 'age' remains a regular column of df.
df.set_index(['age'])

In [None]:
# Distribution of ages, with a kernel density estimate drawn over the histogram
sns.histplot(df['age'], kde=True)

In [None]:
# Total occurrences of each symptom: column-wise sum over columns 1..26 of df
symptom_incidences = df.iloc[:, 1:27].sum()

In [None]:
# Bar chart of the per-symptom totals computed above.
symptom_incidences.plot.bar(
    figsize=(12, 6),
    title='Incidences of Various Mental Health Symptoms',
)

In [None]:
# Distinct disorder labels, in order of first appearance in the data.
# NOTE(review): this order is not necessarily the class order of a fitted model,
# so this array should not be used to index a model's probability columns.
Disorders = df['Disorder'].unique()
Disorders

In [None]:
# Fix the two misspelt disorder names in the label column.
df.Disorder = (
    df.Disorder
    .str.replace('psychotic deprission', 'psychotic depression')
    .str.replace('anexiety', 'anxiety')
)

In [None]:
# All column names except 'age', kept for later use.
# Note that only 'age' is removed, so the 'Disorder' label column is still in the list.
symptoms = df.columns.tolist()
symptoms.remove('age')

In [None]:
# Count the rows per diagnosed disorder and show them as a bar chart.
# This assumes each person appears once with a single diagnosed disorder;
# secondary diagnoses - which are common - are ignored.
incidences = df['Disorder'].value_counts()
incidences.plot(kind='bar')

In [None]:
# Show every row that has a missing value in at least one column.
df.loc[df.isna().any(axis=1)]
# None found

In [None]:
# Look for any symptom incidence values that are neither 0 nor 1.
# isin([0, 1]) also flags negative, fractional and missing values, which a
# plain "> 1" comparison would let through.
errors = df[~df.iloc[:, 1:27].isin([0, 1]).all(axis=1)]
errors
# An empty result means every symptom value is a valid 0/1 flag.

In [None]:
# Reminder before ML starts: preview the first rows of the prepared DataFrame
df.head()

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
# X = df.drop(['Disorder'], axis=1)
# y = pd.get_dummies(df['Disorder'])
# Features are columns 1..26 of df; the target is the Disorder label.
X = df.iloc[:, 1:27]
y = df['Disorder']
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Names of the feature columns selected into X.
X.columns

In [None]:
# prompt: import catboost model

from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split


# Initialize the CatBoostClassifier model
model = CatBoostClassifier(iterations=5000,
                         learning_rate=0.05,
                         depth=3,
                         loss_function='MultiClass',
                         eval_metric='Accuracy',
                         l2_leaf_reg=1)

# Train the model, treating every feature column as categorical
model.fit(X_train, y_train,
         cat_features=list(range(len(X.columns))),
         verbose=False)

# Class probabilities for the test set; the columns follow model.classes_
y_pred = model.predict_proba(X_test)

# np.argmax returns column indices, not labels. Translate them through
# model.classes_ so predictions are comparable with the string labels in y_test.
y_predd = np.asarray(model.classes_)[np.argmax(y_pred, axis=1)]

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_predd)
print('Catboost accuracy:', accuracy)
print(confusion_matrix(y_test, y_predd))
print(classification_report(y_test, y_predd))



In [None]:
import pickle

# Serialize the fitted CatBoost model to catboost_model.pkl (path relative to the working directory).
# NOTE(review): only unpickle this file from a trusted source - pickle.load can execute arbitrary code.
with open('catboost_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Map each test row's predicted probabilities to its three most likely disorders.

predicted_disorders = model.predict(X_test)

probability = model.predict_proba(X_test)

# The columns of predict_proba follow model.classes_, so class names must be
# looked up there. The order of df.Disorder.unique() is unrelated to the
# model's class order and would attach the wrong names to the probabilities.
class_labels = np.asarray(model.classes_)

# Collect one record per row and build the DataFrame once, instead of growing
# it row by row.
top_3_records = []
for row_probs in probability:
    # Indices of the three largest probabilities, most likely first
    top_disorder_indices = np.argsort(row_probs)[::-1][:3]
    record = {}
    for rank, idx in enumerate(top_disorder_indices, start=1):
        record[f'Disorder_{rank}'] = class_labels[idx]
        record[f'Probability_{rank}'] = row_probs[idx]
    top_3_records.append(record)

top_3_disorders = pd.DataFrame(
    top_3_records,
    columns=['Disorder_1', 'Probability_1', 'Disorder_2', 'Probability_2', 'Disorder_3', 'Probability_3'],
)

# Print the top 3 predicted disorders for all rows
print(top_3_disorders)
# Print the top 3 predicted disorders for all rows


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# 1. Prepare data
# Uses the X_train / X_test / y_train / y_test split created earlier in the notebook

# 2. Initialize the model
# A fixed random_state makes the fitted tree, and so the reported accuracy,
# repeatable from run to run (the seed matches the one used for the split).
my_model = DecisionTreeClassifier(random_state=42)

# 3. Train the model
my_model.fit(X_train, y_train)

# 4. Make predictions
y_pred = my_model.predict(X_test)

# 5. Evaluate the model
accuracy_dt = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy_dt)

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt

# Train a default-parameter CatBoost model and compare its test accuracy with
# the decision tree's. Assumes df is already loaded and cleaned by the cells above.

# Prepare the data
X = df.iloc[:, 1:27]
y = df['Disorder']

# Split the data (same test size and seed as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train CatBoost model
catboost_model = CatBoostClassifier(verbose=0)  # verbose=0 to suppress output
catboost_model.fit(X_train, y_train)

# Predict and evaluate CatBoost model
y_pred_catboost = catboost_model.predict(X_test)
catboost_accuracy = accuracy_score(y_test, y_pred_catboost)
print(f"CatBoost Accuracy: {catboost_accuracy}")


# Plotting the accuracies
# No XGBoost model is trained anywhere in this notebook, so the comparison uses
# the decision tree accuracy (accuracy_dt) computed in the decision tree cell;
# referencing an undefined xgboost_accuracy would raise a NameError.
models = ['CatBoost', 'Decision Tree']
accuracies = [catboost_accuracy, accuracy_dt]

plt.figure(figsize=(8, 5))
plt.bar(models, accuracies, color=['blue', 'green'])
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Model Accuracies')
plt.ylim(0, 1)  # Accuracy is a fraction between 0 and 1
plt.show()