# 1. **Dataset Analysis,EDA and model development**

**Original Dataset**: https://drive.google.com/file/d/1OJqKfwibxydSmus-bvPjhxJ0fKYWJDuI/view?usp=drive_link

**Information about dataset and column discriptions**: https://colab.research.google.com/drive/1dUJ0Y0R8RJb_8cIFeOXcFoLv4ech8e-n?usp=drive_link


In [None]:
import pandas as pd
d=pd.read_csv('/content/Updated_Depression_Dataset.csv')
d.head().T

Unnamed: 0,0,1,2,3,4
Age,23,39,28,65,33
SleepDuration,6.9,6.0,6.3,8.7,4.8
Genotype_5HTTLPR,Short/Short,Long/Long,Short/Short,Long/Long,Short/Short
Genotype_COMT,Val/Val,Met/Met,Val/Val,Val/Val,Val/Val
Genotype_MAOA,Low Activity,Low Activity,High Activity,Low Activity,Low Activity
Cortisol,9.6,10.3,16.1,9.0,10.9
BDNF_Level,13.04,14.13,10.84,14.45,9.53
CRP,1.87,1.17,2.91,1.71,2.35
Vitamin_D,-0.3,19.9,30.9,29.8,22.6
Tryptophan,56.8,32.9,59.9,32.1,51.7


In [None]:
#checking shape
d.shape

(1000, 18)

In [None]:
#value counts of target column
d["DepressionDiagnosis"].value_counts()

Unnamed: 0_level_0,count
DepressionDiagnosis,Unnamed: 1_level_1
False,500
Persistent Depressive Disorder,116
Major Depressive Disorder,103
Atypical Depression,95
Psychotic Depression,94
Seasonal Affective Disorder,92


In [None]:
#information about dataset
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      1000 non-null   int64  
 1   SleepDuration            1000 non-null   float64
 2   Genotype_5HTTLPR         1000 non-null   object 
 3   Genotype_COMT            1000 non-null   object 
 4   Genotype_MAOA            1000 non-null   object 
 5   Cortisol                 1000 non-null   float64
 6   BDNF_Level               1000 non-null   float64
 7   CRP                      1000 non-null   float64
 8   Vitamin_D                1000 non-null   float64
 9   Tryptophan               1000 non-null   float64
 10  Omega3_Index             1000 non-null   float64
 11  MTHFR_Genotype           1000 non-null   object 
 12  Neuroinflammation_Score  1000 non-null   float64
 13  Monoamine_Oxidase_Level  1000 non-null   float64
 14  Serotonin_Level          

In [None]:
#checking null/empty values
d.isna().sum()

Unnamed: 0,0
Age,0
SleepDuration,0
Genotype_5HTTLPR,0
Genotype_COMT,0
Genotype_MAOA,0
Cortisol,0
BDNF_Level,0
CRP,0
Vitamin_D,0
Tryptophan,0


In [None]:
#checking datatype of column
d['Genotype_MAOA'].dtype

dtype('O')

In [None]:
pd.api.types.is_object_dtype(d["Genotype_MAOA"])

True

In [None]:
#Finding the columns which contains string in our dataset
for label,content in d.items():
  if pd.api.types.is_object_dtype(content):
    print(label)

Genotype_5HTTLPR
Genotype_COMT
Genotype_MAOA
MTHFR_Genotype
DepressionDiagnosis


In [None]:
# converting strings to category format
for label,content in d.items():
  if pd.api.types.is_string_dtype(content) or pd.api.types.is_object_dtype(content):
    d[label]=content.astype("category").cat.as_ordered()

In [None]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   Age                      1000 non-null   int64   
 1   SleepDuration            1000 non-null   float64 
 2   Genotype_5HTTLPR         1000 non-null   category
 3   Genotype_COMT            1000 non-null   category
 4   Genotype_MAOA            1000 non-null   category
 5   Cortisol                 1000 non-null   float64 
 6   BDNF_Level               1000 non-null   float64 
 7   CRP                      1000 non-null   float64 
 8   Vitamin_D                1000 non-null   float64 
 9   Tryptophan               1000 non-null   float64 
 10  Omega3_Index             1000 non-null   float64 
 11  MTHFR_Genotype           1000 non-null   category
 12  Neuroinflammation_Score  1000 non-null   float64 
 13  Monoamine_Oxidase_Level  1000 non-null   float64 
 14  Serotonin

In [None]:
d.DepressionDiagnosis.cat.categories
d.Genotype_5HTTLPR.cat.categories
d.Genotype_COMT.cat.categories
d.Genotype_MAOA.cat.categories
d.MTHFR_Genotype.cat.categories

Index(['CC', 'CT', 'TT'], dtype='object')

In [None]:
#pandas storing `DepressionDiagnosis` as category rather than object types
d.DepressionDiagnosis.cat.codes
d.Genotype_5HTTLPR.cat.codes
d.Genotype_COMT.cat.codes
d.Genotype_MAOA.cat.codes
d.MTHFR_Genotype.cat.codes

Unnamed: 0,0
0,2
1,0
2,1
3,1
4,2
...,...
995,2
996,0
997,2
998,1


In [None]:
#creating new column with numerical codes for all string columns

# Making New Colums
d['DepressionDiagnosis_Codes']=d['DepressionDiagnosis'].cat.codes
d['Genotype_5HTTLPR_Codes']=d['Genotype_5HTTLPR'].cat.codes
d['Genotype_COMT_Codes']=d['Genotype_COMT'].cat.codes
d['Genotype_MAOA_Codes']=d['Genotype_MAOA'].cat.codes
d['MTHFR_Genotype_Codes']=d['MTHFR_Genotype'].cat.codes

# Droping the original string colum
d=d.drop('DepressionDiagnosis',axis=1)
d=d.drop('Genotype_5HTTLPR',axis=1)
d=d.drop('Genotype_COMT',axis=1)
d=d.drop('Genotype_MAOA',axis=1)
d=d.drop('MTHFR_Genotype',axis=1)

In [None]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        1000 non-null   int64  
 1   SleepDuration              1000 non-null   float64
 2   Cortisol                   1000 non-null   float64
 3   BDNF_Level                 1000 non-null   float64
 4   CRP                        1000 non-null   float64
 5   Vitamin_D                  1000 non-null   float64
 6   Tryptophan                 1000 non-null   float64
 7   Omega3_Index               1000 non-null   float64
 8   Neuroinflammation_Score    1000 non-null   float64
 9   Monoamine_Oxidase_Level    1000 non-null   float64
 10  Serotonin_Level            1000 non-null   float64
 11  HPA_Axis_Dysregulation     1000 non-null   float64
 12  DepressionScore_PHQ9       1000 non-null   int64  
 13  DepressionDiagnosis_Codes  1000 non-null   int8  

In [None]:
d.head().T

Unnamed: 0,0,1,2,3,4
Age,23.0,39.0,28.0,65.0,33.0
SleepDuration,6.9,6.0,6.3,8.7,4.8
Cortisol,9.6,10.3,16.1,9.0,10.9
BDNF_Level,13.04,14.13,10.84,14.45,9.53
CRP,1.87,1.17,2.91,1.71,2.35
Vitamin_D,-0.3,19.9,30.9,29.8,22.6
Tryptophan,56.8,32.9,59.9,32.1,51.7
Omega3_Index,3.19,5.65,5.09,7.71,2.56
Neuroinflammation_Score,0.1,0.79,0.53,0.29,0.66
Monoamine_Oxidase_Level,2.73,2.8,2.19,1.64,2.49


In [None]:
# Save to Colab's temporary storage
d.to_csv('modified_Depression.csv', index=False)

In [None]:
df=pd.read_csv('modified_Depression.csv')
df.head()

Unnamed: 0,Age,SleepDuration,Cortisol,BDNF_Level,CRP,Vitamin_D,Tryptophan,Omega3_Index,Neuroinflammation_Score,Monoamine_Oxidase_Level,Serotonin_Level,HPA_Axis_Dysregulation,DepressionScore_PHQ9,DepressionDiagnosis_Codes,Genotype_5HTTLPR_Codes,Genotype_COMT_Codes,Genotype_MAOA_Codes,MTHFR_Genotype_Codes
0,23,6.9,9.6,13.04,1.87,-0.3,56.8,3.19,0.1,2.73,117.4,0.3,4,3,2,2,1,2
1,39,6.0,10.3,14.13,1.17,19.9,32.9,5.65,0.79,2.8,106.4,0.01,20,1,0,0,1,0
2,28,6.3,16.1,10.84,2.91,30.9,59.9,5.09,0.53,2.19,133.0,0.83,0,3,2,2,0,1
3,65,8.7,9.0,14.45,1.71,29.8,32.1,7.71,0.29,1.64,123.1,0.09,16,0,0,2,1,1
4,33,4.8,10.9,9.53,2.35,22.6,51.7,2.56,0.66,2.49,188.9,0.63,23,1,2,2,1,2


## Original Dataset

Using the original data for EDA and model training.

### **Random Forest, Logistic Regression and KNN**
Accuracy is as followed:

* 'Random Forest': `0.515`
* 'Logistic Regression': `0.51`
* 'KNN': `0.415`

In [None]:
import numpy as np

#Models from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#model evaluation
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.metrics import RocCurveDisplay

In [None]:
# Splitting the dataset
X=df.drop("DepressionDiagnosis_Codes",axis=1)
y=df["DepressionDiagnosis_Codes"]

np.random.seed(42)
# Splitting the data into train and test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [None]:
#Using three ML models
models={"Random Forest":RandomForestClassifier(),
        "Logistic Regression":LogisticRegression(),
        "KNN":KNeighborsClassifier()}

#function to fit and score matrix

def fit_and_score(models,X_train,X_test,y_train,y_test):
  """
    Fits and evaluates given machine learning models
    models:a dictionary of different scikit-learn models
    X_train:Trainnig data (No labels)
    X_test:Testing data (No Labels)
    y_train:trainning labels
    y_test:test labels

  """

  #setting up a randomseed(42)
  np.random.seed(42)

  #dictionary for storing models score
  model_scores={}

  #looping throigh each model
  for name,model in models.items():
    # fit the model to data
    model.fit(X_train,y_train)

    #evaluate model and append score
    model_scores[name]=model.score(X_test,y_test)

  return model_scores


In [None]:
model_scores=fit_and_score(models,X_train,X_test,y_train,y_test);
model_scores

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Random Forest': 0.515, 'Logistic Regression': 0.51, 'KNN': 0.415}

### **SMOTE**
Accuracy is `0.445`

In [None]:
!pip install imbalanced-learn



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [None]:
# Step 1: Load dataset
df = pd.read_csv("Updated_Depression_Dataset.csv")

In [None]:
df.head()

Unnamed: 0,Age,SleepDuration,Genotype_5HTTLPR,Genotype_COMT,Genotype_MAOA,Cortisol,BDNF_Level,CRP,Vitamin_D,Tryptophan,Omega3_Index,MTHFR_Genotype,Neuroinflammation_Score,Monoamine_Oxidase_Level,Serotonin_Level,HPA_Axis_Dysregulation,DepressionScore_PHQ9,DepressionDiagnosis
0,23,6.9,Short/Short,Val/Val,Low Activity,9.6,13.04,1.87,-0.3,56.8,3.19,TT,0.1,2.73,117.4,0.3,4,Persistent Depressive Disorder
1,39,6.0,Long/Long,Met/Met,Low Activity,10.3,14.13,1.17,19.9,32.9,5.65,CC,0.79,2.8,106.4,0.01,20,False
2,28,6.3,Short/Short,Val/Val,High Activity,16.1,10.84,2.91,30.9,59.9,5.09,CT,0.53,2.19,133.0,0.83,0,Persistent Depressive Disorder
3,65,8.7,Long/Long,Val/Val,Low Activity,9.0,14.45,1.71,29.8,32.1,7.71,CT,0.29,1.64,123.1,0.09,16,Atypical Depression
4,33,4.8,Short/Short,Val/Val,Low Activity,10.9,9.53,2.35,22.6,51.7,2.56,TT,0.66,2.49,188.9,0.63,23,False


In [None]:
# Step 2: Encode target label
target_col = "DepressionDiagnosis"
label_encoder = LabelEncoder()
df[target_col] = label_encoder.fit_transform(df[target_col])

In [None]:
# Step 3: Separate features and target
X = df.drop(target_col, axis=1)
y = df[target_col]

In [None]:
# Step 4: Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Step 5: Preprocessing for categorical variables
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder='passthrough')

In [None]:
# Step 6: Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Step 7: Create pipeline with SMOTE and model
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [None]:
# Step 8: Train the model
pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Step 9: Evaluate on test set
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

In [None]:
# Step 10: Predict a sample
sample = X.iloc[[0]]
predicted_class = pipeline.predict(sample)
predicted_label = label_encoder.inverse_transform(predicted_class)

In [None]:
accuracy

0.445

In [None]:
report

'                                precision    recall  f1-score   support\n\n           Atypical Depression       0.00      0.00      0.00        19\n                         False       0.50      0.88      0.64       100\n     Major Depressive Disorder       0.00      0.00      0.00        21\nPersistent Depressive Disorder       0.00      0.00      0.00        23\n          Psychotic Depression       0.12      0.05      0.07        19\n   Seasonal Affective Disorder       0.00      0.00      0.00        18\n\n                      accuracy                           0.45       200\n                     macro avg       0.10      0.16      0.12       200\n                  weighted avg       0.26      0.45      0.32       200\n'

In [None]:
predicted_label[0]

'Persistent Depressive Disorder'

### **XGBoost**

Accuracy : `0.370`

In [None]:
!pip install imbalanced-learn xgboost



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, accuracy_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [None]:
# Load your data
df = pd.read_csv("Updated_Depression_Dataset.csv")

In [None]:
# Separate features and target
X = df.drop("DepressionDiagnosis", axis=1)
y = df["DepressionDiagnosis"]

In [None]:
# Encode target
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [None]:
# Preprocessing
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

In [None]:
# Full pipeline with feature selection, SMOTE, and XGBoost
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=10)),
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42))
])

In [None]:
# Hyperparameter space
param_dist = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__subsample': [0.8, 1],
    'classifier__colsample_bytree': [0.8, 1]
}

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Randomized Search
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)


In [None]:
# Train
search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.



In [None]:
# Predict
y_pred = search.best_estimator_.predict(X_test)

In [None]:
# Evaluate
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

In [None]:
print(f"Best Accuracy: {accuracy:.3f}")
print(report)

Best Accuracy: 0.370
                                precision    recall  f1-score   support

           Atypical Depression       0.18      0.11      0.13        19
                         False       0.48      0.67      0.56       100
     Major Depressive Disorder       0.09      0.05      0.06        21
Persistent Depressive Disorder       0.10      0.04      0.06        23
          Psychotic Depression       0.06      0.05      0.06        19
   Seasonal Affective Disorder       0.15      0.11      0.13        18

                      accuracy                           0.37       200
                     macro avg       0.18      0.17      0.17       200
                  weighted avg       0.30      0.37      0.32       200



## Balanced/Modified Dataset

**Updated/Modified/Balanced Dataset**: https://drive.google.com/file/d/1a1a8DytnbJx7SVONqIeHrfuO9cxmkTih/view?usp=sharing


Manually updating data to improve our model results by balancing each class equally i.e each class has nearly same totally records

### **XGBoost**

Accuracy is `65`

In [None]:
df=pd.read_csv('/content/Balanced_Depression_Dataset.csv')
df.head()

Unnamed: 0,Age,SleepDuration,Cortisol,BDNF_Level,CRP,Vitamin_D,Tryptophan,Omega3_Index,Neuroinflammation_Score,Monoamine_Oxidase_Level,Serotonin_Level,HPA_Axis_Dysregulation,DepressionScore_PHQ9,Genotype_5HTTLPR,Genotype_COMT,Genotype_MAOA,MTHFR_Genotype,DepressionDiagnosis
0,62,4.4,14.2,6.88,3.51,15.5,46.8,9.15,0.19,2.97,156.3,0.3,26,Long/Long,Val/Met,High Activity,CC,False
1,24,9.0,12.1,13.77,2.12,27.1,31.2,2.4,0.4,2.83,113.8,0.06,2,Short/Long,Val/Val,High Activity,CC,Seasonal Affective Disorder
2,30,5.4,9.9,21.59,-0.64,24.9,42.1,8.9,0.76,1.48,176.8,0.47,6,Short/Long,Val/Val,High Activity,CC,Persistent Depressive Disorder
3,43,5.8,17.5,6.3,5.51,30.3,54.1,8.88,0.59,1.96,172.9,0.27,14,Short/Short,Val/Met,Low Activity,TT,Atypical Depression
4,46,7.1,23.8,10.08,4.33,21.2,41.7,7.68,0.61,3.23,155.0,0.17,3,Long/Long,Val/Met,High Activity,CC,False


In [None]:
df["DepressionDiagnosis"].value_counts()

Unnamed: 0_level_0,count
DepressionDiagnosis,Unnamed: 1_level_1
False,150
Seasonal Affective Disorder,150
Persistent Depressive Disorder,150
Atypical Depression,150
Psychotic Depression,150
Major Depressive Disorder,150


In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score


In [None]:
# Separate features and target
X = df.drop("DepressionDiagnosis", axis=1)
y = df["DepressionDiagnosis"]

In [None]:
# Encode the target
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:

# Identify categorical and numeric features
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()


In [None]:
# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

In [None]:
# Define XGBoost model with reasonable parameters
xgb_model = XGBClassifier(
    n_estimators=50,
    max_depth=4,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)

In [None]:

# Create the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_model)
])

In [None]:
# 5-Fold Stratified Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X, y_encoded, cv=cv, scoring='accuracy')


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Train/test split to evaluate on one holdout set
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)


Parameters: { "use_label_encoder" } are not used.



In [None]:

# Output results
print(f"Cross-validation accuracies: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.3f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Cross-validation accuracies: [0.65       0.61111111 0.61666667 0.66111111 0.6       ]
Mean CV Accuracy: 0.628
Test Accuracy: 0.650

Classification Report:
                                precision    recall  f1-score   support

           Atypical Depression       0.68      0.77      0.72        30
                         False       0.19      0.13      0.16        30
     Major Depressive Disorder       0.65      0.73      0.69        30
Persistent Depressive Disorder       0.67      0.80      0.73        30
          Psychotic Depression       0.93      0.83      0.88        30
   Seasonal Affective Disorder       0.68      0.63      0.66        30

                      accuracy                           0.65       180
                     macro avg       0.63      0.65      0.64       180
                  weighted avg       0.63      0.65      0.64       180



### **Random Forest, Logistic Regression and KNN**

Accuracy is as follows:

* 'Random Forest': `0.71`
* 'Logistic Regression': `0.24`
* 'KNN': `0.37`

In [None]:
import pandas as pd
d=pd.read_csv('/content/Balanced_Depression_Dataset.csv')
d.head().T

Unnamed: 0,0,1,2,3,4
Age,62,24,30,43,46
SleepDuration,4.4,9.0,5.4,5.8,7.1
Cortisol,14.2,12.1,9.9,17.5,23.8
BDNF_Level,6.88,13.77,21.59,6.3,10.08
CRP,3.51,2.12,-0.64,5.51,4.33
Vitamin_D,15.5,27.1,24.9,30.3,21.2
Tryptophan,46.8,31.2,42.1,54.1,41.7
Omega3_Index,9.15,2.4,8.9,8.88,7.68
Neuroinflammation_Score,0.19,0.4,0.76,0.59,0.61
Monoamine_Oxidase_Level,2.97,2.83,1.48,1.96,3.23


In [None]:
#information about dataset
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      900 non-null    int64  
 1   SleepDuration            900 non-null    float64
 2   Cortisol                 900 non-null    float64
 3   BDNF_Level               900 non-null    float64
 4   CRP                      900 non-null    float64
 5   Vitamin_D                900 non-null    float64
 6   Tryptophan               900 non-null    float64
 7   Omega3_Index             900 non-null    float64
 8   Neuroinflammation_Score  900 non-null    float64
 9   Monoamine_Oxidase_Level  900 non-null    float64
 10  Serotonin_Level          900 non-null    float64
 11  HPA_Axis_Dysregulation   900 non-null    float64
 12  DepressionScore_PHQ9     900 non-null    int64  
 13  Genotype_5HTTLPR         900 non-null    object 
 14  Genotype_COMT            9

In [None]:
#checking datatype of column
d['Genotype_MAOA'].dtype

dtype('O')

In [None]:
pd.api.types.is_object_dtype(d["Genotype_MAOA"])

True

In [None]:
#Finding the columns which contains string in our dataset
for label,content in d.items():
  if pd.api.types.is_object_dtype(content):
    print(label)

Genotype_5HTTLPR
Genotype_COMT
Genotype_MAOA
MTHFR_Genotype
DepressionDiagnosis


In [None]:
# converting strings to category format
for label,content in d.items():
  if pd.api.types.is_string_dtype(content) or pd.api.types.is_object_dtype(content):
    d[label]=content.astype("category").cat.as_ordered()

In [None]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   Age                      900 non-null    int64   
 1   SleepDuration            900 non-null    float64 
 2   Cortisol                 900 non-null    float64 
 3   BDNF_Level               900 non-null    float64 
 4   CRP                      900 non-null    float64 
 5   Vitamin_D                900 non-null    float64 
 6   Tryptophan               900 non-null    float64 
 7   Omega3_Index             900 non-null    float64 
 8   Neuroinflammation_Score  900 non-null    float64 
 9   Monoamine_Oxidase_Level  900 non-null    float64 
 10  Serotonin_Level          900 non-null    float64 
 11  HPA_Axis_Dysregulation   900 non-null    float64 
 12  DepressionScore_PHQ9     900 non-null    int64   
 13  Genotype_5HTTLPR         900 non-null    category
 14  Genotype_C

In [None]:
#creating new column with numerical codes for all string columns

# Making New Colums
d['DepressionDiagnosis_Codes']=d['DepressionDiagnosis'].cat.codes
d['Genotype_5HTTLPR_Codes']=d['Genotype_5HTTLPR'].cat.codes
d['Genotype_COMT_Codes']=d['Genotype_COMT'].cat.codes
d['Genotype_MAOA_Codes']=d['Genotype_MAOA'].cat.codes
d['MTHFR_Genotype_Codes']=d['MTHFR_Genotype'].cat.codes

# Droping the original string colum
d=d.drop('DepressionDiagnosis',axis=1)
d=d.drop('Genotype_5HTTLPR',axis=1)
d=d.drop('Genotype_COMT',axis=1)
d=d.drop('Genotype_MAOA',axis=1)
d=d.drop('MTHFR_Genotype',axis=1)

In [None]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        900 non-null    int64  
 1   SleepDuration              900 non-null    float64
 2   Cortisol                   900 non-null    float64
 3   BDNF_Level                 900 non-null    float64
 4   CRP                        900 non-null    float64
 5   Vitamin_D                  900 non-null    float64
 6   Tryptophan                 900 non-null    float64
 7   Omega3_Index               900 non-null    float64
 8   Neuroinflammation_Score    900 non-null    float64
 9   Monoamine_Oxidase_Level    900 non-null    float64
 10  Serotonin_Level            900 non-null    float64
 11  HPA_Axis_Dysregulation     900 non-null    float64
 12  DepressionScore_PHQ9       900 non-null    int64  
 13  DepressionDiagnosis_Codes  900 non-null    int8   

In [None]:
d.head().T

Unnamed: 0,0,1,2,3,4
Age,62.0,24.0,30.0,43.0,46.0
SleepDuration,4.4,9.0,5.4,5.8,7.1
Cortisol,14.2,12.1,9.9,17.5,23.8
BDNF_Level,6.88,13.77,21.59,6.3,10.08
CRP,3.51,2.12,-0.64,5.51,4.33
Vitamin_D,15.5,27.1,24.9,30.3,21.2
Tryptophan,46.8,31.2,42.1,54.1,41.7
Omega3_Index,9.15,2.4,8.9,8.88,7.68
Neuroinflammation_Score,0.19,0.4,0.76,0.59,0.61
Monoamine_Oxidase_Level,2.97,2.83,1.48,1.96,3.23


In [None]:
# Save to Colab's temporary storage
d.to_csv('modified_Depression.csv', index=False)

In [None]:
df=pd.read_csv('modified_Depression.csv')
df.head()

Unnamed: 0,Age,SleepDuration,Cortisol,BDNF_Level,CRP,Vitamin_D,Tryptophan,Omega3_Index,Neuroinflammation_Score,Monoamine_Oxidase_Level,Serotonin_Level,HPA_Axis_Dysregulation,DepressionScore_PHQ9,DepressionDiagnosis_Codes,Genotype_5HTTLPR_Codes,Genotype_COMT_Codes,Genotype_MAOA_Codes,MTHFR_Genotype_Codes
0,62,4.4,14.2,6.88,3.51,15.5,46.8,9.15,0.19,2.97,156.3,0.3,26,1,0,1,0,0
1,24,9.0,12.1,13.77,2.12,27.1,31.2,2.4,0.4,2.83,113.8,0.06,2,5,1,2,0,0
2,30,5.4,9.9,21.59,-0.64,24.9,42.1,8.9,0.76,1.48,176.8,0.47,6,3,1,2,0,0
3,43,5.8,17.5,6.3,5.51,30.3,54.1,8.88,0.59,1.96,172.9,0.27,14,0,2,1,1,2
4,46,7.1,23.8,10.08,4.33,21.2,41.7,7.68,0.61,3.23,155.0,0.17,3,1,0,1,0,0


In [None]:
# Splitting the dataset
X=df.drop("DepressionDiagnosis_Codes",axis=1)
y=df["DepressionDiagnosis_Codes"]

np.random.seed(42)
# Splitting the data into train and test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [None]:
#Using three ML models
models={"Random Forest":RandomForestClassifier(),
        "Logistic Regression":LogisticRegression(),
        "KNN":KNeighborsClassifier()}

#function to fit and score matrix

def fit_and_score(models,X_train,X_test,y_train,y_test):
  """
    Fits and evaluates given machine learning models
    models:a dictionary of different scikit-learn models
    X_train:Trainnig data (No labels)
    X_test:Testing data (No Labels)
    y_train:trainning labels
    y_test:test labels

  """

  #setting up a randomseed(42)
  np.random.seed(42)

  #dictionary for storing models score
  model_scores={}

  #looping throigh each model
  for name,model in models.items():
    # fit the model to data
    model.fit(X_train,y_train)

    #evaluate model and append score
    model_scores[name]=model.score(X_test,y_test)

  return model_scores


In [None]:
model_scores=fit_and_score(models,X_train,X_test,y_train,y_test);
model_scores

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Random Forest': 0.7111111111111111,
 'Logistic Regression': 0.24444444444444444,
 'KNN': 0.37222222222222223}

### **Full pipeline with feature selection, SMOTE, and XGBoost**

Accuracy: `0.739`

In [None]:
# Load your data
df = pd.read_csv("/content/Balanced_Depression_Dataset.csv")

In [None]:
# Separate features and target
X = df.drop("DepressionDiagnosis", axis=1)
y = df["DepressionDiagnosis"]

In [None]:
# Encode target
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [None]:
# Preprocessing
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

In [None]:
# Full pipeline with feature selection, SMOTE, and XGBoost
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=10)),
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42))
])

In [None]:
# Hyperparameter space
param_dist = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__subsample': [0.8, 1],
    'classifier__colsample_bytree': [0.8, 1]
}

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Randomized Search
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)


In [None]:
# Train
search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.



In [None]:
# Predict
y_pred = search.best_estimator_.predict(X_test)

In [None]:
# Evaluate
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

In [None]:
print(f"Best Accuracy: {accuracy:.3f}")
print(report)

Best Accuracy: 0.739
                                precision    recall  f1-score   support

           Atypical Depression       0.75      0.90      0.82        30
                         False       0.47      0.30      0.37        30
     Major Depressive Disorder       0.65      0.80      0.72        30
Persistent Depressive Disorder       0.73      0.80      0.76        30
          Psychotic Depression       0.96      0.87      0.91        30
   Seasonal Affective Disorder       0.82      0.77      0.79        30

                      accuracy                           0.74       180
                     macro avg       0.73      0.74      0.73       180
                  weighted avg       0.73      0.74      0.73       180



### **RandomForest + SMOTE** (Highest Accuracy)

Accuracy : `0.75`

In [None]:
# Step 1: Load dataset
df = pd.read_csv("/content/Balanced_Depression_Dataset.csv")

In [None]:
# Step 2: Encode target label
target_col = "DepressionDiagnosis"
label_encoder = LabelEncoder()
df[target_col] = label_encoder.fit_transform(df[target_col])

In [None]:
# Step 3: Separate features and target
X = df.drop(target_col, axis=1)
y = df[target_col]

In [None]:
# Step 4: Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Step 5: Preprocessing for categorical variables
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder='passthrough')

In [None]:
# Step 6: Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Step 7: Create pipeline with SMOTE and model
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [None]:
# Step 8: Train the model
pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Step 9: Evaluate
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Accuracy: 0.7555555555555555
                                precision    recall  f1-score   support

           Atypical Depression       0.76      0.87      0.81        30
                         False       0.46      0.43      0.45        30
     Major Depressive Disorder       0.74      0.83      0.78        30
Persistent Depressive Disorder       0.83      0.80      0.81        30
          Psychotic Depression       0.90      0.87      0.88        30
   Seasonal Affective Disorder       0.85      0.73      0.79        30

                      accuracy                           0.76       180
                     macro avg       0.76      0.76      0.75       180
                  weighted avg       0.76      0.76      0.75       180



## **Prediction**

In [None]:
## Step 10: Prediction on user given data

def predict_depression():
    """These function predicts depression from values such as
    Age: Age of a person (int)
    SleepDuration: Duration of sleep in hours (float)
    Genotype_5HTTLPR: Genotype of a person ('LL', 'LS', 'SS')
    Genotype_COMT: Genotype of a person ('Val/Val', 'Met/Met', 'Val/Met')
    Genotype_MAOA: Genotype of a person ('Low', 'High')
    Cortisol: Cortisol level of a person (float)
    BDNF_Level: BDNF level of a person (float)
    CRP: CRP level of a person (float)
    Vitamin_D: Vitamin D level of a person (float)
    Tryptophan: Tryptophan level of a person (float)
    Omega3_Index: Omega-3 index of a person (float)
    MTHFR_Genotype: Genotype of a person ('CC', 'CT', 'TT')
    Neuroinflammation_Score: Neuroinflammation score of a person (float)
    Monoamine_Oxidase_Level: Monoamine Oxidase level of a person (float)
    Serotonin_Level: Serotonin level of a person (float)
    HPA_Axis_Dysregulation: HPA Axis Dysregulation of a person (float)
    DepressionScore_PHQ9: Depression Score (PHQ-9) of a person (int)
    """

    input_data = {
        "Age": int(input("Enter  Age: (int) ")),
        "SleepDuration": float(input("Enter sleep duration: (float)")),
        "Genotype_5HTTLPR": input("Enter Genotype_5HTTLPR ('LL', 'LS', 'SS'): "),
        "Genotype_COMT": input("Enter Genotype_COMT ('Val/Val', 'Met/Met', 'Val/Met'): "),
        "Genotype_MAOA": input("Enter Genotype_MAOA ('Low', 'High'): "),
        "Cortisol": float(input("Enter Cortisol level: (float) ")),
        "BDNF_Level": float(input("Enter BDNF_Level: (float) ")),
        "CRP": float(input("Enter CRP: (float) ")),
        "Vitamin_D": float(input("Enter Vitamin_D level: (float)")),
        "Tryptophan": float(input("Enter Tryptophan: (float)")),
        "Omega3_Index": float(input("Enter Omega3_Index: (float)")),
        "MTHFR_Genotype": input("Enter MTHFR_Genotype ('CC', 'CT', 'TT'): "),
        "Neuroinflammation_Score": float(input("Enter Neuroinflammation_Score: (float)")),
        "Monoamine_Oxidase_Level": float(input("Enter Monoamine_Oxidase_Level: (float)")),
        "Serotonin_Level": float(input("Enter Serotonin_Level: (float)")),
        "HPA_Axis_Dysregulation": float(input("Enter HPA_Axis_Dysregulation: (float)")),
        "DepressionScore_PHQ9": int(input("Enter DepressionScore_PHQ9: (int)"))
    }

    # Convert to DataFrame
    user_df = pd.DataFrame([input_data])

    # Predict using the pipeline
    prediction = pipeline.predict(user_df)
    predicted_label = label_encoder.inverse_transform(prediction)

    print("\nPredicted Depression Diagnosis:", predicted_label[0])

In [None]:
predict_depression()

Enter  Age: 20
Enter sleep duration: 6.5
Enter Genotype_5HTTLPR (e.g., 'LL', 'LS', 'SS'): LS
Enter Genotype_COMT (e.g., 'Val/Val', 'Met/Met', 'Val/Met'): Met/Met
Enter Genotype_MAOA (e.g., 'Low', 'High'): Low
Enter Cortisol level: 17.5
Enter BDNF_Level: 6.88
Enter CRP: 3.51
Enter Vitamin_D level: 24.9
Enter Tryptophan: 46.8
Enter Omega3_Index: 9.78
Enter MTHFR_Genotype (e.g., 'CC', 'CT', 'TT'): TT
Enter Neuroinflammation_Score: 0.84
Enter Monoamine_Oxidase_Level: 1.78
Enter Serotonin_Level: 172.9
Enter HPA_Axis_Dysregulation: 0.21
Enter DepressionScore_PHQ9: 14

Predicted Depression Diagnosis: False


In [None]:
predict_depression()

Enter  Age: (int) 45
Enter sleep duration: (float)3.3
Enter Genotype_5HTTLPR ('LL', 'LS', 'SS'): LL
Enter Genotype_COMT ('Val/Val', 'Met/Met', 'Val/Met'): Val/Val
Enter Genotype_MAOA ('Low', 'High'): High
Enter Cortisol level: (float) 3.5
Enter BDNF_Level: (float) 7.5
Enter CRP: (float) 2.3
Enter Vitamin_D level: (float)12.2
Enter Tryptophan: (float)6.5
Enter Omega3_Index: (float)2.0
Enter MTHFR_Genotype ('CC', 'CT', 'TT'): CC
Enter Neuroinflammation_Score: (float)5.6
Enter Monoamine_Oxidase_Level: (float)1.2
Enter Serotonin_Level: (float)7.8
Enter HPA_Axis_Dysregulation: (float)0.1
Enter DepressionScore_PHQ9: (int)2

Predicted Depression Diagnosis: Seasonal Affective Disorder
