In [1]:
# Install the third-party packages into the environment of the running kernel.
# NOTE(review): no versions are pinned, so a later re-run may resolve different
# releases — consider pinning (e.g. pkg==x.y.z) for reproducibility.
%pip install pandas numpy scikit-learn tensorflow tqdm

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# Models

The models currently implemented are:

* Supervised
    1. Logistic Regression Model
    2. Random Forest Classifier
    3. Support Vector Machine
    4. Deep Neural Network
    5. (New) Bayesian Network
    6. (New) Gradient Boosting

* Unsupervised
    1. (New) K-Means Clustering
    2. (New) Kernel Density Estimation

In [2]:
import importlib
import pandas as pd

# Column-name suffixes of the CESD-10 scale items. Each wave stores these
# columns under its own "w<N>" prefix (for example "w1_a_emobth").
cesd_col_names = [
    "_a_emobth",
    "_a_emomnd",
    "_a_emodep",
    "_a_emoeff",
    "_a_emohope",
    "_a_emofear",
    "_a_emoslp",
    "_a_emohap",
    "_a_emolone",
    "_a_emogo",
]

# One fully prefixed list of CESD-10 column names per wave (waves 1 to 5).
(
    cesd_col_names_w1,
    cesd_col_names_w2,
    cesd_col_names_w3,
    cesd_col_names_w4,
    cesd_col_names_w5,
) = (
    [f"w{wave}{suffix}" for suffix in cesd_col_names]
    for wave in range(1, 6)
)

### Prepping dataset

In [3]:
def load_wave(wave: int) -> pd.DataFrame:
    """Load one labelled wave and remove the CESD-10 item and score columns.

    Reads ``CSV/wave<wave>_select_labelled.csv``, drops that wave's ten
    CESD-10 item columns (``w<wave>`` + every suffix in ``cesd_col_names``)
    and the ``score`` column, then prints the number of non-null ``pid``
    values as a quick row-count check.

    NOTE(review): the dropped columns are presumably the inputs the
    ``depressed`` label was derived from — confirm against the labelling step.

    Args:
        wave: Wave number used in both the file name and the column prefix.

    Returns:
        The wave's DataFrame without the CESD-10 item and ``score`` columns.

    Raises:
        FileNotFoundError: If the wave's CSV file does not exist.
        KeyError: If any of the columns to drop is missing from the file.
    """
    wave_df = pd.read_csv(f"CSV/wave{wave}_select_labelled.csv")
    cesd_cols = [f"w{wave}{suffix}" for suffix in cesd_col_names]
    wave_df = wave_df.drop(columns=cesd_cols + ['score'])
    print(wave_df['pid'].count())
    return wave_df


# Load waves 1-5 in order; each call prints that wave's pid count.
wave_frames = [load_wave(wave) for wave in range(1, 6)]

# Keep the per-wave names bound in case other cells refer to them.
df1, df2, df3, df4, df5 = wave_frames

# Stack all waves row-wise with a fresh 0..n-1 index.
combined_df = pd.concat(wave_frames, axis=0, ignore_index=True)
print(combined_df['pid'].count())
# NOTE(review): the "- 1" presumably excludes one non-feature column from the
# reported count — confirm which one is meant ('pid' or 'depressed').
print(combined_df.columns, f"\n{len(combined_df.columns) - 1} columns")

# print(combined_df.describe())

14156
14589
14746
19294
18779
81564
Index(['pid', 'age', 'gender', 'race', 'marital_status', 'born_province',
       'employed', 'employed_take_home', 'employed_weekly_hours',
       'self_employed', 'self_employed_take_home',
       'self_employed_weekly_hours', 'casual_work', 'casual_weekly_hours',
       'highest_grade_school', 'tertiary_education', 'currently_enrolled',
       'fever', 'persistent_cough', 'cough_with_blood', 'chest_pain',
       'body_ache', 'headache', 'back_ache', 'joint_pain_arthritis',
       'diarrhoea', 'painful_urination', 'swelling_ankles',
       'severe_weight_loss', 'time_since_prev_consulation',
       'exercise_frequency', 'smokes_cigarettes', 'height_measurement',
       'weight_measurement', 'waist_measurement', 'depressed'],
      dtype='object') 
36 columns


Precision = TP / (TP + FP) (How often are positive predictions correct?)
Recall = TP / (TP + FN) (Can an ML model find all instances of the positive class?)

### Logistic Regression

In [4]:
import logisticRegression

# Build the logistic-regression model from the combined waves and run it.
# NOTE(review): combined_df still contains the 'depressed' target column (and
# 'pid'); presumably the model class separates features from the label
# itself — confirm in logisticRegression.py.
depressed_labels = combined_df['depressed']
LR = logisticRegression.LogisticRegressionModel(combined_df, depressed_labels)

LR.run()

Validation Accuracy: 0.74
Validation Confusion Matrix:
[[12099    40]
 [ 4139    35]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.85     12139
           1       0.47      0.01      0.02      4174

    accuracy                           0.74     16313
   macro avg       0.61      0.50      0.43     16313
weighted avg       0.67      0.74      0.64     16313

Test Accuracy: 0.75
Test Confusion Matrix:
[[12210    26]
 [ 4023    54]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86     12236
           1       0.68      0.01      0.03      4077

    accuracy                           0.75     16313
   macro avg       0.71      0.51      0.44     16313
weighted avg       0.73      0.75      0.65     16313

Best Parameters: {'logisticregression__C': 1, 'logisticregression__penalty': 'l1'}
Best Score: 0.7479872468708126


### Deep Neural Network

In [5]:
import neuralNetwork

# Build the deep-neural-network model from the combined waves and run it.
# NOTE(review): combined_df still contains the 'depressed' target column (and
# 'pid'); presumably the model class separates features from the label
# itself — confirm in neuralNetwork.py.
depressed_labels = combined_df['depressed']
NN = neuralNetwork.DeepNeuralNetworkModel(combined_df, depressed_labels)

NN.run()

2024-10-04 06:40:13.587110: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-04 06:40:13.626423: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-04 06:40:13.627813: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-04 06:40:22.949706: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-04 06:40:22.950392: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are 

Validation Accuracy: 0.74
Validation Confusion Matrix:
[[11621   518]
 [ 3793   381]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.96      0.84     12139
           1       0.42      0.09      0.15      4174

    accuracy                           0.74     16313
   macro avg       0.59      0.52      0.50     16313
weighted avg       0.67      0.74      0.67     16313

Test Accuracy: 0.74
Test Confusion Matrix:
[[11744   492]
 [ 3694   383]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.96      0.85     12236
           1       0.44      0.09      0.15      4077

    accuracy                           0.74     16313
   macro avg       0.60      0.53      0.50     16313
weighted avg       0.68      0.74      0.68     16313



### Random Forest

In [6]:
import randomForest
# Reload the module so edits made to randomForest.py since it was first
# imported in this kernel take effect without a restart.
importlib.reload(randomForest)

# Build the random-forest model from the combined waves and run it.
# NOTE(review): combined_df still contains the 'depressed' target column (and
# 'pid'); presumably the model class separates features from the label
# itself — confirm in randomForest.py.
depressed_labels = combined_df['depressed']
RF = randomForest.RandomForestModel(combined_df, depressed_labels)

RF.run()

Validation Accuracy: 0.74
Validation Confusion Matrix:
[[11959   180]
 [ 3992   182]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.99      0.85     12139
           1       0.50      0.04      0.08      4174

    accuracy                           0.74     16313
   macro avg       0.63      0.51      0.47     16313
weighted avg       0.69      0.74      0.65     16313

Test Accuracy: 0.75
Test Confusion Matrix:
[[12072   164]
 [ 3889   188]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.99      0.86     12236
           1       0.53      0.05      0.08      4077

    accuracy                           0.75     16313
   macro avg       0.65      0.52      0.47     16313
weighted avg       0.70      0.75      0.66     16313



Testing: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}: 100%|██████████| 108/108 [1:14:20<00:00, 41.30s/it]

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best Score: 0.7498262981063468





### Support Vector Machine

In [7]:
import supportVectorMachine

# Build the support-vector-machine model from the combined waves and run it.
# NOTE(review): combined_df still contains the 'depressed' target column (and
# 'pid'); presumably the model class separates features from the label
# itself — confirm in supportVectorMachine.py.
depressed_labels = combined_df['depressed']
SVM = supportVectorMachine.SVMModel(combined_df, depressed_labels)

SVM.run()

Validation Accuracy: 0.75
Validation Confusion Matrix:
[[12102    37]
 [ 4107    67]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.85     12139
           1       0.64      0.02      0.03      4174

    accuracy                           0.75     16313
   macro avg       0.70      0.51      0.44     16313
weighted avg       0.72      0.75      0.64     16313

Test Accuracy: 0.75
Test Confusion Matrix:
[[12186    50]
 [ 4027    50]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86     12236
           1       0.50      0.01      0.02      4077

    accuracy                           0.75     16313
   macro avg       0.63      0.50      0.44     16313
weighted avg       0.69      0.75      0.65     16313



Testing: {'C': 0.1, 'degree': 2, 'gamma': 0.1, 'kernel': 'linear'}:   6%|▌         | 8/144 [1:48:18<28:34:07, 756.23s/it]   