In [1]:
# Install the notebook's third-party dependencies via the %pip line magic.
# NOTE(review): no versions are pinned, so a fresh install may pull releases that
# differ from the ones that produced the outputs recorded below — consider pinning.
%pip install pandas numpy scikit-learn tensorflow tqdm imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Selects the class-balancing strategy applied in the "Additional Preprocessing" cell:
#   0 -> SMOTE oversampling
#   1 -> random oversampling of the minority class (sampling with replacement)
#   any other value (e.g. 2) -> no resampling; combined_df is used as loaded
version = 2

# Models

The models currently implemented are:

* Supervised
    1. Logistic Regression Model
    2. Random Forest Classifier
    3. Support Vector Machine
    4. Deep Neural Network
    5. (New) Bayesian Network
    6. (New) Gradient Boosting

* Unsupervised
    1. (New) K-Means Clustering
    2. (New) Kernel Density Estimation

In [3]:
import importlib
import pandas as pd

# Column-name suffixes of the ten CESD-10 scale items. Each wave's CSV stores them
# with a wave prefix ("w1" .. "w5") in front of the suffix.
cesd_col_names = [
    "_a_emobth",
    "_a_emomnd",
    "_a_emodep",
    "_a_emoeff",
    "_a_emohope",
    "_a_emofear",
    "_a_emoslp",
    "_a_emohap",
    "_a_emolone",
    "_a_emogo",
]

# Build the prefixed column lists for waves 1-5 in one pass; the generator yields
# exactly five lists, which are unpacked into the per-wave names used later on.
(
    cesd_col_names_w1,
    cesd_col_names_w2,
    cesd_col_names_w3,
    cesd_col_names_w4,
    cesd_col_names_w5,
) = (
    [f"w{wave}{suffix}" for suffix in cesd_col_names]
    for wave in range(1, 6)
)

### Prepping dataset

In [4]:
# Wave numbers of the labelled survey extracts; one CSV file per wave.
WAVES = range(1, 6)


def load_wave(wave):
    """Load one labelled wave and remove its CESD-10 item columns and 'score'.

    Parameters
    ----------
    wave : int
        Wave number. Selects the file "CSV/wave{wave}_select_labelled.csv" and the
        CESD-10 columns carrying the "w{wave}" prefix.

    Returns
    -------
    pandas.DataFrame
        The wave's rows without the ten CESD-10 item columns and without 'score'.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from the file (pandas' default
        behaviour for ``DataFrame.drop``).
    """
    # NOTE(review): presumably these columns are dropped because the 'depressed'
    # label is derived from them — confirm against the labelling step.
    wave_cesd_cols = [f"w{wave}{suffix}" for suffix in cesd_col_names]
    wave_df = pd.read_csv(f"CSV/wave{wave}_select_labelled.csv")
    return wave_df.drop(columns=wave_cesd_cols + ["score"])


# Load every wave, reporting the number of non-null 'pid' values per wave as we go.
wave_frames = []
for wave in WAVES:
    wave_df = load_wave(wave)
    print(wave_df['pid'].count())
    wave_frames.append(wave_df)

# Keep the per-wave names available for any cell that still refers to them.
df1, df2, df3, df4, df5 = wave_frames

# Stack the waves row-wise into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat(wave_frames, axis=0, ignore_index=True)
print(combined_df['pid'].count())
# The "- 1" excludes one column from the reported count.
print(combined_df.columns, f"\n{len(combined_df.columns) - 1} columns")

# print(combined_df.describe())

14156
14589
14746
19294
18779
81564
Index(['pid', 'age', 'gender', 'race', 'marital_status', 'born_province',
       'employed', 'employed_take_home', 'employed_weekly_hours',
       'self_employed', 'self_employed_take_home',
       'self_employed_weekly_hours', 'casual_work', 'casual_weekly_hours',
       'highest_grade_school', 'tertiary_education', 'currently_enrolled',
       'fever', 'persistent_cough', 'cough_with_blood', 'chest_pain',
       'body_ache', 'headache', 'back_ache', 'joint_pain_arthritis',
       'diarrhoea', 'painful_urination', 'swelling_ankles',
       'severe_weight_loss', 'time_since_prev_consulation', 'had_tubercolosis',
       'had_high_blood_pressure', 'had_diabetes_or_high_blood_sugar',
       'had_stroke', 'had_asthma', 'had_heart_problems', 'had_cancer',
       'exercise_frequency', 'smokes_cigarettes', 'height_measurement',
       'weight_measurement', 'waist_measurement', 'depressed'],
      dtype='object') 
42 columns


### Additional Preprocessing

In [5]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import plotter

# Seed shared by every resampling step so the balanced dataset (and therefore every
# metric computed from it) is reproducible under Restart Kernel -> Run All.
RESAMPLING_SEED = 42

# `version` picks the balancing strategy: 0 -> SMOTE, 1 -> random oversampling of the
# minority class, anything else -> combined_df is left unchanged.
# NOTE(review): balancing is applied to the whole dataset. If the model classes split
# train/validation/test internally, duplicated or synthetic rows can end up on both
# sides of the split and inflate the scores — consider resampling the training split only.
if version == 0:
    # Split data into features (X) and target (y).
    # NOTE(review): 'pid' is still part of X here, so SMOTE interpolates it as well.
    X = combined_df.drop('depressed', axis=1)
    y = combined_df['depressed']

    # Apply SMOTE for oversampling or RandomUnderSampler for undersampling
    smote = SMOTE(random_state=RESAMPLING_SEED)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Or for undersampling
    # undersample = RandomUnderSampler(random_state=RESAMPLING_SEED)
    # X_resampled, y_resampled = undersample.fit_resample(X, y)

    # Combine back into a balanced dataframe
    df_balanced = pd.concat([X_resampled, y_resampled], axis=1)
    combined_df = df_balanced

    print(combined_df['pid'].count())

    plotter.plot_bar(combined_df['depressed'], "Distribution of depression count after sampling")

elif version == 1:
    # Separate the two classes; the code treats 'depressed' == 0 as the majority.
    majority_class = combined_df[combined_df['depressed'] == 0]
    minority_class = combined_df[combined_df['depressed'] == 1]

    # Oversample the minority class (with replacement) up to the majority class size.
    oversampled_minority = minority_class.sample(
        len(majority_class), replace=True, random_state=RESAMPLING_SEED
    )

    # Concatenate the majority class with the oversampled minority class
    df_balanced = pd.concat([majority_class, oversampled_minority])

    # Shuffle the rows and rebuild a clean 0..n-1 index.
    df_balanced = df_balanced.sample(frac=1, random_state=RESAMPLING_SEED).reset_index(drop=True)

    combined_df = df_balanced

    plotter.plot_bar(combined_df['depressed'], "Distribution of depression count after sampling")

Precision = TP / (TP + FP) — how often are positive predictions correct?

Recall = TP / (TP + FN) — can the model find all instances of the positive class?

### Logistic Regression

In [6]:
# Logistic-regression baseline, implemented in the project-local logisticRegression module.
import logisticRegression

# The model is given the full combined frame plus the 'depressed' column as the target.
# NOTE(review): combined_df still contains 'depressed' and 'pid' at this point;
# presumably the model class removes them from the features — confirm in the module.
LR = logisticRegression.LogisticRegressionModel(combined_df, combined_df['depressed'])

# Run the model; the solver log recorded below is emitted by this call.
LR.run()

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           42     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  1.49439D+01


 This problem is unconstrained.



At iterate   50    f=  5.59693D-01    |proj g|=  8.58593D-02

At iterate  100    f=  5.56179D-01    |proj g|=  1.83556D-01

At iterate  150    f=  5.54530D-01    |proj g|=  7.29850D-02

At iterate  200    f=  5.53570D-01    |proj g|=  1.31997D-02

At iterate  250    f=  5.53012D-01    |proj g|=  7.66746D-02

At iterate  300    f=  5.52855D-01    |proj g|=  1.73373D-02

At iterate  350    f=  5.52764D-01    |proj g|=  4.64158D-03

At iterate  400    f=  5.52722D-01    |proj g|=  7.64689D-03

At iterate  450    f=  5.52710D-01    |proj g|=  1.17105D-02

At iterate  500    f=  5.52703D-01    |proj g|=  1.55752D-03

At iterate  550    f=  5.52700D-01    |proj g|=  8.17629D-04

At iterate  600    f=  5.52699D-01    |proj g|=  1.26905D-03

At iterate  650    f=  5.52698D-01    |proj g|=  4.17665D-03

At iterate  700    f=  5.52698D-01    |proj g|=  2.51417D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments 

### Deep Neural Network

In [7]:
# Deep neural network, implemented in the project-local neuralNetwork module.
import neuralNetwork

# The model is given the full combined frame plus the 'depressed' column as the target.
# NOTE(review): combined_df still contains 'depressed' and 'pid' at this point;
# presumably the model class removes them from the features — confirm in the module.
NN = neuralNetwork.DeepNeuralNetworkModel(combined_df, combined_df['depressed'])

# Run the model; the TensorFlow start-up messages and epoch log below come from this call.
NN.run()

2024-10-04 19:37:29.552846: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-04 19:37:29.684744: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-04 19:37:29.685725: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-04 19:37:33.387761: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-04 19:37:33.388462: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are 

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Random Forest

In [8]:
# Random-forest classifier, implemented in the project-local randomForest module.
import randomForest
# Re-import the module so edits made to it since the first import in this kernel take effect.
importlib.reload(randomForest)

# The model is given the full combined frame plus the 'depressed' column as the target.
# NOTE(review): combined_df still contains 'depressed' and 'pid' at this point;
# presumably the model class removes them from the features — confirm in the module.
RF = randomForest.RandomForestModel(combined_df, combined_df['depressed'])

# Run the model; the validation and test reports recorded below are printed by this call.
RF.run()

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    4.8s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s


Validation Accuracy: 0.74
Validation Confusion Matrix:
[[11954   185]
 [ 3995   179]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.98      0.85     12139
           1       0.49      0.04      0.08      4174

    accuracy                           0.74     16313
   macro avg       0.62      0.51      0.47     16313
weighted avg       0.68      0.74      0.65     16313




[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s


Test Accuracy: 0.75
Test Confusion Matrix:
[[12043   193]
 [ 3883   194]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.98      0.86     12236
           1       0.50      0.05      0.09      4077

    accuracy                           0.75     16313
   macro avg       0.63      0.52      0.47     16313
weighted avg       0.69      0.75      0.66     16313




### Support Vector Machine

In [9]:
# Support vector machine, implemented in the project-local supportVectorMachine module.
import supportVectorMachine

# The model is given the full combined frame plus the 'depressed' column as the target.
# NOTE(review): combined_df still contains 'depressed' and 'pid' at this point;
# presumably the model class removes them from the features — confirm in the module.
SVM = supportVectorMachine.SVMModel(combined_df, combined_df['depressed'])

# Run the model; the LibSVM log and the validation/test reports below are printed by this call.
SVM.run()

[LibSVM]............................................................................................*..............................................................................................................*..*
optimization finished, #iter = 203615
obj = -23957.464978, rho = -0.598052
nSV = 30188, nBSV = 19940
Total nSV = 30188
Validation Accuracy: 0.75
Validation Confusion Matrix:
[[12102    37]
 [ 4108    66]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.85     12139
           1       0.64      0.02      0.03      4174

    accuracy                           0.75     16313
   macro avg       0.69      0.51      0.44     16313
weighted avg       0.72      0.75      0.64     16313


Test Accuracy: 0.75
Test Confusion Matrix:
[[12192    44]
 [ 4022    55]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86     12236
      