In [1]:
# Install the third-party packages into the running kernel's environment
# (%pip targets the kernel's interpreter rather than whatever `pip` is on PATH).
# NOTE(review): versions are not pinned, so a re-run may pull different releases
# than the ones that produced the recorded outputs — consider pinning.
%pip install pandas numpy scikit-learn tensorflow tqdm

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# Models

The models currently implemented are:

* Supervised
    1. Logistic Regression Model
    2. Random Forest Classifier
    3. Support Vector Machine
    4. Deep Neural Network
    5. (New) Bayesian Network
    6. (New) Gradient Boosting

* Unsupervised
    1. (New) K-Means Clustering
    2. (New) Kernel Density Estimation

In [1]:
import importlib
import pandas as pd

# Column-name suffixes of the ten CESD-10 scale items. Each wave's columns carry
# a "w<N>" prefix in front of these suffixes (e.g. "w1_a_emobth").
cesd_col_names = [
    "_a_emobth",
    "_a_emomnd",
    "_a_emodep",
    "_a_emoeff",
    "_a_emohope",
    "_a_emofear",
    "_a_emoslp",
    "_a_emohap",
    "_a_emolone",
    "_a_emogo",
]

# Build the fully-prefixed column lists for waves 1 through 5 in one pass and
# unpack them into the per-wave names used elsewhere in the notebook.
(
    cesd_col_names_w1,
    cesd_col_names_w2,
    cesd_col_names_w3,
    cesd_col_names_w4,
    cesd_col_names_w5,
) = (
    [wave_prefix + suffix for suffix in cesd_col_names]
    for wave_prefix in ("w1", "w2", "w3", "w4", "w5")
)

### Prepping dataset

In [3]:
# Wave 1: read the labelled export and remove the ten CESD-10 item columns
# together with the 'score' column in a single drop.
# NOTE(review): presumably these columns are removed because the target label is
# derived from them — confirm against the labelling step.
df1 = pd.read_csv("CSV/wave1_select_labelled.csv").drop(
    columns=cesd_col_names_w1 + ['score']
)
print(df1.describe())

# Wave 2: same treatment, using the wave-2 column names.
df2 = pd.read_csv("CSV/wave2_select_labelled.csv").drop(
    columns=cesd_col_names_w2 + ['score']
)

# Stack the two waves row-wise (the default axis) under a fresh 0..n-1 index.
combined_df = pd.concat([df1, df2], ignore_index=True)

# print(combined_df.describe())

                 pid           age        gender          race  \
count   14156.000000  14156.000000  14156.000000  14156.000000   
mean   315191.954436     38.544080      0.404069      0.772393   
std     14510.803162     17.934095      0.490728      1.499715   
min    301012.000000     13.000000      0.000000      0.000000   
25%    306547.750000     23.000000      0.000000      0.000000   
50%    313411.500000     35.000000      0.000000      0.000000   
75%    319369.250000     51.000000      1.000000      0.000000   
max    391310.000000    102.000000      1.000000      4.000000   

       marital_status  born_province      employed  employed_take_home  \
count    14156.000000   14156.000000  14156.000000        14156.000000   
mean         1.764835       2.306513      0.490534            0.228596   
std          0.994657       3.184596      0.859289            1.060856   
min          0.000000       0.000000      0.000000            0.000000   
25%          1.000000       0.00000

### Logistic Regression

In [3]:
# Fit and evaluate the logistic-regression model on the combined wave 1 + wave 2
# frame. `logisticRegression` is not one of the pip-installed packages, so it is
# presumably a module that lives next to this notebook.
import logisticRegression

# NOTE(review): the first argument is the full frame, which still contains the
# 'depressed' column that is also passed as the label. Presumably the model class
# separates the label (and identifiers such as 'pid') from the features
# internally — confirm against the module's source.
LR = logisticRegression.LogisticRegressionModel(combined_df, combined_df['depressed'])

# Judging by the recorded output, run() reports validation and test metrics and
# then the best hyper-parameters found.
LR.run()

Validation Accuracy: 0.71
Validation Confusion Matrix:
[[3957  117]
 [1541  134]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.97      0.83      4074
           1       0.53      0.08      0.14      1675

    accuracy                           0.71      5749
   macro avg       0.63      0.53      0.48      5749
weighted avg       0.67      0.71      0.63      5749

Test Accuracy: 0.71
Test Confusion Matrix:
[[3940  121]
 [1528  160]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.97      0.83      4061
           1       0.57      0.09      0.16      1688

    accuracy                           0.71      5749
   macro avg       0.64      0.53      0.49      5749
weighted avg       0.68      0.71      0.63      5749

Best Parameters: {'logisticregression__C': 10, 'logisticregression__penalty': 'l1'}
Best Score: 0.7186757598295662


### Deep Neural Network

In [4]:
# Fit and evaluate the deep-neural-network model on the combined frame.
# `neuralNetwork` is presumably a module that lives next to this notebook.
import neuralNetwork

# NOTE(review): the full frame (still containing 'depressed') is passed as the
# first argument and the same column as the label; presumably the model class
# splits features from the label internally — confirm.
NN = neuralNetwork.DeepNeuralNetworkModel(combined_df, combined_df['depressed'])

# The recorded run printed validation/test metrics and then started a search of
# 216 candidates x 3 folds (648 fits), which was stopped with a
# KeyboardInterrupt — no tuned result is recorded for this model.
NN.run()

2024-10-02 18:14:55.097329: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-02 18:14:55.173384: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-02 18:14:55.174856: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-02 18:15:03.471707: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-02 18:15:03.472373: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are 

Validation Accuracy: 0.69
Validation Confusion Matrix:
[[3687  387]
 [1394  281]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.91      0.81      4074
           1       0.42      0.17      0.24      1675

    accuracy                           0.69      5749
   macro avg       0.57      0.54      0.52      5749
weighted avg       0.64      0.69      0.64      5749

Test Accuracy: 0.70
Test Confusion Matrix:
[[3685  376]
 [1368  320]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.91      0.81      4061
           1       0.46      0.19      0.27      1688

    accuracy                           0.70      5749
   macro avg       0.59      0.55      0.54      5749
weighted avg       0.65      0.70      0.65      5749

Fitting 3 folds for each of 216 candidates, totalling 648 fits


2024-10-02 18:17:48.532403: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-10-02 18:17:49.101915: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-10-02 18:17:49.315709: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would li

KeyboardInterrupt: 

### Random Forest

In [4]:
# Fit and evaluate the random-forest model on the combined frame.
# `randomForest` is presumably a module that lives next to this notebook.
import randomForest
# Re-execute the module so edits to its source take effect without restarting
# the kernel (a plain `import` is a no-op once the module is already loaded).
importlib.reload(randomForest)

# NOTE(review): the full frame (still containing 'depressed') is passed as the
# first argument and the same column as the label; presumably the model class
# splits features from the label internally — confirm.
RF = randomForest.RandomForestModel(combined_df, combined_df['depressed'])

# The recorded run printed validation/test metrics and was then interrupted
# (KeyboardInterrupt) at 54 of 108 parameter combinations.
RF.run()

Validation Accuracy: 0.71
Validation Confusion Matrix:
[[3921  153]
 [1494  181]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.96      0.83      4074
           1       0.54      0.11      0.18      1675

    accuracy                           0.71      5749
   macro avg       0.63      0.54      0.50      5749
weighted avg       0.67      0.71      0.64      5749

Test Accuracy: 0.71
Test Confusion Matrix:
[[3919  142]
 [1514  174]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.97      0.83      4061
           1       0.55      0.10      0.17      1688

    accuracy                           0.71      5749
   macro avg       0.64      0.53      0.50      5749
weighted avg       0.67      0.71      0.63      5749



Testing: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}:  50%|█████     | 54/108 [13:44<13:44, 15.26s/it]


KeyboardInterrupt: 

### Support Vector Machine

In [4]:
# Fit and evaluate the support-vector-machine model on the combined frame.
# `supportVectorMachine` is presumably a module that lives next to this notebook.
import supportVectorMachine

# NOTE(review): the full frame (still containing 'depressed') is passed as the
# first argument and the same column as the label; presumably the model class
# splits features from the label internally — confirm.
SVM = supportVectorMachine.SVMModel(combined_df, combined_df['depressed'])

# Judging by the recorded output, run() reports validation/test metrics and then
# starts a 144-step progress bar.
SVM.run()

Validation Accuracy: 0.71
Validation Confusion Matrix:
[[3981   93]
 [1577   98]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.98      0.83      4074
           1       0.51      0.06      0.11      1675

    accuracy                           0.71      5749
   macro avg       0.61      0.52      0.47      5749
weighted avg       0.66      0.71      0.62      5749

Test Accuracy: 0.71
Test Confusion Matrix:
[[3979   82]
 [1577  111]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.98      0.83      4061
           1       0.58      0.07      0.12      1688

    accuracy                           0.71      5749
   macro avg       0.65      0.52      0.47      5749
weighted avg       0.67      0.71      0.62      5749



  0%|          | 0/144 [00:00<?, ?it/s]