# Accessing the datasets

Files are originally in .dta files for use in Stata. Using the pandas library, they have been converted into .csv files for use in python.

In [None]:
import pandas as pd

data = pd.read_stata("Wave1/Adult_W1_Anon_V7.0.0.dta")
data.to_csv("wave1.csv", index=False)

data = pd.read_stata("Wave2/Adult_W2_Anon_V4.0.0.dta")
data.to_csv("wave2.csv", index=False)

data = pd.read_stata("Wave3/Adult_W3_Anon_V3.0.0.dta")
data.to_csv("wave3.csv", index=False)

data = pd.read_stata("Wave4/Adult_W4_Anon_V2.0.0.dta")
data.to_csv("wave4.csv", index=False)

data = pd.read_stata("Wave5/Adult_W5_Anon_V1.0.0.dta")
data.to_csv("wave5.csv", index=False)

# Preprocessing

Each wave must be preprocessed. This will include:
- Assigning labels according to the CESD-10 reporting scale
- Discretizing/classing continuous variables
- Feature Selection/Engineering
- Normalisation/Scaling

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


df = pd.read_csv("wave1.csv")

# Selecting column names for CESD-10 Scale related features
cesd_col_names = ["_a_emobth", "_a_emomnd", "_a_emodep", "_a_emoeff", "_a_emohope",
                "_a_emofear", "_a_emoslp", "_a_emohap", "_a_emolone", "_a_emogo"]

desc_col_names = ["w1_a_gen", "w1_a_dob_y", "w1_a_popgrp", "w1_a_marstt", "w1_a_mthali", "w1_a_em1", "w1_a_brnprov",
                  "w1_a_em1hrs", "w1_a_ems", "w1_a_emshrs", "w1_a_owncom", "w1_a_owncel", "w1_a_edschgrd",
                  "w1_a_edter", "w1_a_ed07sub", "w1_a_ed08cur", "w1_a_ed08cursub", "w1_a_hldes", "w1_a_hlcon",
                  ]

new_df = pd.DataFrame({})

# Rename specific columns
df = df.rename(columns={'w1_a_gen': 'gender',
                        'w1_a_dob_y': 'date_of_birth',
                        'w1_a_popgrp': 'race',
                        'w1_a_marstt': 'marital_status',
                        'w1_a_brnprov': 'living_province',
                        'w1_a_mthali': 'parents_alive',
                        'w1_a_em1': 'employed',
                        'w1_a_edschgrd': 'highest_grade_school',
                        'w1_a_edter': 'tertiary_education',
                        'w1_a_ed07att': 'attended_courses',
                        'w1_a_ed08cur': 'currently_enrolled'})

# Map the values to integers
df['gender'] = df['gender'].map({'Female': 0, 'Male': 1}).fillna(-1).astype(int)
new_df['gender'] = df['gender']

# Factorize the column, getting the integer codes and the unique values
# Replace -1 (which is the default code for NaN in factorize) with any value you want (e.g., -1)
df['race'] = df['race'].replace(pd.NA, 'Missing')
df['race'], unique_values = pd.factorize(df['race'])
new_df['race'] = df['race']

# Factorize the column, getting the integer codes and the unique values
df['marital_status'] = df['marital_status'].replace(pd.NA, 'Missing')
df['marital_status'], unique_values = pd.factorize(df['marital_status'])
new_df['marital_status'] = df['marital_status']

# Factorize the column, getting the integer codes and the unique values
df['living_province'] = df['living_province'].replace(pd.NA, 'Missing')
df['living_province'], unique_values = pd.factorize(df['living_province'])
new_df['living_province'] = df['living_province']

# Factorize the column, getting the integer codes and the unique values
df['parents_alive'] = df['parents_alive'].replace(pd.NA, 'Missing')
df['parents_alive'], unique_values = pd.factorize(df['parents_alive'])
new_df['parents_alive'] = df['parents_alive']

# Factorize the column, getting the integer codes and the unique values
# df['highest_grade_school'] = df['highest_grade_school'].replace(pd.NA, 'Missing')
# df['highest_grade_school'] = df['highest_grade_school'].replace('Not Applicable', 'Missing')
df['highest_grade_school'], unique_values = pd.factorize(df['highest_grade_school'])
new_df['highest_grade_school'] = df['highest_grade_school']

df['tertiary_education'], unique_values = pd.factorize(df['tertiary_education'])
new_df['tertiary_education'] = df['tertiary_education']

df['attended_courses'], unique_values = pd.factorize(df['attended_courses'])
new_df['attended_courses'] = df['attended_courses']

df['currently_enrolled'], unique_values = pd.factorize(df['currently_enrolled'])
new_df['currently_enrolled'] = df['currently_enrolled']

for i in cesd_col_names:
    new_df[f"w1{i}"] = df[f"w1{i}"]

new_df['w1_a_outcome'] = df['w1_a_outcome']
new_df['pid'] = df['pid']

# Assuming 'df' is your pandas DataFrame
new_df.to_csv('wave1_select.csv', index=False)

# print(df["highest_grade_school"].unique())

# Factorize the column, getting the integer codes and the unique values
# df['employed'] = df['employed'].replace(pd.NA, 'Missing')
# df['employed'], unique_values = pd.factorize(df['employed'])
# new_df['employed'] = df['employed']


  df = pd.read_csv("wave1.csv")


### Feature Selection

Each of the 5 waves will be converted into pandas dataframes. Waves 1, 3 and 5 will be used as training data. Waves 2 and 4 will be validation and testing data respectively.

Feature selection has been done as follows:
- Features relating to the CESD-10 reporting scale are selected.

In [8]:
import pandas as pd
from typing import List, Dict, Tuple
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

'''
Creating a class for each wave.
'''

class Wave:
    def __init__(self, data: pd.DataFrame, select_cols: list):
        self.data : pd.DataFrame = data
        self.set_select_cols : List[str] = select_cols

    def set_select_cols(self, select_cols: list):
        self.set_select_cols = select_cols
    
    def get_select_cols(self):
        return self.set_select_cols
    
    def __str__(self):
        return str(self.data)

waves: List[Wave] = []

# Selecting column names for CESD-10 Scale related features
cesd_col_names = ["_a_emobth", "_a_emomnd", "_a_emodep", "_a_emoeff", "_a_emohope",
                "_a_emofear", "_a_emoslp", "_a_emohap", "_a_emolone", "_a_emogo"]

desc_col_names = ["w1_a_gen", "w1_a_dob_y", "w1_a_popgrp", "w1_a_marstt", "w1_a_mthali", "w1_a_em1",
                  "w1_a_em1hrs", "w1_a_ems", "w1_a_emshrs", "w1_a_owncom", "w1_a_owncel", "w1_a_edschgrd",
                  "w1_a_edter", "w1_a_ed07sub", "w1_a_ed08cur", "w1_a_ed08cursub", "w1_a_hldes", "w1_a_hlcon",
                  ]

'''
Running through each wave
'''
for i in range(1, 2):
    url = 'wave' + str(i) + '_select.csv'
    print(url)
    data = pd.read_csv(url)

    # Header text for each column based on wave
    header = 'w' + str(i)
    
    select_cols = []
    for i in cesd_col_names:
        select_cols.append(header + i)

    # Drop rows where Interview Outcome is 'Refused/Not Available'
    outcome_str = header + '_a_outcome'
    new_data = data[data[outcome_str] == 'Successfully Interviewed']

    '''
    In some cases, despite accepting the interview, there are 3 cases leading to incomplete data:
        1. Some participants might have refused certain questions (Refused)
        2. Some participants might not have placed answers on the survey (Missing)
        3. Some participants outlined that they did not have an answer that aligned with any of the options (Don't know)

    These participants will be dropped from the dataset
    '''

    cesd_valid_answers = ['Rarely or none of the time (less than 1 day)',
                          'Some or little of the time (1-2 days)',
                          'Occasionally or a moderate amount of time (3-4 days)',
                          'All of the time (5-7 days)']
    count = 1
    for i in select_cols:
        # new_data = new_data[new_data[i] != 'Missing']
        # new_data = new_data[new_data[i] != 'Refused']
        # new_data = new_data[new_data[i] != "Don't know"]
        new_data = new_data[new_data[i].isin(cesd_valid_answers)]
        new_data.dropna()

        # Questions 5 and 8 have reverse scoring
        if (count == 5 or count == 8):
            new_data[i] = new_data[i].replace('Rarely or none of the time (less than 1 day)', 3)
            new_data[i] = new_data[i].replace('Some or little of the time (1-2 days)', 2)
            new_data[i] = new_data[i].replace('Occasionally or a moderate amount of time (3-4 days)', 1)
            new_data[i] = new_data[i].replace('All of the time (5-7 days)', 0)
        else:
            new_data[i] = new_data[i].replace('Rarely or none of the time (less than 1 day)', 0)
            new_data[i] = new_data[i].replace('Some or little of the time (1-2 days)', 1)
            new_data[i] = new_data[i].replace('Occasionally or a moderate amount of time (3-4 days)', 2)
            new_data[i] = new_data[i].replace('All of the time (5-7 days)', 3)
        
        count += 1
        
    wave = Wave(new_data, select_cols)
    waves.append(wave)

wave1_select.csv


### Labelling

Each participant was **labelled according to the CESD-10 reporting scale**.

In [17]:
from tqdm import tqdm
import pandas as pd

# Selecting column names for CESD-10 Scale related features
cesd_col_names = ["_a_emobth", "_a_emomnd", "_a_emodep", "_a_emoeff", "_a_emohope",
                "_a_emofear", "_a_emoslp", "_a_emohap", "_a_emolone", "_a_emogo"]

for wave in range(len(waves)):
    print(f"Wave {wave+1}:")
    data: pd.DataFrame = waves[wave].data
    select_cols = waves[wave].get_select_cols()

    # Dictionary to store the scores and depression status
    scores: Dict[str, Dict] = dict()

    # Series containing participant IDs
    participants = data['pid']

    # Counter for the number of participants flagged as depressed
    count_depressed = 0

    # Iterate over each participant
    for participant in tqdm(participants, desc="Labelling Participants"):
        score = 0
        depressed = False

        idx = data.index[data['pid'] == participant]

        # Sum the scores for all relevant columns
        for col in select_cols:
            value = data.at[idx[0], col]  # Accessing the value using participant ID and column name
            score += value

        # Determine if the participant is depressed based on the score
        if score >= 10:
            depressed = True
            count_depressed += 1

        # Map the participant ID to their score and depression status
        scores[participant] = {'score': int(score), 'Depressed': depressed}
    
    data['depression_score'] = 0

    for i in cesd_col_names:
        data['depression_score'] += data[f"w{wave+1}{i}"]
    
    # Create a new column 'new_column' based on a condition
    data['depressed'] = data['depression_score'].apply(lambda x: 1 if x >= 10 else 0)

    print(list(data['depressed']).count(1))

    data.to_csv('wave1_select_labelled.csv')
    # Print the total number of depressed participants
    print(f"Total Depressed Participants: {count_depressed}")
    print(f"Calculated Prevalence for Depression: {round(count_depressed/len(participants) * 100, 2)}%")



Wave 1:


Labelling Participants: 100%|██████████| 15342/15342 [00:02<00:00, 5285.84it/s]


5185
Total Depressed Participants: 5185
Calculated Prevalence for Depression: 33.8%


# Exploratory Data Analysis and Visualisation

# Models

The current models being implemented are namely:

* Supervised
    1. Logistic Regression Model
    2. Random Forest Classifier
    3. Support Vector Machine
    4. Deep Neural Network
    5. (New) Bayesian Network
    6. (New) Gradient Boosting

* Unsupervised
    1. (New) K-Means Clustering
    2. (New) Kernel Density Estimation

In [18]:
df = pd.read_csv('wave1_select_labelled.csv')

# First, separate the target variable (label) from the features
# Assuming you have a target column in your original dataframe called 'depression_label' that indicates depression status
# If not, make sure to create that before splitting
y = df['depressed']  # Replace with your actual target column name
X = df.drop(columns=['depressed', 'w1_a_outcome', 'pid'])  # The features you've created

# Now split the data into train+validation and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now split the train+validation set into actual train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

print(len(X_train))

# This gives you 60% train, 20% validation, and 20% test splits

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Train the model on the training data
log_reg.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = log_reg.predict(X_val)

# Predict on the test set
y_test_pred = log_reg.predict(X_test)

# Accuracy
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# Confusion Matrix
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
print("Validation Confusion Matrix:")
print(val_conf_matrix)

# Classification Report
val_class_report = classification_report(y_val, y_val_pred)
print("Validation Classification Report:")
print(val_class_report)

# Accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Confusion Matrix
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Test Confusion Matrix:")
print(test_conf_matrix)

# Classification Report
test_class_report = classification_report(y_test, y_test_pred)
print("Test Classification Report:")
print(test_class_report)

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
 
# Create a pipeline with scaler and logistic regression
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, solver='saga', tol=0.1))
 
# Create a parameter grid
param_grid = {
    'logisticregression__C': [0.1, 1, 10, 100],
    'logisticregression__penalty': ['l1', 'l2']
}
 
# Create GridSearchCV object
grid_search = GridSearchCV(pipe, param_grid, cv=5)
 
# Fit the model
grid_search.fit(X_train, y_train)
 
# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


9204


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Accuracy: 1.00
Validation Confusion Matrix:
[[2020    2]
 [   4 1043]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2022
           1       1.00      1.00      1.00      1047

    accuracy                           1.00      3069
   macro avg       1.00      1.00      1.00      3069
weighted avg       1.00      1.00      1.00      3069

Test Accuracy: 1.00
Test Confusion Matrix:
[[2030    2]
 [   5 1032]]
Test Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2032
           1       1.00      1.00      1.00      1037

    accuracy                           1.00      3069
   macro avg       1.00      1.00      1.00      3069
weighted avg       1.00      1.00      1.00      3069

Best Parameters: {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l1'}
Best Score: 1.0
