### Training the Decision Tree Based Model

In [12]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [13]:
# Load the preprocessed dataset from <cwd>/Data/processed/FC110572_MethuliMenya/.
# The path is built from the current working directory, so this cell expects the
# kernel to be started from the directory that contains the "Data" folder.
file_path = os.path.join(
    os.getcwd(),
    "Data", "processed", "FC110572_MethuliMenya",
    "preprocessed_data.csv",
)
processed = pd.read_csv(file_path)

# Banner first, then the column / non-null count / dtype overview of the frame.
print(f"   Information of the processed dataset \n ===================================\n")
processed.info()

   Information of the processed dataset 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27788 entries, 0 to 27787
Data columns (total 13 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Gender                            27788 non-null  object 
 1   Age                               27788 non-null  float64
 2   Academic Pressure                 27788 non-null  float64
 3   CGPA                              27788 non-null  float64
 4   Study Satisfaction                27788 non-null  float64
 5   Suicidal Thoughts                 27788 non-null  int64  
 6   Work/Study Hours                  27788 non-null  float64
 7   Financial Stress                  27788 non-null  float64
 8   Family History of Mental Illness  27788 non-null  int64  
 9   Depression                        27788 non-null  int64  
 10  SleepScore                        27788 non-null  int64  
 11  DietScore                

In [14]:
# Convert every object-dtype column to the categorical dtype.
# The column list is computed once up front; each column is then replaced
# on `processed` itself.
object_columns = processed.select_dtypes(include='object').columns
for col in object_columns:
    processed[col] = processed[col].astype('category')

# Print the summary again so the dtype change is visible.
processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27788 entries, 0 to 27787
Data columns (total 13 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   Gender                            27788 non-null  category
 1   Age                               27788 non-null  float64 
 2   Academic Pressure                 27788 non-null  float64 
 3   CGPA                              27788 non-null  float64 
 4   Study Satisfaction                27788 non-null  float64 
 5   Suicidal Thoughts                 27788 non-null  int64   
 6   Work/Study Hours                  27788 non-null  float64 
 7   Financial Stress                  27788 non-null  float64 
 8   Family History of Mental Illness  27788 non-null  int64   
 9   Depression                        27788 non-null  int64   
 10  SleepScore                        27788 non-null  int64   
 11  DietScore                         27788 non-null  int6

#### 1. Splitting the Data to Isolate the Test and Validation Sets

In [15]:
target = "Depression"
features = processed.columns.drop(target)

# Hold out 15% of the preprocessed data as the test set; the remaining 85% is
# the full training pool. Stratifying on the target keeps the class proportions
# the same in both parts, and the fixed random_state makes the split repeatable.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    processed[features],
    processed[target],
    test_size=0.15,
    stratify=processed[target],
    random_state=123,
)

# Carve a 15% validation set out of the training pool only.
# The labels and the stratification vector must be y_train_full, which is
# row-aligned with X_train_full. Passing the full processed[target] here would
# hand train_test_split arrays of different lengths, which it rejects with a
# ValueError.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.15,
    stratify=y_train_full,
    random_state=123,
)

In [16]:
# Relative frequency of each target class in the training pool and the test set.
train_class_share = y_train_full.value_counts(normalize=True)
test_class_share = y_test.value_counts(normalize=True)

# Take the share of class label 1 and express it as a percentage.
train_depressed_pct = train_class_share[1] * 100
test_depressed_pct = test_class_share[1] * 100

# With a stratified split the two figures are expected to be (nearly) equal.
print(f"Depressed percentage in training set: {train_depressed_pct:.2f}%")
print(f"Depressed percentage in test set: {test_depressed_pct:.2f}%")

Depressed percentage in training set: 58.58%
Depressed percentage in test set: 58.58%


#### 2. Label Encoding the Gender and Degree columns

In [17]:
# One label encoder per categorical feature, kept in separate variables so each
# fitted mapping stays available after this cell.
le_degree = LabelEncoder()
le_gender = LabelEncoder()

# Fit the encoders on the training pool only, so the label -> integer mapping is
# learned without looking at the test rows.
# `assign` builds a new frame instead of writing columns into the frame that
# train_test_split returned; this avoids chained-assignment warnings that some
# pandas / scikit-learn version combinations emit for in-place column writes.
X_train_full = X_train_full.assign(
    Degree_Cleaned=le_degree.fit_transform(X_train_full['Degree_Cleaned']),
    Gender=le_gender.fit_transform(X_train_full['Gender']),
)

# Apply the already-fitted mappings to the test set (transform only, no refit).
# LabelEncoder.transform raises ValueError for a label it did not see during fit.
X_test = X_test.assign(
    Degree_Cleaned=le_degree.transform(X_test['Degree_Cleaned']),
    Gender=le_gender.transform(X_test['Gender']),
)

# NOTE(review): X_train / X_val from the earlier split are not encoded by this
# cell -- confirm that later cells do not rely on them being numeric.