In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import warnings

warnings.filterwarnings('ignore')
sns.set(style="whitegrid")


### Load the Preprocessed Data

In [33]:

df = pd.read_csv('Data/processed/FC110568_Dilini/preprocessed_data.csv')

print("DataFrame Dimensions")
print("------------------------")
print(f"   Rows   : {df.shape[0]}")
print(f"   Columns: {df.shape[1]}")

DataFrame Dimensions
------------------------
   Rows   : 27889
   Columns: 13


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27889 entries, 0 to 27888
Data columns (total 13 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Gender                                 27889 non-null  object 
 1   Age                                    27889 non-null  float64
 2   Academic Pressure                      27889 non-null  float64
 3   CGPA                                   27889 non-null  float64
 4   Study Satisfaction                     27889 non-null  float64
 5   Sleep Duration                         27889 non-null  float64
 6   Dietary Habits                         27889 non-null  object 
 7   Degree                                 27889 non-null  object 
 8   Have you ever had suicidal thoughts ?  27889 non-null  object 
 9   Work/Study Hours                       27889 non-null  float64
 10  Financial Stress                       27889 non-null  float64
 11  Fa

In [35]:
# Convert all object columns to category dtype in df
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype('category')

# Check dtypes to confirm
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27889 entries, 0 to 27888
Data columns (total 13 columns):
 #   Column                                 Non-Null Count  Dtype   
---  ------                                 --------------  -----   
 0   Gender                                 27889 non-null  category
 1   Age                                    27889 non-null  float64 
 2   Academic Pressure                      27889 non-null  float64 
 3   CGPA                                   27889 non-null  float64 
 4   Study Satisfaction                     27889 non-null  float64 
 5   Sleep Duration                         27889 non-null  float64 
 6   Dietary Habits                         27889 non-null  category
 7   Degree                                 27889 non-null  category
 8   Have you ever had suicidal thoughts ?  27889 non-null  category
 9   Work/Study Hours                       27889 non-null  float64 
 10  Financial Stress                       27889 non-null  flo

## Encoding

For machine learning, we need numeric features. So the Encoding is the process of converting text into numbers.

**Ordinal Encoding**

 The 'Degree' feature has a natural ordinal order (e.g., 'Class 12' < 'BA' < 'BSc' < 'BCA' < 'B.Pharm' < 'M.Tech' < 'MSc' < 'MD'), so it will be encoded using ordinal encoding to preserve this order for machine learning models.

 - School (Class 12) -> 0
- Bachelor's (B.*) -> 1
- Master's (M.*) -> 2
- Doctorate (PhD) -> 3
- Other -> 4

In [36]:
# Define updated function to group degrees more accurately
def simplify_degree(degree):
    if pd.isnull(degree):
        return 'Other'
    
    degree = degree.strip().lower()

    # School level
    if 'class 12' in degree:
        return 'School'
    
    # Doctorate level
    elif 'phd' in degree:
        return 'Doctorate'

    # Master’s level
    elif degree.startswith('m') or degree in ['msc', 'm.tech', 'mca', 'mba', 'm.ed', 'mhm']:
        return 'Master'

    # Bachelor's level
    elif degree.startswith('b') or degree in ['bsc', 'ba', 'bca', 'bba', 'b.com', 'b.tech', 'b.ed', 'b.arch', 'b.pharm']:
        return 'Bachelor'

    # Everything else
    else:
        return 'Other'

# Apply function
df['Degree_Grouped'] = df['Degree'].apply(simplify_degree)

#  Map to ordinal encoding
degree_order = {
    'School': 0,
    'Bachelor': 1,
    'Master': 2,
    'Doctorate': 3,
    'Other': 4
}
df['Degree_Encoded'] = df['Degree_Grouped'].map(degree_order)

# Preview result
print(df[['Degree', 'Degree_Grouped', 'Degree_Encoded']].head(20))


        Degree Degree_Grouped  Degree_Encoded
0      B.Pharm       Bachelor               1
1          BSc       Bachelor               1
2           BA       Bachelor               1
3          BCA       Bachelor               1
4       M.Tech         Master               2
5          PhD      Doctorate               3
6          BSc       Bachelor               1
7   'Class 12'         School               0
8         B.Ed       Bachelor               1
9          LLB          Other               4
10  'Class 12'         School               0
11          BE       Bachelor               1
12      M.Tech         Master               2
13  'Class 12'         School               0
14  'Class 12'         School               0
15      M.Tech         Master               2
16        M.Ed         Master               2
17  'Class 12'         School               0
18  'Class 12'         School               0
19        B.Ed       Bachelor               1


In [39]:
df.drop(columns=['Degree', 'Degree_Grouped'], inplace=True)

**One - hot Encoding**

In [40]:
# One-hot encode all categorical columns except 'Degree'
categorical_to_encode = ['Gender', 'Dietary Habits', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']

df_encoded = pd.get_dummies(df, columns=categorical_to_encode, drop_first=True)

print(df_encoded.head())

    Age  Academic Pressure  CGPA  Study Satisfaction  Sleep Duration  \
0  33.0                5.0  8.97                 2.0             5.5   
1  24.0                2.0  5.90                 5.0             5.5   
2  31.0                3.0  7.03                 5.0             5.0   
3  28.0                3.0  5.59                 2.0             7.5   
4  25.0                4.0  8.13                 3.0             5.5   

   Work/Study Hours  Financial Stress  Depression  Degree_Encoded  \
0               3.0               1.0           1               1   
1               3.0               2.0           0               1   
2               9.0               1.0           0               1   
3               4.0               5.0           1               1   
4               1.0               1.0           0               2   

   Gender_Male  Dietary Habits_Moderate  Dietary Habits_Unhealthy  \
0         True                    False                     False   
1        False

In [38]:
print(df.isnull().sum())

Gender                                   0
Age                                      0
Academic Pressure                        0
CGPA                                     0
Study Satisfaction                       0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
Degree_Grouped                           0
Degree_Encoded                           0
dtype: int64
