## Import Initial Libararies

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import random
import warnings

import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import squarify
%matplotlib inline

from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from category_encoders import TargetEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, HistGradientBoostingClassifier

warnings.filterwarnings("ignore")

## Import Data

In [2]:
df_train = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv")
df_test = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")

## General View of The Data

In [3]:
df_train.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [4]:
df_train.tail()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
140695,140695,Vidya,Female,18.0,Ahmedabad,Working Professional,,,5.0,,,4.0,5-6 hours,Unhealthy,Class 12,No,2.0,4.0,Yes,1
140696,140696,Lata,Female,41.0,Hyderabad,Working Professional,Content Writer,,5.0,,,4.0,7-8 hours,Moderate,B.Tech,Yes,6.0,5.0,Yes,0
140697,140697,Aanchal,Female,24.0,Kolkata,Working Professional,Marketing Manager,,3.0,,,1.0,More than 8 hours,Moderate,B.Com,No,4.0,4.0,No,0
140698,140698,Prachi,Female,49.0,Srinagar,Working Professional,Plumber,,5.0,,,2.0,5-6 hours,Moderate,ME,Yes,10.0,1.0,No,0
140699,140699,Sai,Male,27.0,Patna,Student,,4.0,,9.24,1.0,,Less than 5 hours,Healthy,BCA,Yes,2.0,3.0,Yes,1


In [5]:
df_train.columns

Index(['id', 'Name', 'Gender', 'Age', 'City',
       'Working Professional or Student', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression'],
      dtype='object')

In [6]:
df_train.describe()

Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress,Depression
count,140700.0,140700.0,27897.0,112782.0,27898.0,27897.0,112790.0,140700.0,140696.0,140700.0
mean,70349.5,40.388621,3.142273,2.998998,7.658636,2.94494,2.974404,6.252679,2.988983,0.181713
std,40616.735775,12.384099,1.380457,1.405771,1.464466,1.360197,1.416078,3.853615,1.413633,0.385609
min,0.0,18.0,1.0,1.0,5.03,1.0,1.0,0.0,1.0,0.0
25%,35174.75,29.0,2.0,2.0,6.29,2.0,2.0,3.0,2.0,0.0
50%,70349.5,42.0,3.0,3.0,7.77,3.0,3.0,6.0,3.0,0.0
75%,105524.25,51.0,4.0,4.0,8.92,4.0,4.0,10.0,4.0,0.0
max,140699.0,60.0,5.0,5.0,10.0,5.0,5.0,12.0,5.0,1.0


In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  object 
 2   Gender                                 140700 non-null  object 
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object 
 5   Working Professional or Student        140700 non-null  object 
 6   Profession                             104070 non-null  object 
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure                          112782 non-null  float64
 9   CGPA                                   27898 non-null   float64
 10  Study Satisfaction                     27897 non-null   

## Data Renaming

In [8]:
rename_dict = {
    "Working Professional or Student": "Employment",
    "Have you ever had suicidal thoughts ?": "Suicidal thoughts",
    "Work/Study Hours": "WS hours",
    "Family History of Mental Illness": "Family illness"
}

df_train.rename(columns=rename_dict, inplace=True)
df_test.rename(columns=rename_dict, inplace=True)

## Data Unification

In [9]:
columns = ["Sleep Duration", "Dietary Habits", "Gender", "Employment", "Suicidal thoughts", "Depression", "Family illness"]

for df in [df_train, df_test]:
    if 'Name' in df.columns:
        df.drop(["Name"], axis=1, inplace=True)
    
    for col in columns:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip().str.lower()

columns_to_clean = ['City', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree']

for col in columns_to_clean:
    value_counts = df_train[col].value_counts()
    rare_values = value_counts[value_counts < 13].index

    for df in [df_train, df_test]:
        df[col] = df[col].apply(lambda x: np.nan if x in rare_values else x)

## Data Type Transformation

In [10]:
for df in [df_train, df_test]:
    num_cols = df.select_dtypes(include=['int64', 'int32', 'float64']).columns

    for col in num_cols:
        if col not in ["CGPA", "id"]:
            col_min = df[col].min()
            col_max = df[col].max()

            if -32768 <= col_min and col_max <= 32767:
                if df[col].dtype == 'float64':
                    df[col] = df[col].round().astype("Int16")
                else:
                    df[col] = df[col].astype("Int16")

## Descretization

In [11]:
sleep_map = {
    "7-8 hours": 1,
    "more than 8 hours": 2,
    "5-6 hours": 3,
    "less than 5 hours": 4
}

diet_map = {
    "healthy": 1,
    "moderate": 2,
    "unhealthy": 3
}

gender_map = {"male": 0, "female": 1}
employment_map = {"working professional": 1, "student": 0}
yes_no_map = {"no": 0, "yes": 1}

WS_hours = {
    0: 1,
    1: 2, 2: 2, 3: 2, 4: 3, 5: 3, 6: 3, 
    7: 4, 8: 4, 9: 4, 10: 5, 11: 5, 12: 5
}

education_levels = {
    "Class 12": 1,
    "BA": 2, "BSc": 2, "B.Com": 2, "B.Ed": 2, "BCA": 2, "B.Arch": 2, 
    "BBA": 2, "BHM": 2, "B.Pharm": 2, "BE": 2, "B.Tech": 2,
    "MA": 3, "MSc": 3, "M.Com": 3, "M.Ed": 3, "MCA": 3,
    "M.Tech": 3, "MBA": 3, "M.Pharm": 3, "ME": 3, "MHM": 3,
    "MD": 4, "LLB": 4, "LLM": 4, "MBBS": 4, "PhD": 4
}

for df in [df_train, df_test]:
    if "Sleep Duration" in df.columns:
        df["Sleep Duration"] = df["Sleep Duration"].map(sleep_map).astype("Int16")
    
    if "Dietary Habits" in df.columns:
        df["Dietary Habits"] = df["Dietary Habits"].map(diet_map).astype("Int16")
    
    if "Gender" in df.columns:
        df["Gender"] = df["Gender"].map(gender_map)
    
    if "Employment" in df.columns:
        df["Employment"] = df["Employment"].map(employment_map)
    
    if "Suicidal thoughts" in df.columns:
        df["Suicidal thoughts"] = df["Suicidal thoughts"].map(yes_no_map)
    
    if "Family illness" in df.columns:
        df["Family illness"] = df["Family illness"].map(yes_no_map)


for df in [df_train, df_test]:
    if "Degree" in df.columns:
        df["Degree"] = df["Degree"].map(education_levels).astype("Int16")

def assign_class(grade):
    if grade == 0:
        return 0
    elif 5.00 <= grade <= 5.99:
        return 1
    elif 6.00 <= grade <= 6.99:
        return 2
    elif 7.00 <= grade <= 7.99:
        return 3
    elif 8.00 <= grade <= 8.99:
        return 4
    elif 9.00 <= grade <= 10.00:
        return 5
    else:
        return None  


for df in [df_train, df_test]:
    
    if "WS hours" in df.columns:
        df["WS hours"] = df["WS hours"].map(WS_hours)
    
    if "CGPA" in df.columns:
        df["CGPA"] = df["CGPA"].fillna(0)
        df["CGPA"] = df["CGPA"].apply(assign_class)

## Cheking Record of The Features

In [12]:
df_train["Depression"].value_counts()

Depression
0    115133
1     25567
Name: count, dtype: int64

In [13]:
df_train["Depression"].value_counts(normalize = True)

Depression
0    0.818287
1    0.181713
Name: proportion, dtype: float64

In [14]:
print("There are {} duplicates in the dataset.".format(df_train.duplicated().sum()))

There are 0 duplicates in the dataset.


In [15]:
print("Checking for missing values in each column:")
print(df_train.isnull().sum())

Checking for missing values in each column:
id                         0
Gender                     0
Age                        0
City                      98
Employment                 0
Profession             36680
Academic Pressure     112803
Work Pressure          27918
CGPA                       0
Study Satisfaction    112803
Job Satisfaction       27910
Sleep Duration            79
Dietary Habits            27
Degree                   116
Suicidal thoughts          0
WS hours                   0
Financial Stress           4
Family illness             0
Depression                 0
dtype: int64


In [16]:
test_ids = df_test['id']

df_train = df_train.drop(['id'], axis=1)
df_test = df_test.drop(['id'], axis=1)

target_column = 'Depression'

categorical_columns = df_train.select_dtypes(include=['object']).columns
numerical_columns = df_train.select_dtypes(exclude=['object']).columns

print("Target Column:", target_column)
print("\nCategorical Columns:", categorical_columns.tolist())
print("\nNumerical Columns:", numerical_columns.tolist())

Target Column: Depression

Categorical Columns: ['City', 'Profession', 'Depression']

Numerical Columns: ['Gender', 'Age', 'Employment', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Suicidal thoughts', 'WS hours', 'Financial Stress', 'Family illness']


In [17]:
for col in df_train.columns:
    print(f"Columns: {col}")
    print(df_train[col].unique())
    print("-" * 33) 

Columns: Gender
[1 0]
---------------------------------
Columns: Age
<IntegerArray>
[49, 26, 33, 22, 30, 59, 47, 38, 24, 42, 55, 51, 39, 29, 50, 23, 56, 45, 37,
 46, 31, 19, 28, 25, 41, 60, 18, 36, 21, 58, 44, 43, 40, 35, 54, 27, 52, 48,
 57, 53, 34, 20, 32]
Length: 43, dtype: Int16
---------------------------------
Columns: City
['Ludhiana' 'Varanasi' 'Visakhapatnam' 'Mumbai' 'Kanpur' 'Ahmedabad'
 'Thane' 'Nashik' 'Bangalore' 'Patna' 'Rajkot' 'Jaipur' 'Pune' 'Lucknow'
 'Meerut' 'Agra' 'Surat' 'Faridabad' 'Hyderabad' 'Srinagar' 'Ghaziabad'
 'Kolkata' 'Chennai' 'Kalyan' 'Nagpur' 'Vadodara' 'Vasai-Virar' 'Delhi'
 'Bhopal' 'Indore' nan]
---------------------------------
Columns: Employment
[1 0]
---------------------------------
Columns: Profession
['Chef' 'Teacher' nan 'Business Analyst' 'Finanancial Analyst' 'Chemist'
 'Electrician' 'Software Engineer' 'Data Scientist' 'Plumber'
 'Marketing Manager' 'Accountant' 'Entrepreneur' 'HR Manager'
 'UX/UI Designer' 'Content Writer' 'Educational

In [18]:
for column in categorical_columns:
    print(f"\nValue counts in '{column}':\n{df_train[column].value_counts().head(50)}")


Value counts in 'City':
City
Kalyan           6591
Patna            5924
Vasai-Virar      5765
Kolkata          5689
Ahmedabad        5613
Meerut           5528
Ludhiana         5226
Pune             5210
Rajkot           5207
Visakhapatnam    5176
Srinagar         5074
Mumbai           4966
Indore           4872
Agra             4684
Surat            4636
Varanasi         4606
Vadodara         4568
Hyderabad        4496
Kanpur           4398
Jaipur           4328
Thane            4289
Lucknow          4280
Nagpur           4209
Bangalore        4123
Chennai          4044
Ghaziabad        3620
Delhi            3593
Bhopal           3475
Faridabad        3268
Nashik           3144
Name: count, dtype: int64

Value counts in 'Profession':
Profession
Teacher                   24906
Content Writer             7814
Architect                  4370
Consultant                 4229
HR Manager                 4022
Pharmacist                 3893
Doctor                     3255
Business Analyst  

## Anomaly & Missing Handling

In [19]:
print("Checking for missing values in each column:")
print(df_train.isnull().sum())

Checking for missing values in each column:
Gender                     0
Age                        0
City                      98
Employment                 0
Profession             36680
Academic Pressure     112803
Work Pressure          27918
CGPA                       0
Study Satisfaction    112803
Job Satisfaction       27910
Sleep Duration            79
Dietary Habits            27
Degree                   116
Suicidal thoughts          0
WS hours                   0
Financial Stress           4
Family illness             0
Depression                 0
dtype: int64


In [20]:
print("Checking for missing values in each column:")
print(df_test.isnull().sum())

Checking for missing values in each column:
Gender                    0
Age                       0
City                     20
Employment                0
Profession            24656
Academic Pressure     75033
Work Pressure         18778
CGPA                      0
Study Satisfaction    75033
Job Satisfaction      18774
Sleep Duration           54
Dietary Habits           30
Degree                   86
Suicidal thoughts         0
WS hours                  0
Financial Stress          0
Family illness            0
dtype: int64


In [21]:
students_with_profession = df_train[(df_train["Employment"] == 0) & (df_train["Profession"].notna())]
print("Number of students who have a profession:", len(students_with_profession))

Number of students who have a profession: 31


In [22]:
students_with_profession = df_test[(df_test["Employment"] == 0) & (df_test["Profession"].notna())]
print("Number of students who have a profession:", len(students_with_profession))


Number of students who have a profession: 24


In [23]:
for df in [df_train, df_test]:
    mask = (df["Employment"] == 0) & (df["Profession"].notna())
    df.loc[mask, "Profession"] = np.nan

In [24]:
# i label encode this for 2 dataset for reason 
# in missing handling part you will see
professions = [
    "Teacher", "Content Writer", "Architect", "Consultant", "Pharmacist", "HR Manager",
    "Doctor", "Business Analyst", "Chemist", "Entrepreneur", "Chef", "Educational Consultant",
    "Data Scientist", "Lawyer", "Researcher", "Pilot", "Customer Support", "Marketing Manager",
    "Judge", "Travel Consultant", "Manager", "Sales Executive", "Plumber", "Electrician",
    "Financial Analyst", "Software Engineer", "Digital Marketer", "Civil Engineer",
    "UX/UI Designer", "Finanancial Analyst", "Accountant", "Mechanical Engineer",
    "Graphic Designer", "Research Analyst", "Investment Banker"
]

profession_mapping = {profession: idx for idx, profession in enumerate(professions)}

for df in [df_train, df_test]:
    df["Profession"] = df["Profession"].map(profession_mapping)
print("Jobs count")
for job, code in profession_mapping.items():
    print(f"{job}: {code}")

Jobs count
Teacher: 0
Content Writer: 1
Architect: 2
Consultant: 3
Pharmacist: 4
HR Manager: 5
Doctor: 6
Business Analyst: 7
Chemist: 8
Entrepreneur: 9
Chef: 10
Educational Consultant: 11
Data Scientist: 12
Lawyer: 13
Researcher: 14
Pilot: 15
Customer Support: 16
Marketing Manager: 17
Judge: 18
Travel Consultant: 19
Manager: 20
Sales Executive: 21
Plumber: 22
Electrician: 23
Financial Analyst: 24
Software Engineer: 25
Digital Marketer: 26
Civil Engineer: 27
UX/UI Designer: 28
Finanancial Analyst: 29
Accountant: 30
Mechanical Engineer: 31
Graphic Designer: 32
Research Analyst: 33
Investment Banker: 34


In [25]:
for df in [df_train, df_test]:
    df["Profession"] = df["Profession"].astype("object")

    df.loc[(df["Employment"] == 0) & (df["Profession"].isna()), "Profession"] = 35

    df.loc[df["Employment"] == 1, ["Academic Pressure", "Study Satisfaction"]] = 0 

    df.loc[df["Employment"] == 0, ["Work Pressure", "Job Satisfaction"]] = 0

In [26]:
print("Checking for missing values in each column:")
print(df_train.isnull().sum())

Checking for missing values in each column:
Gender                   0
Age                      0
City                    98
Employment               0
Profession            8810
Academic Pressure        9
Work Pressure           20
CGPA                     0
Study Satisfaction      10
Job Satisfaction        17
Sleep Duration          79
Dietary Habits          27
Degree                 116
Suicidal thoughts        0
WS hours                 0
Financial Stress         4
Family illness           0
Depression               0
dtype: int64


In [27]:
print("Checking for missing values in each column:")
print(df_test.isnull().sum())

Checking for missing values in each column:
Gender                   0
Age                      0
City                    20
Employment               0
Profession            5926
Academic Pressure        7
Work Pressure           10
CGPA                     0
Study Satisfaction       8
Job Satisfaction         9
Sleep Duration          54
Dietary Habits          30
Degree                  86
Suicidal thoughts        0
WS hours                 0
Financial Stress         0
Family illness           0
dtype: int64


* People who have job but they didnt enter their profession in train dataset

In [28]:
df_train.loc[df_train["Profession"].isna(), ["Employment"]].value_counts()

Employment
1             8810
Name: count, dtype: int64

* People who have job but they didnt enter their profession in test dataset

In [29]:
df_test.loc[df_test["Profession"].isna(), ["Employment"]].value_counts()

Employment
1             5926
Name: count, dtype: int64

In [30]:
np.random.seed(369)
random.seed(369)

for df in [df_train, df_test]:
    df["Profession"] = df["Profession"].astype("Int16")

    probabilities = df.loc[(df['Profession'].notna()) & (df['Profession'] != 35), 'Profession'].value_counts(normalize=True)

    missing_idx = df[df['Profession'].isna()].index

    df.loc[missing_idx, 'Profession'] = np.random.choice(
        probabilities.index, size=len(missing_idx), p=probabilities.values
    )

    df["Profession"] = df["Profession"].astype("Int16")

In [31]:
print("Checking for missing values in each column:")
print(df_train.isnull().sum())
print(df_test.isnull().sum())

Checking for missing values in each column:
Gender                  0
Age                     0
City                   98
Employment              0
Profession              0
Academic Pressure       9
Work Pressure          20
CGPA                    0
Study Satisfaction     10
Job Satisfaction       17
Sleep Duration         79
Dietary Habits         27
Degree                116
Suicidal thoughts       0
WS hours                0
Financial Stress        4
Family illness          0
Depression              0
dtype: int64
Gender                 0
Age                    0
City                  20
Employment             0
Profession             0
Academic Pressure      7
Work Pressure         10
CGPA                   0
Study Satisfaction     8
Job Satisfaction       9
Sleep Duration        54
Dietary Habits        30
Degree                86
Suicidal thoughts      0
WS hours               0
Financial Stress       0
Family illness         0
dtype: int64


In [32]:
columns_to_fill = [
    "City", 
    "Work Pressure", 
    "Study Satisfaction", 
    "Job Satisfaction", 
    "Sleep Duration", 
    "Dietary Habits", 
    "Degree",
    "Financial Stress",
    "Academic Pressure"
]

for df in [df_train, df_test]:
    for col in columns_to_fill:
        if col in df.columns:
            mode_val = df[col].mode(dropna=True)
            if not mode_val.empty:
                df[col].fillna(mode_val[0], inplace=True)

In [33]:
print("Checking for missing values in each column:")
print(df_train.isnull().sum())
print(df_test.isnull().sum())

Checking for missing values in each column:
Gender                0
Age                   0
City                  0
Employment            0
Profession            0
Academic Pressure     0
Work Pressure         0
CGPA                  0
Study Satisfaction    0
Job Satisfaction      0
Sleep Duration        0
Dietary Habits        0
Degree                0
Suicidal thoughts     0
WS hours              0
Financial Stress      0
Family illness        0
Depression            0
dtype: int64
Gender                0
Age                   0
City                  0
Employment            0
Profession            0
Academic Pressure     0
Work Pressure         0
CGPA                  0
Study Satisfaction    0
Job Satisfaction      0
Sleep Duration        0
Dietary Habits        0
Degree                0
Suicidal thoughts     0
WS hours              0
Financial Stress      0
Family illness        0
dtype: int64


## Pair plot before modeling

In [34]:
df_train.head()

Unnamed: 0,Gender,Age,City,Employment,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Suicidal thoughts,WS hours,Financial Stress,Family illness,Depression
0,1,49,Ludhiana,1,10,0,5,0,0,2,2,1,2,0,2,2,0,0
1,0,26,Varanasi,1,0,0,4,0,0,3,4,3,4,1,4,3,0,1
2,0,33,Visakhapatnam,0,35,5,0,4,2,0,3,1,2,1,2,1,0,1
3,0,22,Mumbai,1,0,0,5,0,0,1,4,2,2,1,5,1,1,1
4,1,30,Kanpur,1,7,0,1,0,0,1,3,3,2,1,4,4,1,0


In [35]:
# df_train.drop(["City"], axis = 1, inplace = True)

In [36]:
# plt.figure(figsize = (12,10), dpi = 80)

# corr = df_train.corr()

# sns.heatmap(corr, cmap = "coolwarm",
#            annot = True, fmt=".2f")

# plt.title("Correlogram of patient", fontsize = 22)
# plt.xticks(fontsize = 12)
# plt.yticks(fontsize = 12)
# plt.show()

## Encoding

In [37]:
encoder = TargetEncoder(cols=['City', 'Profession'])

y_train = df_train['Depression'].astype(float)

df_train[['City_encoded', 'Profession_encoded']] = encoder.fit_transform(
    df_train[['City', 'Profession']], 
    y_train
)[['City', 'Profession']]

df_test[['City_encoded', 'Profession_encoded']] = encoder.transform(
    df_test[['City', 'Profession']]
)[['City', 'Profession']]

df_train = df_train.drop(['City', 'Profession'], axis=1)
df_test = df_test.drop(['City', 'Profession'], axis=1)

## Train Test

In [38]:
X_train = df_train.drop('Depression', axis=1)
y_train = df_train['Depression']
X_test = df_test.copy()

In [39]:
numerical_columns = X_train.select_dtypes(include=['float64', 'int64', 'int16']).columns.tolist()

In [40]:
numerical_columns

['Gender',
 'Age',
 'Employment',
 'Academic Pressure',
 'Work Pressure',
 'CGPA',
 'Study Satisfaction',
 'Job Satisfaction',
 'Sleep Duration',
 'Dietary Habits',
 'Degree',
 'Suicidal thoughts',
 'WS hours',
 'Financial Stress',
 'Family illness',
 'City_encoded',
 'Profession_encoded']

In [41]:
binary_cols = ['Gender', 'Employment', 'Suicidal thoughts', 'Family illness']

scale_cols = [c for c in X_train.columns if c not in binary_cols]

scaler = StandardScaler()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

# Modeling

## Ensemble

In [None]:
xgb_params = {
     'learning_rate': 0.298913248058474, 
     'max_depth': 9, 
     'min_child_weight': 3, 
     'n_estimators': 673, 
     'subsample': 0.5933970249700855, 
     'gamma': 2.597137534750985, 
     'reg_lambda': 0.11328048420927406, 
     'colsample_bytree': 0.1381203919800721
}

catboost_params = {
    'iterations': 145, 
    'depth': 7, 
    'learning_rate': 0.29930179265937246, 
    'l2_leaf_reg': 1.242352421942431, 
    'random_strength': 8.325681754379957, 
    'bagging_temperature': 0.7869848919618048, 
    'border_count': 139
}

hgb_params = {
    'learning_rate': 0.16299202834206894, 
    'max_iter': 250, 
    'max_depth': 4, 
    'l2_regularization': 7.1826466833939895,
    'early_stopping': True
}

xgb_model = XGBClassifier(**xgb_params, use_label_encoder=False, random_state=369)
catboost_model = CatBoostClassifier(**catboost_params, task_type="GPU", random_state=369, verbose=0)
hgb_model = HistGradientBoostingClassifier(**hgb_params, random_state=369)

stacking_ensemble = StackingClassifier(
    estimators=[
        ('catboost', catboost_model),
        ('xgb', xgb_model),
        ('hgb', hgb_model)
    ],
    final_estimator=LogisticRegression(),
    passthrough=False
)

scoring = make_scorer(accuracy_score)

cv_scores = cross_val_score(stacking_ensemble, X_train, y_train, cv=5, scoring=scoring)

print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation of CV Accuracy: {cv_scores.std():.4f}")

stacking_ensemble.fit(X_train, y_train)

y_hat_test = stacking_ensemble.predict(X_test)

submission = pd.DataFrame({'id': test_ids,
                       'class': y_hat_test})

submission.to_csv('/kaggle/working/submission.csv', index=False)

submission.head()

## Random Forest

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
# import pandas as pd

# RF_model = RandomForestClassifier(
#     random_state=369,
#     class_weight='balanced'
# )

# param_grid_rf = {
#     'n_estimators': [500],
#     'criterion': ['entropy'],
#     'max_depth': [9],
#     'min_samples_split': [6],
#     'min_samples_leaf': [5],
#     'max_features': [None],
#     'max_leaf_nodes': [80]
# }

# grid_search_rf = GridSearchCV(
#     estimator=RF_model,
#     param_grid=param_grid_rf,
#     cv=5,
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=1
# )

# grid_search_rf.fit(X_train, y_train)

# y_hat_test = grid_search_rf.predict(X_test)

# submission = pd.DataFrame({
#     'id': test_ids,
#     'class': y_hat_test
# })

# submission.to_csv('submission4.csv', index=False)
# print(submission.head())

## Solo XGB

In [None]:
y_train = y_train.astype(int)

xgb_model = XGBClassifier(
    n_jobs = 1,
    objective = 'binary:logistic',
    eval_metric = 'logloss',
    random_state = 369, 
    reg_lambda = 1.0,
    reg_alpha = 0.2,
    max_depth = 3,
    min_child_weight = 7,
    subsample = 0.8,
    colsample_bytree = 0.8,
    learning_rate = 0.05,
    n_estimators = 500
)

xgb_model.fit(X_train, y_train, verbose = False)

y_hat_test = xgb_model.predict(X_test)

submission = pd.DataFrame({
    'id': test_ids,
    'class': y_hat_test
})

submission.to_csv('submission2.csv', index=False)
print(submission.head())

## Logistic Regression

In [None]:
# logreg = LogisticRegression(max_iter = 500, random_state = 369, class_weight = 'balanced')

# param_grid = {
#     'C': [500, 100],
#     'penalty': ['elasticnet'],
#     'solver': ['saga'],
#     'l1_ratio': [1.0]
# }

# grid_logreg = GridSearchCV(
#     estimator = logreg,
#     param_grid = param_grid,
#     cv = 5,
#     scoring = 'accuracy',
#     n_jobs = -1,
#     verbose = 1
# )

# grid_logreg.fit(X_train, y_train)

# print("Best Parameters:", grid_logreg.best_params_)
# print("Best CV Score:", grid_logreg.best_score_)

# y_hat_test = grid_logreg.predict(X_test)

# submission = pd.DataFrame({
#     'id': test_ids,
#     'class': y_hat_test
# })

# submission.to_csv('submission6.csv', index=False)
# print(submission.head())

## SVM

In [None]:
# svm = SVC(probability = True, random_state = 369, class_weight = 'balanced')

# param_grid_svm = {
#     'kernel': ['linear'],
#     'C': [10],
#     'gamma': ['scale']                
# }

# grid_search_svm = GridSearchCV(
#     estimator = svm,
#     param_grid = param_grid_svm,
#     cv = 10,
#     scoring = 'accuracy',
#     n_jobs = -1,
#     verbose = 1
# )

# grid_search_svm.fit(X_train, y_train)

# print("Best Parameters:", grid_search_svm.best_params_)
# print("Best CV Score:", grid_search_svm.best_score_)

# y_hat_test = grid_search_svm.predict(X_test)

# submission = pd.DataFrame({
#     'id': test_ids,
#     'class': y_hat_test
# })

# submission.to_csv('submission7.csv', index=False)
# print(submission.head())

## MLP

In [None]:
# mlp = MLPClassifier(
#     max_iter = 300, 
#     early_stopping = True, 
#     random_state = 369
#     )

# param_grid_mlp = {
#     'hidden_layer_sizes': [(32, 16)],
#     'activation': ['relu'],
#     'solver': ['adam'],
#     'alpha': [0.01],
#     'learning_rate': ['constant'],
#     'learning_rate_init': [0.001],
#     'tol': [1e-3],
#     'batch_size': [32]
# }

# grid_search_mlp = GridSearchCV(
#     estimator = mlp,
#     param_grid = param_grid_mlp,
#     cv = 10,
#     scoring = 'accuracy',
#     n_jobs = -1,
#     verbose = 1
# )

# grid_search_mlp.fit(X_train, y_train)

# print("Best Parameters:", grid_search_mlp.best_params_)
# print("Best CV Score:", grid_search_mlp.best_score_)

# y_hat_test = grid_search_mlp.predict(X_test)

# submission = pd.DataFrame({
#     'id': test_ids,
#     'class': y_hat_test
# })

# submission.to_csv('submission8.csv', index=False)
# print(submission.head())