In [1]:
import pandas as pd
from catboost.utils import eval_metric
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import root_mean_squared_error

In [2]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier, Booster

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [4]:
import numpy as np

In [5]:
import optuna

In [6]:
import random

In [7]:
import pickle
from joblib import dump, load

In [8]:
train_df = pd.read_csv("data/train_df_featured.csv")

In [9]:
train_df.head()

Unnamed: 0,Name,Gender,Age,City,Working Professional or Student,Profession,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Pressure,Study/Job Satisfaction,Age group,Field of Study,Qualification
0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0,5.0,2.0,middle aged,Hospitality,Bachelor
1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1,4.0,3.0,adult,Law,Bachelor
2,Yuvraj,Male,33.0,Visakhapatnam,Student,Student,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1,5.0,2.0,adult,Pharmacy,Bachelor
3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1,5.0,1.0,adult,Business Administration,Bachelor
4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0,1.0,1.0,adult,Business Administration,Bachelor


In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140653 entries, 0 to 140652
Data columns (total 19 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   Name                                   140653 non-null  object 
 1   Gender                                 140653 non-null  object 
 2   Age                                    140653 non-null  float64
 3   City                                   140653 non-null  object 
 4   Working Professional or Student        140653 non-null  object 
 5   Profession                             140653 non-null  object 
 6   Sleep Duration                         140653 non-null  object 
 7   Dietary Habits                         140653 non-null  object 
 8   Degree                                 140653 non-null  object 
 9   Have you ever had suicidal thoughts ?  140653 non-null  object 
 10  Work/Study Hours                       140653 non-null  

In [11]:
X = train_df.drop(columns=["Depression"])
y = train_df["Depression"]

In [12]:
models = {}

In [13]:
def encode_cat(df, is_train=True, ohe=False):
    df = df.copy(deep=True)
    col_to_le = [
        "Name",
        "City",
        "Profession",
        "Sleep Duration",
        "Degree",
        "Field of Study"
    ]
    le = LabelEncoder()
    df[col_to_le] = df[col_to_le].apply(le.fit_transform)
    col_to_ohe = list(df.select_dtypes(include=["object"]))
    if ohe is False:
        ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        data_encoded = ohe.fit_transform(df[col_to_ohe])
    else:
        data_encoded = ohe.transform(df[col_to_ohe])
    df_encoded = pd.DataFrame(
        data_encoded,
        columns=ohe.get_feature_names_out(col_to_ohe)
    )
    df_res = pd.concat([
        df.drop(columns=col_to_ohe).reset_index(drop=True),
        df_encoded.reset_index(drop=True),
    ], axis=1)
    if is_train is True:
        return df_res, ohe
    else:
        return df_res


In [19]:
X_enc, ohe = encode_cat(X)
X_enc.head()

Unnamed: 0,Name,Age,City,Profession,Work/Study Hours,Financial Stress,Pressure,Study/Job Satisfaction,Gender_Female,Gender_Male,...,Field of Study_Not Stated,Field of Study_Pharmacy,Field of Study_Science,Field of Study_Specialist,Field of Study_Technology,Qualification_Bachelor,Qualification_High School,Qualification_Masters,Qualification_Not Stated,Qualification_PhD
0,11,49.0,50,10,1.0,2.0,5.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,407,26.0,93,56,7.0,3.0,4.0,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,417,33.0,97,55,3.0,1.0,5.0,2.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,417,22.0,64,56,10.0,1.0,5.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,286,30.0,37,9,9.0,4.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(X, y, test_size=0.2,
                                                    random_state=random.randint(1,100))

cat_features = [X_train_cb.columns.get_loc(col) for col in X_train_cb.select_dtypes(include='object')]
def objective_cb(trial):
    params = {
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Poisson"]),
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.000001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0, 2),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.05, 10.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        # "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        #"border_count": trial.suggest_int("border_count", 1, 255),
    }


    model = CatBoostClassifier(**params, task_type="GPU", silent=True)
    model.fit(X_train_cb, y_train_cb,
              cat_features=cat_features,
              )
    predictions = model.predict(X_test_cb)
    rmse = root_mean_squared_error(y_test_cb, predictions)
    return rmse

In [17]:
study = optuna.create_study(direction='minimize')
study.optimize(objective_cb, n_trials=100)
best_hyperparameters_cat = study.best_params

[I 2024-11-21 18:21:53,571] A new study created in memory with name: no-name-1fc729d1-23b0-4c85-bb8b-1011940ef2d7
[I 2024-11-21 18:22:44,228] Trial 0 finished with value: 0.4264128008393195 and parameters: {'bootstrap_type': 'Poisson', 'iterations': 483, 'learning_rate': 1.9176126045160765e-06, 'depth': 8, 'l2_leaf_reg': 0.39146609322905745, 'subsample': 0.11080992008042814, 'bagging_temperature': 8.592845496160727, 'min_data_in_leaf': 23}. Best is trial 0 with value: 0.4264128008393195.
[I 2024-11-21 18:22:58,896] Trial 1 finished with value: 0.4264128008393195 and parameters: {'bootstrap_type': 'Poisson', 'iterations': 411, 'learning_rate': 1.1140604039903414e-05, 'depth': 2, 'l2_leaf_reg': 0.21364611467594052, 'subsample': 0.35363619461961354, 'bagging_temperature': 2.0860588795184776, 'min_data_in_leaf': 42}. Best is trial 0 with value: 0.4264128008393195.
[I 2024-11-21 18:24:10,877] Trial 2 finished with value: 0.29092906575985084 and parameters: {'bootstrap_type': 'Poisson', 'ite

In [18]:
cbc = CatBoostClassifier(**best_hyperparameters_cat, task_type="GPU",
                         cat_features=cat_features
                         )
cbc.fit(X_train_cb, y_train_cb)
models['cbc'] = cbc
pred = cbc.predict(X_test_cb)
acc_sc = accuracy_score(y_test_cb, pred)
print(f"Catboost accuracy score: {acc_sc}")

0:	learn: 0.6045157	total: 71.3ms	remaining: 53.3s
1:	learn: 0.5336875	total: 141ms	remaining: 52.6s
2:	learn: 0.4728357	total: 214ms	remaining: 53.1s
3:	learn: 0.4230587	total: 285ms	remaining: 53s
4:	learn: 0.3832986	total: 372ms	remaining: 55.4s
5:	learn: 0.3510324	total: 481ms	remaining: 59.6s
6:	learn: 0.3235559	total: 575ms	remaining: 1m
7:	learn: 0.2998652	total: 657ms	remaining: 1m
8:	learn: 0.2806433	total: 727ms	remaining: 59.8s
9:	learn: 0.2637112	total: 796ms	remaining: 58.8s
10:	learn: 0.2492957	total: 872ms	remaining: 58.5s
11:	learn: 0.2383468	total: 950ms	remaining: 58.3s
12:	learn: 0.2286535	total: 1.04s	remaining: 58.7s
13:	learn: 0.2202275	total: 1.12s	remaining: 58.6s
14:	learn: 0.2131321	total: 1.21s	remaining: 59.2s
15:	learn: 0.2070015	total: 1.3s	remaining: 59.6s
16:	learn: 0.2015989	total: 1.4s	remaining: 1m
17:	learn: 0.1969521	total: 1.48s	remaining: 60s
18:	learn: 0.1929183	total: 1.55s	remaining: 59.5s
19:	learn: 0.1895452	total: 1.65s	remaining: 1m
20:	lea

In [22]:
cbc.save_model("models/catboost_after_night.cbm", format="cbm")

In [29]:
cbc.save_model("models/catboost.cbm", format="cbm")

In [30]:
cbc = CatBoostClassifier().load_model("models/catboost_after_night.cbm")
models['cbc'] = cbc
cbc

<catboost.core.CatBoostClassifier at 0x2028b26dd60>

In [20]:
def objective_xgb(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10),
        'objective': 'binary:logistic',  # Для бинарной классификации
        'tree_method': 'hist'  # Можно заменить на 'gpu_hist' для ускорения на GPU
    }
    accuracy_mean = []
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random.randint(1, 100))
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_enc, y)):
        xgb = XGBClassifier(**params)
        X_train_st, X_val_st = X_enc.iloc[train_idx], X_enc.iloc[val_idx]
        y_train_st, y_val_st = y.iloc[train_idx], y.iloc[val_idx]
        xgb.fit(X_train_st, y_train_st)
        pred = xgb.predict(X_val_st)
        accuracy_mean.append(accuracy_score(pred, y_val_st))
        print(f"Fold {fold} done!")
    acc = sum(accuracy_mean) / len(accuracy_mean)
    return acc

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_xgb, n_trials=30)
best_hyperparameters_xgb = study.best_params

In [24]:
xgb = XGBClassifier(**best_hyperparameters_xgb)
xgb.fit(X_enc, y)

In [None]:
xgb = XGBClassifier(**best_hyperparameters_xgb)
xgb.fit(X_train_xgb, y_train_xgb)
models["xgb"] = xgb
pred = xgb.predict(X_test_xgb)
acc_sc = accuracy_score(y_test_xgb, pred)
print(f"XGBoost accuracy score: {acc_sc}")

In [25]:
xgb.save_model("models/xgboost.json")
dump(ohe, "models/ohe.joblib")

['models/ohe.joblib']

In [19]:
xgb = XGBClassifier()
xgb.load_model("models/xgboost.json")
models["xgb"] = xgb
ohe = OneHotEncoder()
ohe = load("models/ohe.joblib")

TypeError: file must have 'read' and 'readline' attributes

In [26]:
def objective_lgbm(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 10.0),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.0, 1.0),
        'objective': 'binary',  # Для бинарной классификации
        'boosting_type': 'gbdt'  # Используем градиентный бустинг
    }

    accuracy_mean = []
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random.randint(1, 100))
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_enc, y)):
        lgbm = LGBMClassifier(**params)
        X_train_st, X_val_st = X_enc.iloc[train_idx], X_enc.iloc[val_idx]
        y_train_st, y_val_st = y.iloc[train_idx], y.iloc[val_idx]
        lgbm.fit(X_train_st, y_train_st)
        pred = lgbm.predict(X_val_st)
        accuracy_mean.append(accuracy_score(pred, y_val_st))
        print(f"Fold {fold} done!")
    acc = sum(accuracy_mean) / len(accuracy_mean)
    return acc

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_lgbm, n_trials=50)
best_hyperparameters_lgbm = study.best_params

In [28]:
lgbm = LGBMClassifier(**best_hyperparameters_lgbm)
lgbm.fit(X_enc, y)

[LightGBM] [Info] Number of positive: 25544, number of negative: 115109
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008285 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 511
[LightGBM] [Info] Number of data points in the train set: 140653, number of used features: 74
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181610 -> initscore=-1.505477
[LightGBM] [Info] Start training from score -1.505477


In [25]:
lgbm = LGBMClassifier(**best_hyperparameters_lgbm)
lgbm.fit(X_train_lgbm, y_train_lgbm)
models["lgbm"] = lgbm
pred = lgbm.predict(X_test_lgbm)
acc_sc = accuracy_score(y_test_lgbm, pred)
print(f"LightGBM accuracy score: {acc_sc}")

LightGBM accuracy score: 0.9388574881802993


In [None]:
lgbm.booster_.save_model("models/lgbm.txt")

In [12]:
lgbm = Booster(model_file="models/lgbm.txt")
models["lgbm"] = lgbm

In [31]:
valid_df = pd.read_csv('data/test_df_featured.csv')
X_valid = valid_df.drop(columns=["id"])
X_valid_enc = encode_cat(X_valid, is_train=False, ohe=ohe)
y_pred = pd.DataFrame()

In [33]:
y_pred["cb"] = cbc.predict(X_valid)
y_pred["xgb"] = xgb.predict(X_valid_enc)
y_pred["lgbm"] = lgbm.predict(X_valid_enc)




Unnamed: 0,cb,xgb,lgbm
0,0,0,0
1,0,0,0
2,0,0,0
3,1,1,1
4,0,0,0


In [41]:
y_pred["Depression"] = y_pred.mean(axis=1)
y_pred = y_pred[["Depression"]]
y_pred["Depression"] = y_pred["Depression"].astype(int)
y_pred.head()

Unnamed: 0,Depression
0,0
1,0
2,0
3,1
4,0


In [42]:
to_submission = pd.concat([valid_df["id"].reset_index(drop=True), y_pred], axis=1)
to_submission.to_csv("data/submission.csv", index=False)

In [29]:
y_pred["Depression"] = y_pred.mean(axis=1)
y_pred.head()

Unnamed: 0,cb_0,cb_1,cb_2,cb_3,cb_4,cb_5,cb_6,cb_7,cb_8,xgb_0,...,lgbm_0,lgbm_1,lgbm_2,lgbm_3,lgbm_4,lgbm_5,lgbm_6,lgbm_7,lgbm_8,Depression
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [31]:
y_pred["Depression"] = y_pred.mean(axis=1)
y_pred = y_pred[["Depression"]]
y_pred["Depression"] = y_pred["Depression"].apply(lambda x: 1 if x >= 0.5 else 0)
y_pred.head()

Unnamed: 0,Depression
0,0
1,0
2,0
3,1
4,0


In [32]:
to_submission = pd.concat([valid_df["id"].reset_index(drop=True), y_pred], axis=1)
to_submission.to_csv("data/submission.csv", index=False)

In [26]:
valid_df = pd.read_csv('data/test_df_featured.csv')
X_valid = valid_df.drop(columns=["id"])
X_valid_enc = encode_cat(X_valid, is_train=False, ohe=ohe)

In [13]:
X_valid.head()

Unnamed: 0,Name,Gender,Age,City,Working Professional or Student,Profession,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Pressure,Study/Job Satisfaction,Age group
0,Shivam,Male,53.0,Visakhapatnam,Working Professional,Judge,Less than 5 hours,Moderate,LLB,No,9.0,3.0,Yes,2.0,5.0,middle aged
1,Sanya,Female,58.0,Kolkata,Working Professional,Educational Consultant,Less than 5 hours,Moderate,B.Ed,No,6.0,4.0,No,2.0,4.0,middle aged
2,Yash,Male,53.0,Jaipur,Working Professional,Teacher,7-8 hours,Moderate,B.Arch,Yes,12.0,4.0,No,4.0,1.0,middle aged
3,Nalini,Female,23.0,Rajkot,Student,Student,More than 8 hours,Moderate,BSc,Yes,10.0,4.0,No,5.0,1.0,adult
4,Shaurya,Male,47.0,Kalyan,Working Professional,Teacher,7-8 hours,Moderate,BCA,Yes,3.0,4.0,No,5.0,5.0,middle aged


In [27]:
y_valid_cb = models["cbc"].predict(X_valid)
y_valid_xgb = models["xgb"].predict(X_valid_enc)
y_valid_lgbm = models["lgbm"].predict(X_valid_enc)

y_valid = [1 if sum(res) > 1 else 0 for res in zip(y_valid_cb, y_valid_xgb, y_valid_lgbm)]
y_valid = pd.Series(y_valid, name="Depression")

In [210]:
y_valid_cb_pr = models["cbc"].predict_proba(X_valid)[:,1]
y_valid_xgb_pr = models["xgb"].predict_proba(X_valid_enc)[:,1]
y_valid_lgbm_pr = models["lgbm"].predict_proba(X_valid_enc)[:,1]

y_valid_pr = (y_valid_cb_pr + y_valid_xgb_pr + y_valid_lgbm_pr) / 3
y_valid_binary = pd.Series((y_valid_pr > 0.5).astype(int), name="Depression")

y_valid_binary.describe()



count    93800.000000
mean         0.174733
std          0.379741
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: Depression, dtype: float64

In [37]:
to_submission = pd.concat([valid_df["id"].reset_index(drop=True), y_valid], axis=1)
to_submission.to_csv("data/submission.csv", index=False)

NameError: name 'y_valid' is not defined

In [212]:
to_submission_pr = pd.concat([valid_df["id"].reset_index(drop=True), pd.Series(y_valid_binary, name="Depression")], axis=1)
to_submission.to_csv("data/submission_binary.csv", index=False)