In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


# ---------------------------------------
# Data cleaning, mapping, and correction
# ---------------------------------------

def clean_df(df, is_train=True):
    # Drop Name if present
    if "Name" in df.columns:
        df = df.drop(columns=["Name"])
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].astype(str).str.strip()

    # CITY CORRECTIONS
    invalid_cities = set([
        "Ishanabad","Vidhi","Ayush","Krishna","Aishwarya","Keshav","Harsha","Nalini","Aditya","Malyansh",
        "Raghavendra","Saanvi","Bhavna","Nandini","Atharv","Pratyush","Mira","Mihir","Vidya","Anvi",
        "Krinda","Ayansh","Shrey","Ivaan","Vaanya","Gaurav","Harsh","Reyansh","Kashish","Kibara",
        "Vaishnavi","Chhavi","Parth","Mahi","Tushar","Rashi","Armaan","Aaradhya","Pooja","Khushi",
        "Jhanvi","M.Tech","M.Com","MCA","MSc","ME","City","3.0","No","Less Delhi","Less than 5 Kalyan",
        "Moreadhyay","Researcher","Kagan","Ithal","Galesabad","Itheg","Unirar",
        "Plata", "Ishkarsh", "Kashk", "Dhruv"
    ])
    city_corrections = {
        "Tolkata": "Kolkata",
        "Molkata": "Kolkata",
        "Khaziabad": "Ghaziabad",
        "Nalyan": "Kalyan"
    }
    if 'City' in df:
        df['City'] = df['City'].apply(lambda x: "Unknown" if x in invalid_cities else x)
        df['City'] = df['City'].replace(city_corrections)
        df = df.reset_index(drop=True)

    # PROFESSION CORRECTIONS
    invalid_professions = [
        "B.Com", "BE", "MBA", "LLM", "BCA", "BBA", "MBBS", "B.Ed", "M.Ed", "PhD",
        "Student", "Working Professional", "Academic", "Profession",
        "Yogesh", "Dev", "Pranav", "Yuvraj",
        "FamilyVirar", "City Manager", "Patna", "Nagpur",
        "Unveil", "Moderate",
        "Visakhapatnam"
    ]
    profession_corrections = {
        "Finanancial Analyst": "Financial Analyst",
        "Medical Doctor": "Doctor"
    }
    if 'Profession' in df:
        df['Profession'] = df['Profession'].apply(lambda x: "Unknown" if x in invalid_professions else x)
        df['Profession'] = df['Profession'].replace(profession_corrections)
        df = df.reset_index(drop=True)

    # SLEEP DURATION
    invalid_sleep = set([
        'Sleep_Duration', 'Indore', 'Pune', 'Moderate', 'Unhealthy', 'No',
        'Work_Study_Hours', '45', '49 hours', '35-36 hours', '55-66 hours',
        '9-6 hours', '10-6 hours', '9-5', '9-5 hours',
        '40-45 hours', '45-48 hours', '1-6 hours'
    ])
    sleep_corrections = {
        'than 5 hours': 'Less than 5 hours',
        '8 hours': '7-8 hours'
    }
    def standardize_sleep_duration(val):
        if pd.isna(val) or str(val).lower() == 'unknown':
            return 'Unknown'
        val = str(val).strip().lower()
        if val in ['less than 5 hours', '1-2 hours', '1-3 hours', '2-3 hours',
                   '3-4 hours', '3-6 hours', '4-5 hours', '4-6 hours']:
            return '<5 hours'
        elif val == '5-6 hours':
            return '5-6 hours'
        elif val == '6-7 hours':
            return '6-7 hours'
        elif val == '6-8 hours':
            return '6-8 hours'
        elif val == '7-8 hours':
            return '7-8 hours'
        elif val in ['8-9 hours', 'more than 8 hours']:
            return '8-9 hours'
        elif val in ['9-11 hours', '10-11 hours']:
            return '9-11 hours'
        else:
            return val
    if 'Sleep Duration' in df:
        df['Sleep Duration'] = df['Sleep Duration'].apply(
            lambda x: 'Unknown' if x in invalid_sleep or pd.isna(x) else x
        )
        df['Sleep Duration'] = df['Sleep Duration'].replace(sleep_corrections)
        df['Sleep Duration'] = df['Sleep Duration'].apply(standardize_sleep_duration)

    # DIETARY HABITS
    diet_corrections = {
        "No Healthy": "Unhealthy",
        "Less Healthy": "Unhealthy",
        "More Healthy": "Healthy",
        "Less than Healthy": "Unhealthy"
    }
    invalid_diet = [
        "Yes", "No", "Pratham", "Mihir", "BSc", "M.Tech", "Class 12",
        "Gender", "Male", "3", "1.0", "2", "Hormonal", "Electrician",
        "Vegas", "Indoor"
    ]
    if 'Dietary Habits' in df:
        df['Dietary Habits'] = df['Dietary Habits'].apply(
            lambda x: "Unknown" if x in invalid_diet else x
        )
        df['Dietary Habits'] = df['Dietary Habits'].replace(diet_corrections)

    # DEGREE
    degree_corrections = {
        'BEd': 'B.Ed',
        'MEd': 'M.Ed',
        'MTech': 'M.Tech',
        'M_Tech': 'M.Tech',
        'BArch': 'B.Arch',
        'B BA': 'BBA',
        'B B.Com': 'B.Com',
        'BSc': 'B.Sc',
        'MSc': 'M.Sc',
        'PhD': 'Ph.D',
        'MPharm': 'M.Pharm',
        'BPharm': 'B.Pharm',
        'LLCom': 'LL.Com',
        'LLBA': 'LL.B',
        'BCA': 'B.C.A',
        'MCA': 'M.C.A',
        'MBA': 'M.B.A'
    }
    invalid_degrees = [
        'Nalini','Veda','Bhopal','Degree','20','H_Pharm','M','P.Com',
        'Business Analyst','Data Scientist','Unite','HR Manager','Badhya',
        'S.Pharm','Vrinda','M. Business Analyst','Bhavesh','0','29','Vivaan',
        'BPA','Plumber','5.61','Brit','B.03','Ritik','5.56','B','7.06','ACA',
        'Brithika','CGPA','24','Pihu','BB','Jhanvi','Entrepreneur','8.56',
        'LHM','Lata','S.Arch','Marsh','HCA','5.88','B.Student','LL B.Ed',
        'M.S','Navya','Mahika','Mthanya','Working Professional','Esha',
        'LLS','LLEd','E.Tech','Doctor','N.Pharm','LCA','Mihir','Advait',
        'UX/UI Designer', 'BH', 'S.Tech', 'Kalyan', 
        'LLTech', 'Aarav', 'B.3.79', 'LL.Com', 'K.Ed'
    ]
    if 'Degree' in df:
        df['Degree'] = df['Degree'].replace(degree_corrections)
        df['Degree'] = df['Degree'].apply(lambda x: "Unknown" if x in invalid_degrees else x)
        df['Degree'] = df['Degree'].replace({'LL.B': 'LLB'})
        df['Degree'] = df['Degree'].str.replace('.', '', regex=False).str.strip().str.upper()

    # Fill whitespace
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].str.strip()

    # Drop 'id' if present
    if 'id' in df.columns:
        df = df.drop(columns=['id'])

    return df


def custom_impute(df):
    mask_student = (df['Working Professional or Student'] == 'Student') & (df['Profession'].isna())
    df.loc[mask_student, 'Profession'] = 'Student'
    mask_working = (df['Working Professional or Student'] == 'Working Professional') & (df['Profession'].isna())
    df.loc[mask_working, 'Profession'] = 'Unknown'

    median_student_pressure = df.loc[df['Working Professional or Student'] == 'Student', 'Academic Pressure'].median()
    mask_student_nan = (df['Working Professional or Student'] == 'Student') & (df['Academic Pressure'].isna())
    df.loc[mask_student_nan, 'Academic Pressure'] = median_student_pressure
    mask_working_nan = (df['Working Professional or Student'] == 'Working Professional') & (df['Academic Pressure'].isna())
    df.loc[mask_working_nan, 'Academic Pressure'] = 0

    median_work_pressure = df.loc[df['Working Professional or Student'] == 'Working Professional', 'Work Pressure'].median()
    mask_nan_working = (df['Working Professional or Student'] == 'Working Professional') & (df['Work Pressure'].isna())
    df.loc[mask_nan_working, 'Work Pressure'] = median_work_pressure
    mask_nan_students = (df['Working Professional or Student'] == 'Student') & (df['Work Pressure'].isna())
    df.loc[mask_nan_students, 'Work Pressure'] = 0

    median_cgpa_students = df.loc[df['Working Professional or Student'] == 'Student', 'CGPA'].median()
    mask_student_cgpa_nan = (df['Working Professional or Student'] == 'Student') & (df['CGPA'].isna())
    df.loc[mask_student_cgpa_nan, 'CGPA'] = median_cgpa_students
    mask_working_cgpa_nan = (df['Working Professional or Student'] == 'Working Professional') & (df['CGPA'].isna())
    df.loc[mask_working_cgpa_nan, 'CGPA'] = 0

    median_study_satisfaction_students = df.loc[df['Working Professional or Student'] == 'Student', 'Study Satisfaction'].median()
    mask_student_study_nan = (df['Working Professional or Student'] == 'Student') & (df['Study Satisfaction'].isna())
    df.loc[mask_student_study_nan, 'Study Satisfaction'] = median_study_satisfaction_students
    mask_working_study_nan = (df['Working Professional or Student'] == 'Working Professional') & (df['Study Satisfaction'].isna())
    df.loc[mask_working_study_nan, 'Study Satisfaction'] = 0

    median_job_satisfaction_working = df.loc[df['Working Professional or Student'] == 'Working Professional', 'Job Satisfaction'].median()
    mask_working_job_nan = (df['Working Professional or Student'] == 'Working Professional') & (df['Job Satisfaction'].isna())
    df.loc[mask_working_job_nan, 'Job Satisfaction'] = median_job_satisfaction_working
    mask_student_job_nan = (df['Working Professional or Student'] == 'Student') & (df['Job Satisfaction'].isna())
    df.loc[mask_student_job_nan, 'Job Satisfaction'] = 0

    for col in ['Dietary Habits', 'Degree']:
        if col in df.columns and df[col].isnull().any():
            df[col].fillna(df[col].mode().iloc[0], inplace=True)
    if 'Financial Stress' in df.columns and df['Financial Stress'].isnull().any():
        df['Financial Stress'].fillna(df['Financial Stress'].median(), inplace=True)

    return df


def encode_and_align(df_train, df_test, categorical_cols, numerical_cols, log_transform_cols):
    train_encoded = pd.get_dummies(df_train, columns=categorical_cols, drop_first=False)
    test_encoded  = pd.get_dummies(df_test, columns=categorical_cols, drop_first=False)
    train_cols = set(train_encoded.columns)
    test_cols  = set(test_encoded.columns)
    missing_in_test = train_cols - test_cols
    extra_in_test = test_cols - train_cols
    for col in missing_in_test:
        test_encoded[col] = 0
    test_encoded = test_encoded[[col for col in train_encoded.columns if col in test_encoded.columns]]
    for col in log_transform_cols:
        if col in train_encoded.columns:
            train_encoded[col] = np.log1p(train_encoded[col])
        if col in test_encoded.columns:
            test_encoded[col] = np.log1p(test_encoded[col])
    return train_encoded, test_encoded


# ---------------------------------------
# Main pipeline flow
# ---------------------------------------

categorical_cols = [
    'Gender', 'City', 'Working Professional or Student', 'Profession',
    'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness', 'Sleep Duration'
]
numerical_cols = [
    'Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction',
    'Job Satisfaction', 'Work/Study Hours', 'Financial Stress'
]
log_transform_cols = ['Academic Pressure', 'CGPA', 'Study Satisfaction']

df_train = pd.read_csv(r"D:\GUVI\Mental_health_survey\playground-series-s4e11\train.csv")
df_test = pd.read_csv(r"D:\GUVI\Mental_health_survey\playground-series-s4e11\test.csv")

# Save test IDs before cleaning or dropping 'id'
test_ids = df_test["id"].copy()

df_train = clean_df(df_train, is_train=True)
df_test = clean_df(df_test, is_train=False)

df_train = custom_impute(df_train)
df_test = custom_impute(df_test)

if "id" in df_train.columns:
    df_train = df_train.drop(columns=["id"])

if "id" in df_test.columns:
    df_test = df_test.drop(columns=["id"])

y_train = df_train["Depression"]
X_train = df_train.drop(columns=["Depression"])
X_test = df_test.copy()

X_train_encoded, X_test_encoded = encode_and_align(
    X_train, X_test, categorical_cols, numerical_cols, log_transform_cols
)

# Train-validation split for evaluation
X_tr_enc, X_val_enc, y_tr, y_val = train_test_split(
    X_train_encoded, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Scaling
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr_enc)
X_val_scaled = scaler.transform(X_val_enc)
X_test_scaled = scaler.transform(X_test_encoded)

joblib.dump(scaler, "scaler_pipeline.save")

# Model creation
model = Sequential()
model.add(Dense(256, input_dim=X_tr_scaled.shape[1], activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(128, activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(0.25))
model.add(Dense(64, activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(32, activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(0.15))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy",
              optimizer=Adam(learning_rate=0.001),
              metrics=["accuracy", Precision(name='precision'), Recall(name='recall'), AUC(name='auc')])

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1
)

# Train model
model.fit(
    X_tr_scaled, y_tr,
    validation_data=(X_val_scaled, y_val),
    epochs=50,
    batch_size=512,
    callbacks=[early_stop],
    verbose=1
)

model.save('mental_health_survey_final_pipeline.keras')

# Evaluate on validation set
y_val_pred_prob = model.predict(X_val_scaled)
y_val_pred = (y_val_pred_prob > 0.5).astype(int).flatten()

print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Precision:", precision_score(y_val, y_val_pred))
print("Validation Recall:", recall_score(y_val, y_val_pred))
print("Validation F1 Score:", f1_score(y_val, y_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

# Predict and save submission
y_pred_test = model.predict(X_test_scaled)
y_pred_test_classes = (y_pred_test > 0.5).astype(int).flatten()
submission = pd.DataFrame({
    'id': test_ids,
    'Depression': y_pred_test_classes
})
submission.to_csv('submission1.csv', index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Financial Stress'].fillna(df['Financial Stress'].median(), inplace=True)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.8677 - auc: 0.9101 - loss: 0.3714 - precision: 0.6030 - recall: 0.7960 - val_accuracy: 0.9328 - val_auc: 0.9595 - val_loss: 0.2171 - val_precision: 0.8534 - val_recall: 0.7608
Epoch 2/50
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9293 - auc: 0.9572 - loss: 0.2064 - precision: 0.8223 - recall: 0.7794 - val_accuracy: 0.9348 - val_auc: 0.9692 - val_loss: 0.1778 - val_precision: 0.8183 - val_recall: 0.8244
Epoch 3/50
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9331 - auc: 0.9665 - loss: 0.1754 - precision: 0.8292 - recall: 0.7956 - val_accuracy: 0.9370 - val_auc: 0.9714 - val_loss: 0.1606 - val_precision: 0.8367 - val_recall: 0.8115
Epoch 4/50
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9345 - auc: 0.9688 - loss: 0.1679 - precision: 0.8342 - recall: 0.7981