In [2]:
import pandas as pd

df = pd.read_csv("mental_health_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   User_ID                  50000 non-null  int64  
 1   Age                      50000 non-null  int64  
 2   Gender                   50000 non-null  object 
 3   Occupation               50000 non-null  object 
 4   Country                  50000 non-null  object 
 5   Mental_Health_Condition  50000 non-null  object 
 6   Severity                 24998 non-null  object 
 7   Consultation_History     50000 non-null  object 
 8   Stress_Level             50000 non-null  object 
 9   Sleep_Hours              50000 non-null  float64
 10  Work_Hours               50000 non-null  int64  
 11  Physical_Activity_Hours  50000 non-null  int64  
 12  Social_Media_Usage       50000 non-null  float64
 13  Diet_Quality             50000 non-null  object 
 14  Smoking_Habit         

In [3]:
df['Severity'] = df['Severity'].fillna('None')
df.isnull().sum()

User_ID                    0
Age                        0
Gender                     0
Occupation                 0
Country                    0
Mental_Health_Condition    0
Severity                   0
Consultation_History       0
Stress_Level               0
Sleep_Hours                0
Work_Hours                 0
Physical_Activity_Hours    0
Social_Media_Usage         0
Diet_Quality               0
Smoking_Habit              0
Alcohol_Consumption        0
Medication_Usage           0
dtype: int64

In [4]:
df.describe()

Unnamed: 0,User_ID,Age,Sleep_Hours,Work_Hours,Physical_Activity_Hours,Social_Media_Usage
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,25000.5,41.47308,7.009934,55.06286,4.98204,3.24316
std,14433.901067,13.844185,1.732674,14.691575,3.161759,1.585235
min,1.0,18.0,4.0,30.0,0.0,0.5
25%,12500.75,29.0,5.5,42.0,2.0,1.9
50%,25000.5,41.0,7.0,55.0,5.0,3.2
75%,37500.25,53.0,8.5,68.0,8.0,4.6
max,50000.0,65.0,10.0,80.0,10.0,6.0


In [5]:
numerical_features = [
    'Age',
    'Sleep_Hours',
    'Work_Hours',
    'Physical_Activity_Hours',
    'Social_Media_Usage'
]

# We must define the exact order for each of these features.
ordinal_features = [
    'Severity',
    'Stress_Level',
    'Diet_Quality',
    'Smoking_Habit',
    'Alcohol_Consumption'
]

# Define the explicit order for each ordinal feature
severity_order = ['None', 'Low', 'Medium', 'High']
stress_order = ['Low', 'Medium', 'High']
diet_order = ['Healthy', 'Average', 'Unhealthy']
smoking_order = ['Non-Smoker', 'Occasional Smoker', 'Regular Smoker', 'Heavy Smoker']
alcohol_order = ['Non-Drinker', 'Social Drinker', 'Regular Drinker', 'Heavy Drinker']

# These will be one-hot encoded. We are excluding 'Country'.
nominal_features = [
    'Gender',
    'Occupation',
    'Consultation_History',
    'Medication_Usage'
]

print("✅ Feature types identified successfully!")
print(f"Numerical: {numerical_features}")
print(f"Ordinal: {ordinal_features}")
print(f"Nominal: {nominal_features}")

✅ Feature types identified successfully!
Numerical: ['Age', 'Sleep_Hours', 'Work_Hours', 'Physical_Activity_Hours', 'Social_Media_Usage']
Ordinal: ['Severity', 'Stress_Level', 'Diet_Quality', 'Smoking_Habit', 'Alcohol_Consumption']
Nominal: ['Gender', 'Occupation', 'Consultation_History', 'Medication_Usage']


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

# Define the transformers for each feature type
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('ord', OrdinalEncoder(categories=[severity_order, stress_order, diet_order, smoking_order, alcohol_order]), ordinal_features),
        ('nom', OneHotEncoder(handle_unknown='ignore', sparse_output=False), nominal_features)
    ],
    remainder='drop'  # Drop columns not specified (like 'Country')
)

print("✅ Preprocessing pipeline created.")

✅ Preprocessing pipeline created.


In [8]:
from sklearn.model_selection import train_test_split

TARGET_COLUMN = 'Mental_Health_Condition'
# Convert the target variable 'Yes'/'No' to 1/0
df[TARGET_COLUMN] = df[TARGET_COLUMN].map({'Yes': 1, 'No': 0})

# Separate features (X) and target (y) from the dataframe
X = df.drop(TARGET_COLUMN, axis=1)
y = df[TARGET_COLUMN]

# Split data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Essential for balanced classes
)

# Fit the preprocessor on the training data and transform both sets
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Shape of processed training data: {X_train_processed.shape}")
print(f"Shape of processed testing data: {X_test_processed.shape}")
print("\n✅ Data is preprocessed and ready for model training!")

Shape of processed training data: (40000, 25)
Shape of processed testing data: (10000, 25)

✅ Data is preprocessed and ready for model training!


## Using Random Forest

In [15]:
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import optuna

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 5, 30),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "class_weight": "balanced"  # Handle class imbalance
    }

    model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
    model.fit(X_train_processed, y_train)
    preds = model.predict(X_test_processed)
    return f1_score(y_test, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print("Best hyperparameters:", study.best_params)
print("Best F1 score:", study.best_value)

[I 2025-07-25 16:39:26,874] A new study created in memory with name: no-name-8973a774-b7dd-4883-a453-f89808273857
[I 2025-07-25 16:39:39,151] Trial 0 finished with value: 0.5002525507627033 and parameters: {'n_estimators': 530, 'max_depth': 18, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'bootstrap': True}. Best is trial 0 with value: 0.5002525507627033.
[I 2025-07-25 16:39:45,648] Trial 1 finished with value: 0.48437660075811906 and parameters: {'n_estimators': 987, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.5002525507627033.
[I 2025-07-25 16:39:48,696] Trial 2 finished with value: 0.5026523871484336 and parameters: {'n_estimators': 440, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 2 with value: 0.5026523871484336.
[I 2025-07-25 16:39:54,269] Trial 3 finished with value: 0.4999500649156097

Best hyperparameters: {'n_estimators': 592, 'max_depth': 30, 'min_samples_split': 17, 'min_samples_leaf': 4, 'max_features': None, 'bootstrap': True}
Best F1 score: 0.5088579721749574
