In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Step 1: Load the dataset
file_path = 'relevant_features_dataset.csv'
df = pd.read_csv(file_path)
if "Unnamed_0" in df.columns:
    df = df.drop(columns=["Unnamed_0"])  # Remove the "Unnamed_0" column

# Step 2: Remove rows with the "never" class in the target column
target_column = 'How_often_do_you_feel_stressed'
df = df[df[target_column] != "never"]

# Step 3: Identify categorical columns and apply Label Encoding
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Step 4: Separate features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# Step 5: Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"Number of features after scaling: {X_scaled.shape[1]}")

# Step 6: Define models and hyperparameter grids
models = {
    'Logistic Regression': {
        'model': LogisticRegression(max_iter=1000, random_state=42),
        'params': {
            'C': [0.01, 0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs']
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, None],
            'min_samples_split': [2, 10]
        }
    },
    'XGBoost': {
        'model': XGBClassifier(eval_metric='logloss', random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 6, 10],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 1.0]
        }
    },
    'LightGBM': {
        'model': LGBMClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 6, 10],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 1.0]
        }
    },
    'SVM': {
        'model': SVC(random_state=42, probability=True),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    }
}

# Step 7: Perform Grid Search for Each Model with SMOTE
results = {}
for name, config in models.items():
    print(f"Running Grid Search for {name}...")
    # Use Pipeline to include SMOTE in the workflow
    pipeline = Pipeline(steps=[
        ('smote', SMOTE(random_state=42)),
        ('scaler', StandardScaler()),  # Optional: Include scaler here
        ('model', config['model'])
    ])
    grid_search = GridSearchCV(estimator=pipeline, param_grid={'model__' + k: v for k, v in config['params'].items()},
                               cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X, y)  # Use raw features; pipeline handles scaling and SMOTE
    results[name] = {
        'Best Parameters': grid_search.best_params_,
        'Best CV Accuracy': grid_search.best_score_
    }
    print(f"Best Parameters for {name}: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy for {name}: {grid_search.best_score_:.2f}")

# Step 8: Compare Results
print("\nModel Comparison:")
for name, result in results.items():
    print(f"{name}: Best CV Accuracy = {result['Best CV Accuracy']:.2f}, Best Parameters = {result['Best Parameters']}")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Number of features after scaling: 27
Running Grid Search for Logistic Regression...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters for Logistic Regression: {'model__C': 0.1, 'model__solver': 'liblinear'}
Best Cross-Validation Accuracy for Logistic Regression: 0.42
Running Grid Search for Random Forest...
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters for Random Forest: {'model__max_depth': 10, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Best Cross-Validation Accuracy for Random Forest: 0.41
Running Grid Search for XGBoost...
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters for XGBoost: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__subsample': 0.8}
Best Cross-Validation Accuracy for XGBoost: 0.41
Running Grid Search for LightGBM...
Fitting 5 folds for each of 54 candidates, totalling 270 fits
[LightGBM] [Info] Auto-choosing row-wise multi-thre