In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBClassifier

# Step 1: Load the dataset
file_path = 'relevant_features_dataset.csv'
df = pd.read_csv(file_path)

# Step 2: Remove rows with the "never" class in the target column.
# .copy() detaches the filtered frame from the original so that the column
# assignments in Step 3 write to an independent DataFrame instead of a slice
# (avoids pandas' SettingWithCopyWarning / chained-assignment ambiguity).
target_column = 'How_often_do_you_feel_stressed'
df = df[df[target_column] != "never"].copy()

# Step 3: Identify categorical columns and apply Label Encoding.
# Every object-dtype column is encoded, which includes the target when it is
# stored as strings. Encoding happens after the "never" rows are dropped, so
# each encoder only sees the values that remain in the data.
# The fitted encoders are kept so the integer codes can be mapped back later.
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Step 4: Separate features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# Step 5: Scale the features.
# NOTE(review): the scaler is fit on all rows before cross-validation, so the
# held-out fold of every split contributes to the mean/std used for scaling.
# Consider moving the scaler into a Pipeline if the estimator is ever swapped
# for one that is sensitive to feature scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"Number of features after scaling: {X_scaled.shape[1]}")

# Step 6: Define the XGBoost model.
# 'logloss' is the binary log-loss metric; use the multiclass variant when the
# target has more than two classes. The metric only comes into play when an
# evaluation set is supplied, so it does not affect the accuracy scores below.
n_classes = y.nunique()
eval_metric = 'mlogloss' if n_classes > 2 else 'logloss'
xgb_model = XGBClassifier(eval_metric=eval_metric, random_state=42)

# Step 7: Define the hyperparameter grid (3 * 3 * 3 * 2 = 54 candidates)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

# Step 8: Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

print("Running Grid Search...")
grid_search.fit(X_scaled, y)

# Step 9: Print Best Parameters and Results
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.2f}")

# Step 10: Evaluate the best model on cross-validation.
# This re-scores the tuned estimator on the same X_scaled / y that the grid
# search used to select it, so read it as a per-fold breakdown of the selected
# configuration rather than an independent (unbiased) performance estimate.
best_xgb = grid_search.best_estimator_
cv_scores_best = cross_val_score(best_xgb, X_scaled, y, cv=5, scoring='accuracy')

print(f"Cross-Validation Accuracy Scores (Best XGBoost): {cv_scores_best}")
print(f"Mean CV Accuracy (Best XGBoost): {cv_scores_best.mean():.2f}")
print(f"Standard Deviation of CV Accuracy (Best XGBoost): {cv_scores_best.std():.2f}")


Number of features after scaling: 28
Running Grid Search...
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Best Cross-Validation Accuracy: 0.38
Cross-Validation Accuracy Scores (Best XGBoost): [0.41860465 0.35714286 0.47619048 0.30952381 0.33333333]
Mean CV Accuracy (Best XGBoost): 0.38
Standard Deviation of CV Accuracy (Best XGBoost): 0.06
