In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier


In [2]:
# Training table; it carries the `exit_status` column that is used as the
# target when the search is fitted further down.
train_df = pd.read_csv(filepath_or_buffer='../train_test_files/processed_train.csv')

In [3]:
# Table to score; it carries a `response_id` column that is split off below.
test_df = pd.read_csv(filepath_or_buffer='../train_test_files/processed_test.csv')

In [4]:
# Detach the identifier column: `x` keeps the ids for the submission file and
# `test_df` is left without `response_id`, ready to be passed to predict().
# NOTE: this mutates `test_df` in place, so the cell cannot be re-run without
# reloading the test table first.
x = test_df.pop('response_id')

In [5]:
# Base estimator handed to the grid search; the seed is fixed so repeated
# runs use the same random state.
model_xg = XGBClassifier(random_state=42)

In [6]:
# Search space for the XGBoost grid search:
# 2 * 2 * 3 * 2 * 2 * 2 * 2 * 2 * 2 = 768 parameter combinations.
#
# Experiment log (numbers copied from the original notes; presumably a score
# in percent obtained with each grid -- metric/source not recorded, TODO confirm):
#   * earlier, smaller grid -> 74.405
#       n_estimators     [50, 100, 150]
#       learning_rate    [0.01, 0.1, 0.2]
#       max_depth        [3, 4, 5]
#       subsample        [0.8, 1.0]
#       colsample_bytree [0.8, 1.0]
#   * current grid (below)  -> 74.494
param_grid_xg = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1],
    min_child_weight=[1, 3],
    reg_alpha=[0, 0.1],
    reg_lambda=[1, 2],
)

In [7]:
# Exhaustive search of `param_grid_xg` around the base XGBoost model.
grid_search = GridSearchCV(
    model_xg,
    param_grid_xg,
    cv=5,          # five cross-validation folds per candidate
    scoring='f1',  # candidates are ranked by F1; swap in another metric if preferred
    n_jobs=-1,     # run the fits on every available core
    verbose=2,     # log one line per completed fit
)


In [8]:
# Fit every candidate: features are all columns except `exit_status`, which is the target.
grid_search.fit(X=train_df.drop(columns=['exit_status']), y=train_df['exit_status'])

Fitting 5 folds for each of 768 candidates, totalling 3840 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=1.0; total time=   0.7s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0, reg_lambda=2, subsample=1.0; total time=   0.7s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0.1, reg_lambda=1, subsample=1.0; total time=   0.7s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=200, reg_alpha=0, reg_lambda=1, subsample=0.8; total time=   1.5s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=200, reg_alpha=0, reg_lambda=2, subsample=1.0; total time=   1.4s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, m

In [9]:
# Class predictions for the test table, made by the fitted search object.
final = grid_search.predict(X=test_df)

In [10]:
# Translate numeric classes into submission labels: 1 -> "Stayed", anything
# else -> "Left". (Assumes 1 encodes "Stayed" in `exit_status` -- confirm
# against the preprocessing step.)
predictions = np.where(final == 1, "Stayed", "Left").tolist()

In [11]:
# Submission table: one row per test id with its predicted label.
output_df = pd.DataFrame(dict(response_id=x, Predictions=predictions))

In [12]:
# Write the submission without the DataFrame index column.
output_df.to_csv(path_or_buf='../output/xgboost_output.csv', index=False)

In [13]:
# Show the parameter combination that the grid search selected as best.
print(grid_search.best_params_)

{'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 2, 'subsample': 0.8}


In [14]:
# Predictions on the training features themselves (the same rows used in fit),
# i.e. in-sample predictions.
y_predict = grid_search.predict(X=train_df.drop(columns=['exit_status']))

In [15]:
# `f1_score` is imported in the notebook's top import cell.
#
# `y_predict` was produced on the very rows the search was fitted on, so this
# first number is an in-sample F1 and will overstate performance on unseen data.
print("the f1 score is :" , f1_score(train_df['exit_status'],y_predict))
# Mean F1 over the held-out cross-validation folds for the selected parameters
# (the search was configured with scoring='f1'); use this figure, not the
# in-sample one above, when judging how the model generalises.
print("the cross-validated f1 score is :", grid_search.best_score_)

the f1 score is : 0.7719805626082423
