In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from xgboost import XGBClassifier


In [2]:
# Load the preprocessed training data from the shared processed_files folder.
train_csv_path = '../processed_files/preprocessed_data.csv'
train_df = pd.read_csv(train_csv_path)


In [3]:
# Load the preprocessed test data that predictions will be generated for.
test_csv_path = '../processed_files/preprocessed_test.csv'
test_df = pd.read_csv(test_csv_path)

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the
# distance_from_home column in the training data.
distance_from_home = train_df['distance_from_home']
distance_from_home.describe()

count    54831.000000
mean        50.017709
std         28.530596
min          1.000000
25%         25.000000
50%         50.000000
75%         75.000000
max        297.000000
Name: distance_from_home, dtype: float64

In [5]:
# Engineered feature: 1 when distance_from_home is above 40 and remote_work is 0,
# otherwise 0. (remote_work == 0 presumably means "not remote" - confirm encoding.)
train_far_from_home = train_df['distance_from_home'] > 40
train_not_remote = train_df['remote_work'] == 0
train_df['leave'] = np.where(train_far_from_home & train_not_remote, 1, 0)

In [6]:
# Same engineered feature on the test data: 1 when distance_from_home is above 40
# and remote_work is 0, otherwise 0.
test_far_from_home = test_df['distance_from_home'] > 40
test_not_remote = test_df['remote_work'] == 0
test_df['leave'] = np.where(test_far_from_home & test_not_remote, 1, 0)

In [7]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# NOTE(review): with both centering (with_mean) and scaling (with_std) switched
# off, StandardScaler leaves the values unchanged, so this scaler currently acts
# as a pass-through. Enable the flags if real standardisation is intended.
scaler_options = {'with_mean': False, 'with_std': False}
scaler = StandardScaler(**scaler_options)

In [8]:
# Split the training frame into the target (Y) and every remaining column (X).
target_column = 'exit_status'
Y = train_df[target_column]
X = train_df.drop(columns=[target_column])

In [9]:
# Keep the response ids so they can be written next to the predictions later.
# The column is intentionally NOT dropped from test_df here (the drop call is
# disabled), so response_id stays among the columns passed on downstream.
x = test_df.loc[:, 'response_id']

In [10]:
# Fit the scaler on the training feature columns only, then apply that same
# fitted transform to the test set. Calling fit_transform on the test frame
# re-estimates the scaler from test data, so train and test could end up on
# different scales. The target column is kept out of the scaler entirely.
#
# The results are rebuilt as DataFrames (test columns selected in training
# column order) instead of bare ndarrays, so column names survive and the
# features handed to predict() are aligned by name rather than by position.
feature_columns = [col for col in train_df.columns if col != 'exit_status']
train_target = train_df['exit_status']

train_scaled = scaler.fit_transform(train_df[feature_columns])
test_scaled = scaler.transform(test_df[feature_columns])

train_df = pd.DataFrame(train_scaled, columns=feature_columns, index=train_df.index)
train_df['exit_status'] = train_target  # re-attach the unscaled target
test_df = pd.DataFrame(test_scaled, columns=feature_columns, index=test_df.index)

In [11]:
# Base XGBoost classifier; the fixed seed keeps row/column subsampling reproducible.
xgb_base_settings = {'random_state': 42}
model_xg = XGBClassifier(**xgb_base_settings)

In [12]:
# Hyper-parameter grid explored by the grid search.
# 1 * 2 * 3 * 2 * 2 * 2 * 1 * 2 * 1 = 96 candidate combinations.
param_grid_xg = dict(
    n_estimators=[400],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.8, 0.6],
    colsample_bytree=[0.8, 0.5],
    gamma=[0.2, 0.1],
    min_child_weight=[1],
    reg_alpha=[0, 0.1],
    reg_lambda=[1],
)

In [13]:
# Exhaustive search over param_grid_xg for the base XGBoost model.
search_settings = dict(
    scoring='f1',  # metric used to rank candidates
    cv=5,          # 5-fold cross-validation
    verbose=2,     # log every individual fit
    n_jobs=-1,     # run fits on all available cores
)
grid_search = GridSearchCV(estimator=model_xg, param_grid=param_grid_xg, **search_settings)


In [14]:
# Run the cross-validated search on the training features and target.
grid_search.fit(X=X, y=Y)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=400, reg_alpha=0, reg_lambda=1, subsample=0.6; total time=   2.8s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.6; total time=   2.9s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=400, reg_alpha=0, reg_lambda=1, subsample=0.6; total time=   3.6s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.6; total time=   4.8s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.8; total time=   5.5s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, m

[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=300, reg_alpha=0, reg_lambda=1, subsample=0.8; total time=   5.1s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=300, reg_alpha=0, reg_lambda=1, subsample=0.6; total time=   5.2s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=300, reg_alpha=0.1, reg_lambda=1, subsample=0.6; total time=   5.0s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=300, reg_alpha=0, reg_lambda=1, subsample=0.6; total time=   2.2s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=300, reg_alpha=0.1, reg_lambda=1, subsample=0.8; total time=   2.1s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=300, reg_alpha=0, reg

[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=300, reg_alpha=0, reg_lambda=1, subsample=0.6; total time=   3.6s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=300, reg_alpha=0.1, reg_lambda=1, subsample=0.6; total time=   3.8s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=300, reg_alpha=0, reg_lambda=1, subsample=0.8; total time=   5.8s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=300, reg_alpha=0.1, reg_lambda=1, subsample=0.8; total time=   5.4s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=300, reg_alpha=0.1, reg_lambda=1, subsample=0.6; total time=   4.8s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=300, reg_alpha=0.

[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=400, reg_alpha=0, reg_lambda=1, subsample=0.6; total time=   4.4s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.8; total time=   4.3s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=400, reg_alpha=0, reg_lambda=1, subsample=0.8; total time=   6.7s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=400, reg_alpha=0, reg_lambda=1, subsample=0.6; total time=   6.7s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.6; total time=   6.3s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=400, reg_alpha=0, r

[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=400, reg_alpha=0, reg_lambda=1, subsample=0.6; total time=   4.5s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.6; total time=   4.5s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=400, reg_alpha=0, reg_lambda=1, subsample=0.6; total time=   7.2s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.8; total time=   6.8s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=400, reg_alpha=0, reg_lambda=1, subsample=0.8; total time=   3.0s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=400, reg_alpha=0, re

[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.8; total time=   4.9s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=400, reg_alpha=0, reg_lambda=1, subsample=0.8; total time=   7.2s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.8; total time=   6.9s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.6; total time=   6.1s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.8; total time=   2.7s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=400, reg_alpha=0

[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=400, reg_alpha=0, reg_lambda=1, subsample=0.6; total time=   4.7s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.6; total time=   4.8s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=400, reg_alpha=0, reg_lambda=1, subsample=0.6; total time=   7.3s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=0.6; total time=   6.9s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=400, reg_alpha=0, reg_lambda=1, subsample=0.8; total time=   3.0s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=400, reg_alpha=0, re

In [15]:
# Predict test-set classes with the fitted search object.
final = grid_search.predict(X=test_df)

In [16]:
# Translate numeric predictions into labels: exactly 1 -> "Stayed", anything else -> "Left".
# NOTE(review): assumes class 1 encodes "Stayed" in exit_status - confirm against preprocessing.
label_lookup = {True: "Stayed", False: "Left"}
predictions = [label_lookup[bool(pred == 1)] for pred in final]

In [17]:
# Two-column submission frame: the saved response ids next to their predicted labels.
output_df = x.to_frame(name='response_id').assign(Predictions=predictions)

In [18]:
# Write the submission file without the DataFrame index column.
output_csv_path = 'xgboost_output_35.csv'
output_df.to_csv(output_csv_path, index=False)

In [19]:
# Show the hyper-parameter combination that scored best in cross-validation.
best_params = grid_search.best_params_
print(best_params)

{'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 400, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.8}
