In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV


In [2]:
# Base estimator that the grid search below will tune; random_state is pinned
# to 42 so the estimator's internal randomness uses a fixed seed.
model = GradientBoostingClassifier(random_state=42)


In [3]:
# Load the preprocessed train and test tables.
train_df = pd.read_csv('../train_test_files/processed_train.csv')
test_df = pd.read_csv('../train_test_files/processed_test.csv')

# Detach the identifier column from the test table: pop() returns the column
# and removes it from test_df in one step. `x` keeps the ids so they can be
# attached to the predictions when the output file is built.
x = test_df.pop('response_id')

In [4]:
# Hyper-parameter search space: 2 * 3 * 2 * 2 * 2 * 1 = 48 combinations.
param_grid = dict(
    # Number of boosting stages (trees); kept small to bound compute.
    n_estimators=[50, 100],
    # Shrinkage applied to each tree's contribution.
    learning_rate=[0.01, 0.1, 0.2],
    # Maximum depth of each individual tree.
    max_depth=[3, 5],
    # Minimum number of samples needed to split an internal node.
    min_samples_split=[2, 5],
    # Fraction of the training rows used to fit each tree.
    subsample=[0.8, 1.0],
    # Number of features considered at each split.
    max_features=['sqrt'],
)


In [5]:
# Exhaustive search over param_grid: every candidate is scored with F1 under
# 5-fold cross-validation. verbose=2 prints per-fit progress and n_jobs=-1
# runs the fits on all available cores.
grid_search = GridSearchCV(model, param_grid, scoring='f1', cv=5, verbose=2, n_jobs=-1)


In [6]:
# Separate the target column from the feature columns, then run the search.
X_train = train_df.drop(columns=['exit_status'])
y_train = train_df['exit_status']
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [7]:
# Predict a class label for every row of the test features (test_df no longer
# contains the response_id column at this point).
final = grid_search.predict(test_df)

In [8]:
# Translate each predicted class into its text label: class 1 becomes
# "Stayed" and every other value becomes "Left".
# NOTE(review): assumes 1 encodes "Stayed" in exit_status; confirm against
# the preprocessing step.
predictions = []
for pred in final:
    if pred == 1:
        predictions.append("Stayed")
    else:
        predictions.append("Left")

In [9]:
# Pair every saved response id with its text prediction; the column order
# (response_id, Predictions) is the order written to the output file.
submission_columns = {'response_id': x, 'Predictions': predictions}
output_df = pd.DataFrame(submission_columns)

In [10]:
# Write the predictions to disk without the DataFrame index column.
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise a fresh checkout without ../output/ would fail here,
# after the expensive grid search has already run.
output_path = Path('../output/gradientboosting_output.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
output_df.to_csv(output_path, index=False)

In [11]:
# Show the hyper-parameter combination that the grid search selected.
print(grid_search.best_params_)

{'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 1.0}


In [12]:
# Predict on the training features themselves (target column removed); the
# result is compared with the true labels further down.
train_features = train_df.drop(columns=['exit_status'])
y_predict = grid_search.predict(train_features)

In [13]:
from sklearn.metrics import f1_score

In [14]:
# NOTE: y_predict was produced from train_df, the same rows the search was
# fit on, so this figure is the training-set F1 and is an optimistic estimate
# of how the model will do on unseen data.
print("the f1 score is :" , f1_score(train_df['exit_status'],y_predict))
# Also report the mean cross-validated F1 of the selected parameters (the
# search was configured with scoring='f1' and cv=5); each fold is scored on
# rows held out from fitting, so this is the fairer number to quote.
print("the cross-validated f1 score is :", grid_search.best_score_)

the f1 score is : 0.7772883343103593
