## Imports

In [11]:
import pandas as pd
import numpy as np
from best_th_function import best_th_function

## Read data

In [12]:
# Load the training and test CSV files (paths are relative to the notebook's working directory).
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("supply_chain_train.csv", "supply_chain_test.csv")
)

## Data preprocessing

In [13]:
# Target: the attrition label column.
y = df["Attrition_Flag"]

# Features: every remaining column except the target and the `train_idx` / `CLIENTNUM` columns.
X = df.drop(["Attrition_Flag", "train_idx", "CLIENTNUM"], axis="columns")

In [14]:
# Columns cast to pandas "category" dtype so the XGBoost models below
# (built with enable_categorical=True) can consume them directly.
# NOTE(review): a list of count-like columns ("Total_Relationship_Count",
# "Months_Inactive_12_mon", "Contacts_Count_12_mon", "Dependent_count") was
# assigned here and immediately overwritten, so it never had any effect.
# The dead assignment is removed; those columns stay numeric, as before.
categorical_columns = ["Gender", "Education_Level", "Marital_Status", "Income_Category", "Card_Category"]

X[categorical_columns] = X[categorical_columns].astype("category")

In [15]:
# Strip the `test_idx` / `CLIENTNUM` columns so the test frame only holds model inputs.
df_test = df_test.drop(["test_idx", "CLIENTNUM"], axis="columns")

In [16]:
# Reuse the category dtypes of the training frame instead of inferring them again
# from the test data. Casting each frame independently derives the levels (and hence
# the integer codes) from whatever values that frame contains, so a level missing
# from one file would shift the codes and silently misalign train and test.
# Test values never seen in training become NaN with this cast.
df_test = df_test.astype({col: X[col].dtype for col in categorical_columns})

## Searching best model and training

### Xgboost balancing data

In [17]:
# Ratio of non-positive to positive samples, later passed to XGBoost as `scale_pos_weight`.
# Assumes y is coded 0/1 so that sum(y) counts the positives - TODO confirm.
n_positive = sum(y)
scale_pos_weight = (len(y) - n_positive) / n_positive

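I searched over several hyperparameter combinations; only the best-performing values are kept in the grid below. Note: the saved output further down reports `gamma=0.15` and `n_estimators=400`, which does not match this grid; re-run the cells to refresh it.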

In [18]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

def grid_xgboost_bal(X, y, scale_pos_weight=None):
    """Cross-validate the class-weighted XGBoost configuration.

    Runs a 5-fold ``GridSearchCV`` scored with F1 over a single-point grid
    (the values kept from an earlier, wider search) and returns the fitted
    search object.

    Parameters
    ----------
    X : feature table accepted by ``XGBClassifier.fit``.
    y : label sequence; assumed to be coded 0/1 so that ``sum(y)`` counts
        the positive samples.
    scale_pos_weight : float, optional
        Weight applied to the positive class. When omitted it is derived from
        ``y`` as ``(len(y) - sum(y)) / sum(y)``, so the function no longer
        depends on a module-level variable defined in another cell.

    Returns
    -------
    The fitted ``GridSearchCV`` instance.

    Raises
    ------
    ValueError
        If the weight must be derived and ``y`` contains no positive sample.
    """
    if scale_pos_weight is None:
        n_positive = sum(y)
        if n_positive == 0:
            raise ValueError("y contains no positive samples; cannot derive scale_pos_weight")
        scale_pos_weight = (len(y) - n_positive) / n_positive

    gsc = GridSearchCV(
                estimator=xgb.XGBClassifier(tree_method="hist", enable_categorical=True),
                param_grid={"n_estimators": [1000],
                            "learning_rate": [0.15],
                            "max_depth": [30],
                            "min_child_weight": [1],
                            "gamma":[0.01],
                            "colsample_bytree":[0.3],
                            "scale_pos_weight": [scale_pos_weight]},
                cv=5, scoring="f1", verbose=0, n_jobs=-1)
    gsc.fit(X,y)
    return gsc

In [19]:
# Cross-validate the class-weighted configuration on the full training data.
grid_search_xgboost_bal = grid_xgboost_bal(X=X, y=y)

In [20]:
# Tabulate the per-candidate cross-validation results of the balanced search.
df_grid_bal = pd.DataFrame(data=grid_search_xgboost_bal.cv_results_)

In [21]:
# Show only the top-ranked candidate(s).
df_grid_bal.loc[df_grid_bal["rank_test_score"].eq(1)]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_learning_rate,param_max_depth,param_min_child_weight,param_n_estimators,param_scale_pos_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.317884,0.032027,0.056635,0.006505,0.3,0.15,0.15,30,1,400,0.191148,"{'colsample_bytree': 0.3, 'gamma': 0.15, 'lear...",0.942904,0.946252,0.958895,0.960464,0.956745,0.953052,0.007098,1


### Xgboost without balancing data

I searched over several hyperparameter combinations; only the best-performing values are kept in the grid below.

In [45]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

def grid_xgboost(X,y):
    """Cross-validate the XGBoost configuration that uses no class weighting.

    A 5-fold ``GridSearchCV`` scored with F1 is run over a single-point grid
    (the values kept from an earlier, wider search).

    Parameters
    ----------
    X : feature table accepted by ``XGBClassifier.fit``.
    y : label sequence aligned with ``X``.

    Returns
    -------
    The fitted ``GridSearchCV`` instance.
    """
    # One candidate per hyper-parameter: the search only cross-validates this setup.
    fixed_grid = {
        "n_estimators": [1000],
        "learning_rate": [0.15],
        "max_depth": [30],
        "min_child_weight": [1],
        "gamma": [0.3],
        "colsample_bytree": [0.5],
    }
    base_model = xgb.XGBClassifier(tree_method="hist", enable_categorical=True)

    search = GridSearchCV(
        estimator=base_model,
        param_grid=fixed_grid,
        cv=5,
        scoring="f1",
        verbose=0,
        n_jobs=-1,
    )
    search.fit(X, y)
    return search

In [46]:
# Cross-validate the unweighted configuration on the full training data.
grid_search_xgboost = grid_xgboost(X=X, y=y)

In [47]:
# Tabulate the cross-validation results and show only the top-ranked candidate(s).
df_grid = pd.DataFrame(data=grid_search_xgboost.cv_results_)
df_grid.loc[df_grid["rank_test_score"].eq(1)]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_learning_rate,param_max_depth,param_min_child_weight,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.480914,0.095863,0.025595,0.004319,0.5,0.3,0.15,30,1,1000,"{'colsample_bytree': 0.5, 'gamma': 0.3, 'learn...",0.939114,0.941772,0.947628,0.954403,0.94907,0.946397,0.005426,1


## Predictions

### PREDICT BALANCED

In [25]:
# Fit a fresh classifier on the whole training set with the hyper-parameters
# selected by the class-weighted search.
model = xgb.XGBClassifier(
    tree_method="hist",
    enable_categorical=True,
    **grid_search_xgboost_bal.best_params_,
)
model.fit(X, y)

#### Predict with best_th

In [26]:
# Estimate the decision threshold for the class-weighted configuration.
# Judging by the printed output, the helper reports one threshold per run and
# returns their average - confirm against best_th_function's source.
best_th = best_th_function(
    X,
    y,
    grid_search_xgboost_bal.best_params_,
)
print('Final Best Threshold: {}'.format(best_th))

Best Threshold: 0.4528 with F-Score: 0.9322
Best Threshold: 0.442 with F-Score: 0.9327
Best Threshold: 0.4754 with F-Score: 0.9297
Best Threshold: 0.5731 with F-Score: 0.9268
Best Threshold: 0.4276 with F-Score: 0.9394
Best Threshold: 0.5882 with F-Score: 0.9267
Best Threshold: 0.4463 with F-Score: 0.9347
Best Threshold: 0.3703 with F-Score: 0.9349
Best Threshold: 0.3106 with F-Score: 0.9202
Best Threshold: 0.3598 with F-Score: 0.928
Final Best Threshold: 0.44460999999999995


In [27]:
# Keep the second column of the predicted probabilities
# (presumably the probability of label 1 - TODO confirm via model.classes_).
class_probabilities = model.predict_proba(df_test)
test_predicted = class_probabilities[:, 1]

In [28]:
# Binarise in one pass. The mask is computed once from the raw probabilities, so the
# result no longer relies on two ordered in-place overwrites (the second of which was
# evaluated on already-modified values). The float dtype is kept, so downstream output
# is unchanged.
test_predicted = (test_predicted > best_th).astype(test_predicted.dtype)

In [29]:
# Wrap the hard labels in a one-column frame named "target".
df_pred = pd.DataFrame({"target": test_predicted})

In [30]:
# Export: class-weighted model, tuned threshold.
df_pred.to_json(path_or_buf="predicciones_b_best.json")

#### Predict th 0.5

In [31]:
# Recompute the raw probabilities; the previous array was already binarised.
# Second column of predict_proba (presumably the probability of label 1 - TODO confirm).
class_probabilities = model.predict_proba(df_test)
test_predicted = class_probabilities[:, 1]

In [32]:
# Binarise at the default 0.5 cut-off in one pass. The mask is computed once from the
# raw probabilities rather than through two ordered in-place overwrites; the float
# dtype is kept, so downstream output is unchanged.
test_predicted = (test_predicted > 0.5).astype(test_predicted.dtype)

In [33]:
# Wrap the hard labels in a one-column frame named "target".
df_pred = pd.DataFrame({"target": test_predicted})

In [34]:
# Export: class-weighted model, 0.5 threshold.
df_pred.to_json(path_or_buf="predicciones_b_05.json")

### PREDICT UNBALANCED

In [35]:
# Fit a fresh classifier on the whole training set with the hyper-parameters
# selected by the unweighted search. This rebinds `model`, replacing the
# class-weighted classifier used above.
model = xgb.XGBClassifier(
    tree_method="hist",
    enable_categorical=True,
    **grid_search_xgboost.best_params_,
)
model.fit(X, y)

#### Predict with best_th - MODEL CHOSEN

In [48]:
# Estimate the decision threshold for the unweighted configuration.
# Judging by the printed output, the helper reports one threshold per run and
# returns their average - confirm against best_th_function's source.
best_th = best_th_function(
    X,
    y,
    grid_search_xgboost.best_params_,
)
print('Final Best Threshold: {}'.format(best_th))

Best Threshold: 0.7148 with F-Score: 0.9333
Best Threshold: 0.7091 with F-Score: 0.931
Best Threshold: 0.7632 with F-Score: 0.9284
Best Threshold: 0.6818 with F-Score: 0.9294
Best Threshold: 0.6066 with F-Score: 0.9443
Best Threshold: 0.7075 with F-Score: 0.934
Best Threshold: 0.6974 with F-Score: 0.9386
Best Threshold: 0.7021 with F-Score: 0.9386
Best Threshold: 0.5832 with F-Score: 0.9224
Best Threshold: 0.6798 with F-Score: 0.9308
Final Best Threshold: 0.68455


In [53]:
# Keep the second column of the predicted probabilities
# (presumably the probability of label 1 - TODO confirm via model.classes_).
class_probabilities = model.predict_proba(df_test)
test_predicted = class_probabilities[:, 1]

In [54]:
# Binarise in one pass. The mask is computed once from the raw probabilities, so the
# result no longer relies on two ordered in-place overwrites (the second of which was
# evaluated on already-modified values). The float dtype is kept, so downstream output
# is unchanged.
test_predicted = (test_predicted > best_th).astype(test_predicted.dtype)

In [55]:
# Wrap the hard labels in a one-column frame named "target".
df_pred = pd.DataFrame({"target": test_predicted})

In [56]:
# Export: unweighted model, tuned threshold (the submission marked as chosen).
df_pred.to_json(path_or_buf="predicciones_u_best.json")

#### Predict th 0.5

In [41]:
# Recompute the raw probabilities; the previous array was already binarised.
# Second column of predict_proba (presumably the probability of label 1 - TODO confirm).
class_probabilities = model.predict_proba(df_test)
test_predicted = class_probabilities[:, 1]

In [42]:
# Binarise at the default 0.5 cut-off in one pass. The mask is computed once from the
# raw probabilities rather than through two ordered in-place overwrites; the float
# dtype is kept, so downstream output is unchanged.
test_predicted = (test_predicted > 0.5).astype(test_predicted.dtype)

In [43]:
# Wrap the hard labels in a one-column frame named "target".
df_pred = pd.DataFrame({"target": test_predicted})

In [44]:
# Export: unweighted model, 0.5 threshold.
df_pred.to_json(path_or_buf="predicciones_u_05.json")