In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, DropCorrelatedFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, mean_squared_error, log_loss, recall_score
from xgboost import XGBClassifier

%matplotlib inline

In [3]:
train_data = pd.read_csv('train.csv')

In [4]:
train_data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [5]:
# Describe train dataset
train_data.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0


In [6]:
#  Check if any null values in dataframe
train_data.isnull().sum().sum()

0

In [7]:
# Drop duplicate records
train_data = train_data.drop("ID", axis=1)
original_records = train_data.shape[0]

train_data = train_data.drop_duplicates()
print(f"Dropped {original_records - train_data.shape[0]} duplicate records")

Dropped 4807 duplicate records


In [8]:
# Count of class 0 and 1
train_data.TARGET.value_counts()

0    68398
1     2815
Name: TARGET, dtype: int64

In [9]:
# Removing constant value features (zero variance features) from the dataset
const_col = []
for col in train_data.columns:
  if train_data[col].unique().shape[0] <= 1:
    const_col.append(col)
print(f'Removing {len(const_col)} features with constant values')
train_data = train_data.drop(const_col, axis = 1)

Removing 34 features with constant values


In [10]:
X = train_data.drop(['TARGET'], axis = 1)
y = train_data['TARGET']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 0, stratify = y)

In [12]:
# Drop features that are exactly identical to each other
duplicates = DropDuplicateFeatures()

# find duplicated features in the train set
duplicates.fit(X_train)

In [13]:
## Columns that are exactly identical to each other
duplicates.duplicated_feature_sets_

[{'ind_var29_0', 'ind_var6_0'},
 {'ind_var29', 'ind_var6'},
 {'ind_var13_medio', 'ind_var13_medio_0'},
 {'ind_var18', 'ind_var18_0'},
 {'ind_var26', 'ind_var26_0'},
 {'ind_var25', 'ind_var25_0'},
 {'ind_var32', 'ind_var32_0'},
 {'ind_var34', 'ind_var34_0'},
 {'ind_var37', 'ind_var37_0'},
 {'ind_var39', 'ind_var40'},
 {'num_var29_0', 'num_var6_0'},
 {'num_var29', 'num_var6'},
 {'num_var13_medio', 'num_var13_medio_0'},
 {'num_var18', 'num_var18_0'},
 {'num_var26', 'num_var26_0'},
 {'num_var25', 'num_var25_0'},
 {'num_var32', 'num_var32_0'},
 {'num_var34', 'num_var34_0'},
 {'num_var37', 'num_var37_0'},
 {'num_var39', 'num_var40'},
 {'saldo_var29', 'saldo_var6'},
 {'saldo_medio_var13_medio_ult1', 'saldo_var13_medio'},
 {'delta_imp_reemb_var13_1y3', 'delta_num_reemb_var13_1y3'},
 {'delta_imp_reemb_var17_1y3', 'delta_num_reemb_var17_1y3'},
 {'delta_imp_reemb_var33_1y3', 'delta_num_reemb_var33_1y3'},
 {'delta_imp_trasp_var17_in_1y3', 'delta_num_trasp_var17_in_1y3'},
 {'delta_imp_trasp_var17_o

In [14]:
print('Number of variables before removing duplicates: ', X_train.shape[1])

X_train = duplicates.transform(X_train)
X_test = duplicates.transform(X_test)

print('Number of variables after removing duplicates: ', X_train.shape[1])

Number of variables before removing duplicates:  335
Number of variables after removing duplicates:  306


In [15]:
# Identify categorical columns based on unique values. All categorical features would have value as [0,1] only
i= 0 
categorical_columns =[]
for col in X_train.columns:
  uniques = X_train[col].unique()
  if all([unique in [0,1] for unique in uniques]):
    i+=1
    categorical_columns.append(col)
len(categorical_columns)

56

In [16]:
X_train_cat = X_train[categorical_columns]
X_test_cat = X_test[categorical_columns]

X_train_num = X_train.drop(categorical_columns, axis=1)
X_test_num = X_test.drop(categorical_columns, axis=1)

In [17]:
#Remove features with >95% correlation in numerical dataset
correlated = DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.95)

# find correlated variables in the train set
correlated.fit(X_train_num)

len(correlated.features_to_drop_)

62

In [18]:
print('Number of variables before removing correlated: ', X_train_num.shape[1])

X_train_num = correlated.transform(X_train_num)
X_test_num = correlated.transform(X_test_num)

print('Number of variables after removing correlated: ', X_train_num.shape[1])

Number of variables before removing correlated:  250
Number of variables after removing correlated:  188


In [19]:
#Scale Numerical data
scaler = StandardScaler()

scaled_train_num = scaler.fit_transform(X_train_num)
X_train_num = pd.DataFrame(scaled_train_num, columns=X_train_num.columns)

scaled_test_num = scaler.transform(X_test_num)
X_test_num = pd.DataFrame(scaled_test_num, columns=X_test_num.columns)

In [20]:
X_train_cat.shape, X_train_num.shape, X_test_num.shape, X_test_cat.shape

((56970, 56), (56970, 188), (14243, 188), (14243, 56))

In [21]:
# Join Categorical and numeric features
X_train = pd.concat([X_train_cat.reset_index(drop=True), X_train_num.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test_cat.reset_index(drop=True), X_test_num.reset_index(drop=True)], axis=1)

In [22]:
## Drop Sparse Columns. ie. columns having 0 in more than 99% of the observations
i=0
sparse_columns = []
for col in X_train.columns: #removing all sparse features
    if np.percentile(X_train[col],99)==0:
      sparse_columns.append(col)
X_train = X_train.drop(sparse_columns, axis=1)
X_test = X_test.drop(sparse_columns, axis=1)

print(f"dropped {len(sparse_columns)} sparse columns")

dropped 23 sparse columns


In [23]:
## Oversampling using SMOTE
oversample = SMOTE()
X_over, y_over = oversample.fit_resample(X_train, y_train)

In [24]:
X_over.shape, y_over.shape

((109436, 221), (109436,))

In [25]:
y_over.value_counts()

0    54718
1    54718
Name: TARGET, dtype: int64

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import roc_auc_score, classification_report, f1_score, recall_score, roc_curve, auc

In [27]:
o_log_model = LogisticRegression(random_state = 0, max_iter = 10000)
o_log_model.fit(X_over, y_over)

In [28]:
y_pred = o_log_model.predict(X_test)
o_log_model_accuracy = accuracy_score(y_test, y_pred)
o_log_model_auc = roc_auc_score(y_test, o_log_model.predict_proba(X_test)[:,1])
o_log_model_recall = recall_score(y_test, y_pred)
o_log_model_f1 = f1_score(y_test, y_pred)

print(f'''Logistic Regression model trained using oversampled data 
accuracy = {o_log_model_accuracy:.3f}
auc_score = {o_log_model_auc:.3f}
recall = {o_log_model_recall:.3f}
f1 = {o_log_model_f1:.3f}''')

Logistic Regression model trained using oversampled data 
accuracy = 0.726
auc_score = 0.779
recall = 0.726
f1 = 0.173


In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.73      0.84     13680
           1       0.10      0.73      0.17       563

    accuracy                           0.73     14243
   macro avg       0.54      0.73      0.50     14243
weighted avg       0.95      0.73      0.81     14243



In [30]:
#Fit LR Model on imbalanced data
log_model = LogisticRegression(random_state = 0, max_iter = 10000)
log_model.fit(X_train, y_train)

In [31]:
y_pred = log_model.predict(X_test)
log_model_accuracy = accuracy_score(y_test, y_pred)
log_model_auc = roc_auc_score(y_test, log_model.predict_proba(X_test)[:,1])
log_model_recall = recall_score(y_test, y_pred)
log_model_f1 = f1_score(y_test, y_pred)

print(f'''Logistic Regression model trained on imbalanced data 
accuracy = {log_model_accuracy:.3f}
auc_score = {log_model_auc:.3f}
recall = {log_model_recall:.3f}
f1 = {log_model_f1:.3f}''')

Logistic Regression model trained on imbalanced data 
accuracy = 0.960
auc_score = 0.793
recall = 0.005
f1 = 0.010


In [32]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from tqdm import tqdm

In [None]:
from collections import OrderedDict
clf = BalancedRandomForestClassifier(warm_start=True, 
                              oob_score=True,
                              min_samples_leaf=40,
                              max_depth = 10,
                              n_jobs=-1,
                              random_state=24)

error_rate = {}

estimators = [800, 900, 1000, 1100, 1500, 2000]
for i in tqdm(estimators):
    clf.set_params(n_estimators=i) 
    clf.fit(X_over, y_over)

    # Record the OOB error for each `n_estimators=i` setting.
    oob_error = 1 - clf.oob_score_
    error_rate[i] = oob_error

























































































































































































In [None]:
xs = []
ys = []
for label, clf_err in error_rate.items():
    xs.append(label)
    ys.append(clf_err)   
plt.plot(xs, ys)
# plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.show();

In [None]:
best_n_estimators = min(error_rate, key=error_rate.get)
best_n_estimators

In [None]:
from collections import OrderedDict
clf = BalancedRandomForestClassifier( 
                              oob_score=True,
                              n_estimators = best_n_estimators,
                              n_jobs=-1,
                              random_state=24)

error_rate = {}
recall_scores = []
f1_scores = []

depths = [5,8,10,15,20,30,50,100]
for i in tqdm(depths):
    clf.set_params(max_depth=i) 
    clf.fit(X_over, y_over)
    y_pred = clf.predict(X_test)
    recall_scores.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

    # Record the OOB error for each `n_estimators=i` setting.
    oob_error = 1 - clf.oob_score_
    error_rate[i] = oob_error
    print(oob_error)

In [None]:
xs = []
ys = []
for label, clf_err in error_rate.items():
    xs.append(label)
    ys.append(clf_err)  
fig, ax = plt.subplots(1,3, figsize=(10,5)) 
ax[0].plot(xs, ys, label="OOB error")
ax[0].set_ylabel("OOB error")
ax[0].set_xlabel("max_depth")
ax[0].legend()

ax[1].plot(xs, recall_scores, label="Recall score")
ax[1].set_ylabel("Recall")
ax[1].set_xlabel("max_depth")
ax[1].legend()

ax[2].plot(xs, f1_scores, label="F1 score")
ax[2].set_ylabel("F1 score")
ax[2].set_xlabel("max_depth")
ax[2].legend()

# plt.xlim(min_estimators, max_estimators)
plt.show();


In [None]:
best_n_estimators = 1100
best_max_depth = 30

rf_model = BalancedRandomForestClassifier( 
                              n_estimators = best_n_estimators,
                              max_depth = best_max_depth,
                              n_jobs=-1,
                              random_state=24)
rf_model.fit(X_over, y_over)

In [None]:
y_pred = rf_model.predict(X_test)
rf_model_accuracy = accuracy_score(y_test, y_pred)
rf_model_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:,1])
rf_model_recall = recall_score(y_test, y_pred)
rf_model_f1 = f1_score(y_test, y_pred)

print(f'''Random Forest Model 
accuracy = {rf_model_accuracy:.3f}
auc_score = {rf_model_auc:.3f}
recall = {rf_model_recall:.3f}
f1 = {rf_model_f1:.3f}''')

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
%%time
perm_importance = permutation_importance(rf_model, X_test, y_test)

In [None]:
plt.rc('font', size=4)
plt.figure(figsize=(5,5), dpi=300)
sorted_idx = np.absolute(perm_importance.importances_mean).argsort()[-50:]
plt.barh(X_test.columns[sorted_idx], np.absolute(perm_importance.importances_mean[sorted_idx]))
plt.xlabel("Permutation Importance")
plt.savefig("Permutation Importance.png")
plt.show()

In [None]:
X_test.columns[sorted_idx]

In [None]:
XGB_clf = XGBClassifier(random_state=2022, max_depth = 10, sub_sample = 0.4, gamma = 10)
XGB_clf.fit(X_over, y_over)

In [None]:
y_pred = XGB_clf.predict(X_test)
xgb_model_accuracy = accuracy_score(y_test, y_pred)
xgb_model_auc = roc_auc_score(y_test, XGB_clf.predict_proba(X_test)[:,1])
xgb_model_recall = recall_score(y_test, y_pred)
xgb_model_f1 = f1_score(y_test, y_pred)

print(f'''XGBoost Model 
accuracy = {xgb_model_accuracy:.3f}
auc_score = {xgb_model_auc:.3f}
recall = {xgb_model_recall:.3f}
f1 = {xgb_model_f1:.3f}''')

In [None]:
sorted_idx = np.absolute(perm_importance.importances_mean).argsort()[-50:]
top_50_features = X_test.columns[sorted_idx]

In [None]:
X_over_top = X_over[top_50_features]
X_test_top = X_test[top_50_features]

In [None]:
#Fit LR Model on oversampled data
top_log_model = LogisticRegression(random_state = 0, max_iter = 10000)
top_log_model.fit(X_over_top, y_over)

In [None]:
y_pred = top_log_model.predict(X_test_top)
top_log_model_accuracy = accuracy_score(y_test, y_pred)
top_log_model_auc = roc_auc_score(y_test, top_log_model.predict_proba(X_test_top)[:,1])
top_log_model_recall = recall_score(y_test, y_pred)
top_log_model_f1 = f1_score(y_test, y_pred)

print(f'''Logistic Regression model trained using oversampled data 
accuracy = {top_log_model_accuracy:.3f}
auc_score = {top_log_model_auc:.3f}
recall = {top_log_model_recall:.3f}
f1 = {top_log_model_f1:.3f}''')

In [None]:
top_XGB_clf = XGBClassifier(random_state=2022, max_depth = 10, sub_sample = 0.4, gamma = 10)
top_XGB_clf.fit(X_over_top, y_over)

In [None]:
y_pred = top_XGB_clf.predict(X_test_top)
top_xgb_model_accuracy = accuracy_score(y_test, y_pred)
top_xgb_model_auc = roc_auc_score(y_test, top_XGB_clf.predict_proba(X_test_top)[:,1])
top_xgb_model_recall = recall_score(y_test, y_pred)
top_xgb_model_f1 = f1_score(y_test, y_pred)

print(f'''XGBoost Model 
accuracy = {top_xgb_model_accuracy:.3f}
auc_score = {top_xgb_model_auc:.3f}
recall = {top_xgb_model_recall:.3f}
f1 = {top_xgb_model_f1:.3f}''')

# Conclusion

Accuracy isn't the best parameter to evaluate the model performance. Since accuracy doesn't factor in class imbalances.

Better metrics to evaluate the model are the f1-score and AUC score. From these metrics we can see that the XGBoost classifier has performed the best followed by Random Forest and Logistic Regresion.


**XGBoost Classifier**

Accuracy =  0.899 

F1 Score = 0.241

Recall = 0.403

AUC score = 0.811
