In [10]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, cross_val_score,KFold
from sklearn.metrics import f1_score,confusion_matrix,roc_curve,roc_auc_score,accuracy_score


from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [11]:
# Optional one-time environment setup; every line is intentionally commented out.
# If a package is missing, prefer `%pip install <pkg>` over `! pip install <pkg>`
# in a notebook so the install targets the running kernel's environment.
#! pip install pandas
#! pip install seaborn
#! pip install imblearn
#! pip install xgboost

In [12]:
# Load both CSVs and drop the first column of each by position
# (presumably a saved row-index column -- confirm against the raw files).
# The chained form is deliberate: no reference to the un-sliced frame is kept.
df_train = pd.read_csv('cs-training.csv').iloc[:, 1:]

# The test file also carries the target column; remove it so that only
# predictor columns remain.
df_test = (
    pd.read_csv('cs-test.csv')
    .iloc[:, 1:]
    .drop(columns='SeriousDlqin2yrs')
)

df_train.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [13]:
# Data Cleaning
# Report the dimensions of both frames before any cleaning is applied.
train_rows, train_cols = df_train.shape
test_rows, test_cols = df_test.shape
print(f'Training set has {train_rows} rows and {train_cols} columns')
print(f'Test set has {test_rows} rows and {test_cols} columns')

Training set has 150000 rows and 11 columns
Test set has 101503 rows and 10 columns


In [14]:
# Percentage of missing values per training column, rounded to two decimals.
missing_counts = df_train.isnull().sum()
missing_pct = 100 * missing_counts / len(df_train)
np.round(missing_pct, 2)

SeriousDlqin2yrs                         0.00
RevolvingUtilizationOfUnsecuredLines     0.00
age                                      0.00
NumberOfTime30-59DaysPastDueNotWorse     0.00
DebtRatio                                0.00
MonthlyIncome                           19.82
NumberOfOpenCreditLinesAndLoans          0.00
NumberOfTimes90DaysLate                  0.00
NumberRealEstateLoansOrLines             0.00
NumberOfTime60-89DaysPastDueNotWorse     0.00
NumberOfDependents                       2.62
dtype: float64

In [15]:
# Missing value imputation
# The fill value for MonthlyIncome is the median of the training frame and is
# reused for the test frame; missing NumberOfDependents values become 0.
# NOTE(review): the median is taken over all of df_train, i.e. before the
# hold-out split performed in the next cell, so hold-out rows contribute to it.
income_med = df_train['MonthlyIncome'].median()

for frame in (df_train, df_test):
    frame['MonthlyIncome'] = frame['MonthlyIncome'].fillna(income_med)
    frame['NumberOfDependents'] = frame['NumberOfDependents'].fillna(0)

In [16]:
# Model Building
# Separate the target column from the predictor columns.
y = df_train['SeriousDlqin2yrs']
X = df_train.drop(columns='SeriousDlqin2yrs')

# Hold out 10% of the labelled rows for evaluation; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

train_shapes = [X_train.shape, y_train.shape]
test_shapes = [X_test.shape, y_test.shape]
print('training set', train_shapes)
print('test set', test_shapes)

training set [(135000, 10), (135000,)]
test set [(15000, 10), (15000,)]


In [17]:
# Oversample the training split with SMOTE (seeded for repeatability).
# This rebinds X_train / y_train to the resampled data; X_test / y_test are
# left exactly as produced by the split.
sm=SMOTE(random_state=21)
X_train,y_train=sm.fit_resample(X_train,y_train)

In [18]:
# Class counts in the training target after resampling.
y_train.value_counts()

0    125935
1    125935
Name: SeriousDlqin2yrs, dtype: int64

In [19]:
# Scaling
# Learn the scaling statistics from the (resampled) training features only,
# then apply the same fitted scaler to every feature matrix.
sc = StandardScaler()
sc.fit(X_train)

X_train_scaled, X_test_scaled, df_test_scaled = (
    sc.transform(features) for features in (X_train, X_test, df_test)
)

In [20]:
# Training best xgb model
# Hyperparameters are fixed here (presumably taken from an earlier search --
# the search itself is not part of this notebook section).
xgb_params = {
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'max_depth': 7,
    'min_child_weight': 3,
    'learning_rate': 0.1,
    'gamma': 0.1,
    'reg_alpha': 0.01,
    'reg_lambda': 0.1,
}
xgb1 = XGBClassifier(**xgb_params)
xgb1.fit(X_train_scaled, y_train)

# Score on the hold-out split, which was scaled but not resampled.
pred = xgb1.predict(X_test_scaled)
print('F1 score:', f1_score(y_test, pred))



F1 score: 0.361532899493854


In [21]:
# generate pickle file
import pickle

# Serialise the fitted classifier to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises part-way through,
# which the previous manual open()/close() pair did not.
with open('xgb_model.pkl', 'wb') as f1:
    pickle.dump(xgb1, f1)

# NOTE(review): only the classifier is saved. The model was fitted on features
# transformed by the StandardScaler `sc`, which is not persisted here, so a
# consumer of xgb_model.pkl needs the same fitted scaler from elsewhere.

In [30]:
# Smoke test: predict the class of the first hold-out row.
# reshape(1, -1) turns the 1-D row into a single-sample 2-D array and lets
# NumPy infer the feature count, instead of hard-coding 10 columns.
xgb1.predict(X_test_scaled[0].reshape(1, -1))

array([0], dtype=int64)