# Credit Card Churn - Logistic Regression with SMOTE-NC

### Data Preparation

In [None]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [None]:
working_directory = os.getcwd()

In [None]:
path = working_directory + '/data/BankChurners.csv'

In [None]:
bank_churn = pd.read_csv(path)

In [None]:
bank_churn.head()

In [None]:
bank_churn.info()

In [None]:
bank_churn.isnull().sum()

In [None]:
bank_churn.describe(include='all')

In [None]:
# Columns the analysis treats as irrelevant; everything else is kept.
columns_to_drop = [
    'CLIENTNUM',
    'Gender',
    'Marital_Status',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
bank_churn1 = bank_churn.drop(columns=columns_to_drop)

In [None]:
# One-hot encode the categorical predictors and binarise the target.
categorical_columns = ['Education_Level', 'Income_Category', 'Card_Category']
categorical = bank_churn1[['Attrition_Flag'] + categorical_columns]

# drop_first=True leaves one level per variable out as the reference category.
# dtype=int forces 0/1 integer dummies: pandas >= 2.0 produces bool columns by
# default, and a frame mixing bool with numeric columns converts to an object
# array, which the statsmodels steps later in the notebook cannot use.
dummies = pd.get_dummies(categorical, columns=categorical_columns,
                         drop_first=True, dtype=int)

# Target encoding: 1 for 'Attrited Customer', 0 for every other value.
dummies['Attrition_Flag'] = (dummies['Attrition_Flag'] == 'Attrited Customer').astype(int)

# Everything that is neither the target nor a categorical predictor.
numeric = bank_churn1.drop(columns=['Attrition_Flag'] + categorical_columns)

In [None]:
#concatenating dummy and numeric variables
churn_concat = pd.concat([dummies, numeric], axis=1)

In [None]:
churn_concat.columns

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `numeric`.
import seaborn as sns
import matplotlib.pyplot as plt

corr = numeric.corr()
f, ax = plt.subplots(figsize=(10, 8))

# The colour scale is centred on 0 and capped at 0.3.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='PRGn',
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr, ax=ax, **heatmap_options)

plt.show()

### Logistic Regression with SMOTE-NC

In [None]:
#importing libraries for logistic regression
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Predictor matrix (x) and target vector (y) taken from the combined frame.
education_dummies = [
    'Education_Level_Doctorate', 'Education_Level_Graduate', 'Education_Level_High School',
    'Education_Level_Post-Graduate', 'Education_Level_Uneducated', 'Education_Level_Unknown',
]
income_dummies = [
    'Income_Category_$40K - $60K', 'Income_Category_$60K - $80K', 'Income_Category_$80K - $120K',
    'Income_Category_Less than $40K', 'Income_Category_Unknown',
]
card_dummies = ['Card_Category_Gold', 'Card_Category_Silver']
other_predictors = [
    'Dependent_count', 'Total_Relationship_Count',
    'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit',
    'Total_Revolving_Bal', 'Total_Trans_Amt', 'Avg_Utilization_Ratio',
]

# Column order matters: later cells address predictors by position.
x = churn_concat[education_dummies + income_dummies + card_dummies + other_predictors]

# Left out because of high VIF: 'Customer_Age', 'Total_Trans_Ct', 'Months_on_book',
# 'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1', 'Avg_Open_To_Buy', 'Card_Category_Platinum'

y = churn_concat['Attrition_Flag']

In [None]:
# Variance inflation factor for each predictor column of x.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# NOTE(review): x carries no constant column here; confirm that is intended
# for the VIF calculation.
design_matrix = x.values
vif_data = pd.DataFrame({
    "feature": x.columns,
    "VIF": [variance_inflation_factor(design_matrix, column_index)
            for column_index in range(design_matrix.shape[1])],
})
print(vif_data)

In [None]:
# Hold out 30% of the rows as a test set; random_state=1 makes the shuffle repeatable.
split_parts = train_test_split(x, y, test_size=0.3, random_state=1)
x_train, x_test, y_train, y_test = split_parts

In [None]:
#utilizing SMOTE-NC to perform oversampling on training data 
#pip install -U imbalanced-learn
from imblearn.over_sampling import SMOTENC

In [None]:
# Oversample the training split only, using SMOTE-NC.
# categorical_features takes 0-based column positions. The one-hot dummy columns
# are located by name prefix instead of a hard-coded list: the previous
# [1, ..., 13] was off by one, treating the first dummy column as continuous
# and 'Dependent_count' as categorical.
dummy_prefixes = ('Education_Level_', 'Income_Category_', 'Card_Category_')
categorical_positions = [
    position
    for position, column in enumerate(x_train.columns)
    if column.startswith(dummy_prefixes)
]

smotenc = SMOTENC(categorical_features=categorical_positions, random_state=1)
X_oversample, y_oversample = smotenc.fit_resample(x_train, y_train)

In [None]:
# Fit a statsmodels logit on the oversampled training data and print its summary.
# sm.Logit does not add an intercept on its own, so a constant column is added
# explicitly. The dummies were built with drop_first=True, i.e. each variable has
# a reference level that only an intercept can represent.
X_design = sm.add_constant(X_oversample)
lmodel = sm.Logit(y_oversample, X_design)
result = lmodel.fit()
print(result.summary())

In [None]:
#intercept
result = LogisticRegression().fit(X_oversample, y_oversample)
print(result.intercept_)

In [None]:
#odds ratio
np.exp(result.coef_)

In [None]:
# Train on the oversampled data, then evaluate on the untouched test split.
logreg = LogisticRegression(random_state=1)
logreg.fit(X_oversample, y_oversample)

# Hard class predictions, reused by the evaluation cells that follow.
y_pred = logreg.predict(x_test)

test_accuracy = logreg.score(x_test, y_test)
print('Logistic regression accuracy: {:.2f}'.format(test_accuracy))

In [None]:
#confusion matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

In [None]:
#confusion matrix (heat map)
royal_blue_palette = sns.light_palette("royalblue", as_cmap=True)
sns.heatmap(confusion_matrix, annot = True, fmt='d', xticklabels = ['No', 'Yes'], yticklabels = ['No', 'Yes'], cmap=royal_blue_palette)
plt.ylabel("True label", fontsize = 12)
plt.xlabel("Predicted label", fontsize = 12)

In [None]:
# Text report of the per-class metrics for the test-set predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# ROC curve and AUC on the test split.
# Both are computed from the predicted probability of class 1. Feeding hard 0/1
# predictions to roc_auc_score gives the area of a single-threshold curve, which
# does not match the curve plotted below.
y_score = logreg.predict_proba(x_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: a classifier with no discriminative power.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate - (1 - Specificity)')
plt.ylabel('True Positive Rate - Sensitivity')
plt.title('Receiver operating characteristic, the ROC Curve')
plt.legend(loc="lower right")
# Save before show(), while the figure is still the current one.
plt.savefig('Log_ROC')
plt.show()