In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import classification_report, log_loss

In [None]:
# Apply seaborn's 'darkgrid' style to the plots drawn below.
sns.set(style='darkgrid')

In [None]:
# Load the raw customer table from the CSV in the notebook's input directory.
df = pd.read_csv('../input/credit-card-customers/BankChurners.csv')

# ***DATA DESCRIPTION***

In [None]:
# Preview five randomly chosen rows (no seed is set, so the rows differ between runs).
df.sample(5)

In [None]:
# (number of rows, number of columns) of the raw table.
df.shape

In [None]:
# Summary statistics of the columns pandas treats as numeric.
df.describe()

In [None]:
# Drop columns that must not be used as features: the client identifier and the two
# 'Naive_Bayes_Classifier_*' columns (presumably outputs of an earlier model trained
# on the same target -- confirm against the dataset description).
# Reassigning instead of using inplace=True, together with errors='ignore', keeps the
# cell idempotent: re-running it no longer raises KeyError on columns already removed.
df = df.drop(
    columns=[
        'CLIENTNUM',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    ],
    errors='ignore',
)

In [None]:
# Column names, dtypes and non-null counts after dropping the unused columns.
df.info()

In [None]:
# Visual check for missing values: the heatmap shows the boolean mask df.isna().
sns.heatmap(df.isna())

In [None]:
# Target feature is 'Attrition_Flag': 'Attrited Customer' is a churn (1) and
# 'Existing Customer' is no churn (0).
# The dtype guard makes the cell safe to re-run: mapping the already-encoded 0/1
# values a second time would find no matching key and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(df['Attrition_Flag']):
    df['Attrition_Flag'] = df['Attrition_Flag'].map({'Attrited Customer': 1, 'Existing Customer': 0})

# ***Numerical and categorical***

In [None]:
# Independent copy of the cleaned frame, used for the numerical/categorical split.
df1 = df.copy()

In [None]:
# Inspect the dtype of each column before splitting by type.
df1.dtypes

In [None]:
# Split the columns into numerical features (int64/float64) and categorical features.
# The encoded target 'Attrition_Flag' is numeric as well, so it is removed by name.
# The previous positional slice (iloc[:, 1:15]) assumed the target was the first of
# exactly 15 numeric columns and would silently drop or leak a column otherwise.
df_num = (
    df1.select_dtypes(include=['int64', 'float64'])
    .drop(columns='Attrition_Flag', errors='ignore')
)
df_cat = df1.select_dtypes(exclude=['int64', 'float64'])

In [None]:
# Sanity check: rows and number of numerical feature columns.
df_num.shape

In [None]:
# One random row of the categorical columns.
df_cat.sample()

In [None]:
# One random row of the numerical columns.
df_num.sample()

# ***EXPLORATORY DATA ANALYSIS (EDA)***

In [None]:
# Separate copy used for the exploratory plots (and later as the source of the target).
df2 = df1.copy()

In [None]:
# First rows of the frame used for the exploratory analysis.
df2.head()

## ***TARGET VARIABLE***



In [None]:
plt.figure(figsize=(10, 5))
g = sns.countplot(data=df2, x='Attrition_Flag');
# Write each bar's count in white, centred horizontally at half the bar's height.
for bar in g.patches:
    count = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2.
    g.text(x_center, count / 2, f'{count}', ha="center", color='white')
plt.title('Count values to target variable')



*   As we can see, the target classes are imbalanced.



## ***NUMERICAL VARIABLE***

In [None]:
# One 30-bin histogram per numerical column; the trailing ';' hides the axes repr.
df_num.hist( bins=30, figsize=(15,15) );

## ***CATEGORICAL VARIABLE***



In [None]:
# One random row of the categorical columns, as a reminder of the available fields.
df_cat.sample()

In [None]:
# Shorten the lowest income bracket label; every other value passes through unchanged.
df_cat['Income_Category'] = df_cat['Income_Category'].apply(
    lambda label: {'Less than $40K': '< $40k'}.get(label, label)
)

In [None]:
plt.figure(figsize=(15, 7))

g = sns.countplot(data=df_cat, x='Gender');
# Write each bar's count in white, centred horizontally at half the bar's height.
for bar in g.patches:
    count = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2.
    g.text(x_center, count / 2, f'{count}', ha="center", color='white')

In [None]:
plt.figure(figsize=(15, 7))
g = sns.countplot(data=df_cat, x='Education_Level');
# Write each bar's count in white, centred horizontally at half the bar's height.
for bar in g.patches:
    count = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2.
    g.text(x_center, count / 2, f'{count}', ha="center", color='white')

In [None]:
plt.figure(figsize=(15, 7))
g = sns.countplot(data=df_cat, x='Marital_Status');
# Write each bar's count in white, centred horizontally at half the bar's height.
for bar in g.patches:
    count = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2.
    g.text(x_center, count / 2, f'{count}', ha="center", color='white')

In [None]:
plt.figure(figsize=(15, 5))
g = sns.countplot(data=df_cat, x='Income_Category');
# Write each bar's count in white, centred horizontally at half the bar's height.
for bar in g.patches:
    count = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2.
    g.text(x_center, count / 2, f'{count}', ha="center", color='white')

In [None]:
plt.figure(figsize=(15, 5))
plt.title('Number of Customers Inactive - Months_Inactive_12_mon')
g = sns.countplot(data=df2, x='Months_Inactive_12_mon')
# Write each bar's count in white, centred horizontally at half the bar's height.
for bar in g.patches:
    count = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2.
    g.text(x_center, count / 2, f'{count}', ha="center", color='white')

In [None]:
# Horizontal box plot of customer age, one box per gender.
plt.figure(figsize=(15, 5))
sns.boxplot(y=df_cat['Gender'], x=df_num['Customer_Age']);
plt.xlabel('Age')

In [None]:
# One random row of the analysis frame.
df2.sample()

## ***MULTIVARIATE ANALYSIS***

Pearson's coefficient ranges from -1 to 1. So:

r = 1 means a perfect positive correlation between the two variables.

r = -1 means a perfect negative correlation between the two variables – that is, if one increases, the other always decreases.

r = 0 means that the two variables do not depend linearly on each other. However, there may still be a non-linear dependency, so a result of r = 0 must be investigated by other means.

In [None]:
# Pearson correlation matrix of the numeric columns only.
# df2 still contains the string-valued categorical columns; since pandas 2.0
# DataFrame.corr no longer drops non-numeric columns silently and raises on them,
# so the numeric columns are selected explicitly (this works on older pandas too).
correlations = df2.select_dtypes(include='number').corr(method='pearson')

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicit 15x15 inch axes.
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(data=correlations, annot=True, ax=ax);

In [None]:
# One random row of the analysis frame.
df2.sample()

In [None]:
# Number of customers per age, split by churn flag.
plt.figure(figsize=(20, 7))
sns.countplot(data=df2, x='Customer_Age', hue='Attrition_Flag');
plt.xlabel('Age');

**False**

In [None]:
plt.figure(figsize=(20, 7))
ax = sns.countplot(data=df2, x='Months_Inactive_12_mon', hue='Attrition_Flag');
# Label every bar with its height, anchored at the bar's top-left corner with both
# coordinates scaled by 1.005.
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(str(count), (bar.get_x() * 1.005, count * 1.005))
plt.xlabel('Months inactive');

In [None]:
# Shorten the lowest income bracket label in the analysis frame; other values are kept.
df2['Income_Category'] = df2['Income_Category'].apply(
    lambda bracket: {'Less than $40K': '< 40K'}.get(bracket, bracket)
)

In [None]:
plt.figure(figsize=(20, 7))
ax = sns.countplot(data=df2, x='Income_Category', hue='Attrition_Flag');
# Label every bar with its height, anchored at the bar's top-left corner with both
# coordinates scaled by 1.005.
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(str(count), (bar.get_x() * 1.005, count * 1.005))
plt.xlabel('FAIXA SALARIAL');

In [None]:
# One random row of the analysis frame.
df2.sample()

In [None]:
# Number of customers per contact count, split by churn flag.
plt.figure(figsize=(20, 7))
ax = sns.countplot(data=df2, x='Contacts_Count_12_mon', hue='Attrition_Flag');
plt.xlabel('Contacts');

In [None]:
plt.figure(figsize=(20, 7))
# The card category comes from df_cat, while the hue vector is taken from df2.
ax = sns.countplot(data=df_cat, x='Card_Category', hue=df2['Attrition_Flag']);
# Label every bar with its height, anchored at the bar's top-left corner with both
# coordinates scaled by 1.005.
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(str(count), (bar.get_x() * 1.005, count * 1.005))

# **DATA PRE-PROCESSING**

In [None]:
# One random row of the categorical columns before encoding.
df_cat.sample()

In [None]:
# Encode the categorical columns as dummy/indicator columns.
df_cat_dummies = pd.get_dummies(df_cat)

In [None]:
# Inspect the generated indicator columns.
df_cat_dummies.head()

In [None]:
# Feature table: numerical columns and indicator columns side by side (axis=1 aligns on the index).
df3 = pd.concat([df_num, df_cat_dummies], axis=1)

In [None]:
# Feature matrix (a copy of df3) and target vector (the 0/1 churn flag taken from df2).
X = df3.copy()
y = df2['Attrition_Flag']

In [None]:
# Hold out 30% of the rows for testing.
# stratify=y keeps the churn ratio the same in both splits (the target is imbalanced),
# and a fixed random_state makes the split, and every metric computed from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# **LOGISTIC REGRESSION**

In [None]:
# Logistic regression with scikit-learn's default hyper-parameters.
# NOTE(review): the features are not scaled before fitting; with the default iteration
# limit the solver may emit a ConvergenceWarning -- consider scaling or raising max_iter.
modelLR = LogisticRegression()

In [None]:
# Fit on the training split; the trailing ';' hides the estimator repr.
modelLR.fit(X_train, y_train);

In [None]:
# Predicted class labels for the held-out test split.
predictLR = modelLR.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred). The arguments were previously swapped,
# which exchanges precision with recall in the report and counts support over the
# predictions instead of the true labels.
print('Logistic Regression: \n', classification_report(y_test, predictLR))
print('Logistic Regression Accuracy: ', accuracy_score(y_test, predictLR))

In [None]:
# Error metrics, called in scikit-learn's (y_true, y_pred) argument order for
# consistency with the rest of the API (the values are unchanged: both are symmetric).
# With 0/1 labels, MAE and MSE both equal the share of misclassified rows.
print('MAE:', mean_absolute_error(y_test, predictLR))
print('MSE:', mean_squared_error(y_test, predictLR))
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictLR)))

In [None]:
# Build the confusion matrix (rows: true labels, columns: predicted labels).
cm = confusion_matrix(y_test, predictLR)
# fmt='d' prints the cells as integer counts instead of scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

# **RANDOM FOREST CLASSIFIER**

In [None]:
# Random forest with default hyper-parameters; the fixed random_state makes the
# bootstrap samples and feature sub-sampling, and therefore the scores, reproducible.
modelRF = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the training split.
modelRF.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out test split.
predictRF = modelRF.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred). The arguments were previously swapped,
# which exchanges precision with recall in the report and counts support over the
# predictions instead of the true labels.
print('Random Forest Classifier : \n', classification_report(y_test, predictRF))
print('Random Forest Classifier Accuracy: ', accuracy_score(y_test, predictRF))

In [None]:
# Error metrics, called in scikit-learn's (y_true, y_pred) argument order for
# consistency with the rest of the API (the values are unchanged: both are symmetric).
# With 0/1 labels, MAE and MSE both equal the share of misclassified rows.
print('MAE:', mean_absolute_error(y_test, predictRF))
print('MSE:', mean_squared_error(y_test, predictRF))
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictRF)))

In [None]:
# Build the confusion matrix (rows: true labels, columns: predicted labels).
cm = confusion_matrix(y_test, predictRF)
# fmt='d' prints the cells as integer counts instead of scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

# **GRADIENT BOOSTING CLASSIFIER**

In [None]:
# Gradient boosting with default hyper-parameters; the fixed random_state makes the
# fitted model, and therefore the scores and feature importances, reproducible.
modelGB = GradientBoostingClassifier(random_state=42)

In [None]:
# Fit the gradient boosting model on the training split.
modelGB.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out test split.
predictGB = modelGB.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred). The arguments were previously swapped,
# which exchanges precision with recall in the report and counts support over the
# predictions instead of the true labels.
print('Gradient Boost Classifier : \n', classification_report(y_test, predictGB))
print('Gradient Boost Classifier Accuracy: ', accuracy_score(y_test, predictGB))

In [None]:
# Error metrics, called in scikit-learn's (y_true, y_pred) argument order for
# consistency with the rest of the API (the values are unchanged: both are symmetric).
# With 0/1 labels, MAE and MSE both equal the share of misclassified rows.
print('MAE:', mean_absolute_error(y_test, predictGB))
print('MSE:', mean_squared_error(y_test, predictGB))
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictGB)))

In [None]:
# Build the confusion matrix (rows: true labels, columns: predicted labels).
cm = confusion_matrix(y_test, predictGB)
# fmt='d' prints the cells as integer counts instead of scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# Rank the features by gradient boosting importance and chart the ten largest,
# sorted ascending so the most important bar ends up at the top of the horizontal plot.
plt.figure(figsize=(15, 7))
feature_list = pd.Series(
    data=modelGB.feature_importances_,
    index=X_train.columns,
).sort_values(ascending=False)
feature_list.nlargest(10).sort_values().plot.barh()
plt.show()