# PA003: Churn Predict

# 0.0. Imports

In [55]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import inflection

from IPython.core.display          import HTML
from scipy.stats import shapiro, chi2_contingency



import warnings

warnings.filterwarnings("ignore")

## 0.1. Helper Functions

In [56]:
def my_settings():
    """Apply notebook-wide display and plotting defaults.

    Sets matplotlib style, figure size and font size, widens the notebook
    container via injected CSS, disables scientific notation in NumPy
    printing, formats pandas floats with 8 decimals, and configures the
    seaborn figure size and theme. Takes no arguments and returns None;
    it only mutates global library settings.
    """
    # IPython line magic: render matplotlib figures inline in the notebook.
    %matplotlib inline
    
    # matplotlib settings
    plt.style.use( 'ggplot' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 8
 
    # notebook settings
    # `display` is not imported in this file; presumably the builtin that
    # IPython injects, so this only works inside a notebook/IPython session.
    display(HTML('<style>.container{width:100% !important;}</style>'))
    np.set_printoptions(suppress=True)
    pd.set_option('display.float_format', '{:.8f}'.format)
    
    # seaborn settings
    # NOTE(review): these calls run after the matplotlib settings above and
    # may override the ggplot style and the font size set there - confirm.
    sns.set(rc={'figure.figsize':(25,12)})
    sns.set_theme(style = 'darkgrid', font_scale = 1)
# Apply the settings once, right after the definition.
my_settings()

In [57]:
 def numerical_descriptive_statistical(num_attributes, shapiro_sample_size=5000, random_state=42):
    """Build a per-column table of descriptive statistics.

    Parameters
    ----------
    num_attributes : pandas.DataFrame
        Frame containing only numerical columns.
    shapiro_sample_size : int, default 5000
        Maximum number of observations passed to the Shapiro-Wilk test.
        Columns with more observations are randomly down-sampled to this
        size; shorter columns are tested in full.
    random_state : int or None, default 42
        Seed used when down-sampling, so the normality label is
        reproducible between runs. Pass None for a fresh random sample.

    Returns
    -------
    pandas.DataFrame
        One row per input column with the columns: attributes, min, max,
        range, mean, median, std (population, ddof=0), skew, kurtosis and
        shapiro ('normal', 'not normal' or 'insufficient data').
    """
    def _normality_label(column):
        # Drop missing values so they cannot distort the test.
        values = column.dropna()
        # Shapiro-Wilk needs at least 3 observations.
        if len(values) < 3:
            return 'insufficient data'
        # Only down-sample when the column is larger than the cap; asking
        # `sample` for more rows than exist raises ValueError.
        if len(values) > shapiro_sample_size:
            values = values.sample(shapiro_sample_size, random_state=random_state)
        return 'not normal' if shapiro(values)[1] < 0.05 else 'normal'

    # Central Tendency - Mean, Median
    ct1 = pd.DataFrame(num_attributes.apply(np.mean)).T
    ct2 = pd.DataFrame(num_attributes.apply(np.median)).T

    # Dispersion - std, min, max, range, skew, kurtosis, Shapiro-Wilk Test
    # Population standard deviation (ddof=0), stated explicitly.
    d1 = pd.DataFrame(num_attributes.apply(lambda x: x.std(ddof=0))).T
    d2 = pd.DataFrame(num_attributes.apply(lambda x: x.min())).T
    d3 = pd.DataFrame(num_attributes.apply(lambda x: x.max())).T
    d4 = pd.DataFrame(num_attributes.apply(lambda x: x.max() - x.min())).T
    d5 = pd.DataFrame(num_attributes.apply(lambda x: x.skew())).T
    d6 = pd.DataFrame(num_attributes.apply(lambda x: x.kurtosis())).T
    d7 = pd.DataFrame(num_attributes.apply(_normality_label)).T

    # Stack the one-row frames, then transpose so each attribute is a row.
    m = pd.concat([d2, d3, d4, ct1, ct2, d1, d5, d6, d7]).T.reset_index()
    m.columns = ['attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis', 'shapiro']
    return m

## 0.2. Loading Data

In [58]:
# Load the raw churn dataset from a CSV under the user's home directory.
# NOTE(review): the path is hardcoded to one machine's directory layout;
# consider a configurable data directory.
df_raw = pd.read_csv(r'~/repositorio/churn_predict/data/raw/churn.csv')

In [59]:
# Preview the first rows of the raw data.
df_raw.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# 1.0. Data Description


In [60]:
# Work on a copy so df_raw itself is not modified by the steps below.
df1 = df_raw.copy()

In [61]:
# List the original column names before they are renamed.
df1.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

## 1.1 Rename Columns

In [62]:
# Convert the original CamelCase column names to snake_case.
old_columns = list(df1.columns)


def snakecase(x):
    """Return `x` converted to snake_case via `inflection.underscore`."""
    return inflection.underscore(x)


# Materialise as a list: a bare `map` object would be exhausted by the
# assignment below, leaving `new_columns` empty for any later inspection.
new_columns = list(map(snakecase, old_columns))

# rename columns
df1.columns = new_columns


## 1.2. Data Dimensions

In [63]:
# Report the dataset shape: row count first, then column count.
print(f'Numbers of rows: {df1.shape[0]}')
print(f'Numbers of cols: {df1.shape[1]}')

Numbers of rows: 10000
Numbers of cols: 14


## 1.3. Data Types

In [64]:
# Show the dtype of each column.
df1.dtypes

row_number            int64
customer_id           int64
surname              object
credit_score          int64
geography            object
gender               object
age                   int64
tenure                int64
balance             float64
num_of_products       int64
has_cr_card           int64
is_active_member      int64
estimated_salary    float64
exited                int64
dtype: object

### 1.3.1. Change Data Types

In [65]:
# Cast the two 0/1 flag columns to bool.
# `exited` is left as int64 (its bool conversion is disabled).
df1['has_cr_card'] = df1['has_cr_card'].astype('bool')
df1['is_active_member'] = df1['is_active_member'].astype('bool')

### 1.3.2. Check unique values

In [66]:
# Number of distinct values in each column.
df1.nunique()

row_number          10000
customer_id         10000
surname              2932
credit_score          460
geography               3
gender                  2
age                    70
tenure                 11
balance              6382
num_of_products         4
has_cr_card             2
is_active_member        2
estimated_salary     9999
exited                  2
dtype: int64

### 1.3.3. Remove Variables

In [67]:
# Drop the row number, surname and customer id columns.
cols_drop = ['row_number', 'surname', 'customer_id']
df1 = df1.drop(columns=cols_drop)

## 1.4. Check NA

In [68]:
# Count missing values per column.
df1.isna().sum()

credit_score        0
geography           0
gender              0
age                 0
tenure              0
balance             0
num_of_products     0
has_cr_card         0
is_active_member    0
estimated_salary    0
exited              0
dtype: int64

## 1.5. Descriptive Statistics

In [69]:
# Split columns by dtype: int64/float64 columns are treated as numerical,
# everything else except bool as categorical.
# Note: bool columns are excluded from both groups by these filters.
num_attributes = df1.select_dtypes(include=['int64', 'float64'])
cat_attributes = df1.select_dtypes(exclude=['int64', 'float64','bool'])

### 1.5.1 Numerical Attributes


In [70]:
# Build and display the descriptive-statistics table for the numerical columns.
m = numerical_descriptive_statistical(num_attributes)
m

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis,shapiro
0,credit_score,350.0,850.0,500.0,650.5288,652.0,96.64846595,-0.07160661,-0.42572568,not normal
1,age,18.0,92.0,74.0,38.9218,37.0,10.48728205,1.01132026,1.39534706,not normal
2,tenure,0.0,10.0,10.0,5.0128,5.0,2.89202976,0.01099146,-1.16522523,not normal
3,balance,0.0,250898.09,250898.09,76485.889288,97198.54,62394.28525413,-0.14110871,-1.48941177,not normal
4,num_of_products,1.0,4.0,3.0,1.5302,1.0,0.58162527,0.74556789,0.58298076,not normal
5,estimated_salary,11.58,199992.48,199980.9,100090.239881,100193.915,57507.61722117,0.00208536,-1.18151845,not normal
6,exited,0.0,1.0,1.0,0.2037,0.0,0.40274845,1.47161066,0.16567104,not normal


# 2.0. Feature Engineering


In [71]:
# Start this section from a copy of df1 so df1 stays unchanged.
df2 = df1.copy()

# 3.0. Data Filtering


In [72]:
# Start this section from a copy of df2 so df2 stays unchanged.
df3 = df2.copy()

# 4.0. Exploratory Data Analysis (EDA)


In [73]:
# Start this section from a copy of df3 so df3 stays unchanged.
df4 = df3.copy()

# 5.0. Data Preparation


In [74]:
# Start this section from a copy of df4 so df4 stays unchanged.
df5 = df4.copy()

# 6.0. Feature Selection


In [75]:
# Start this section from a copy of df5 so df5 stays unchanged.
df6 = df5.copy()

# 7.0. Machine Learning Modelling


In [76]:
# Start this section from a copy of df6 so df6 stays unchanged.
df7 = df6.copy()

# 8.0. Performance Metrics


In [77]:
# Start this section from a copy of df7 so df7 stays unchanged.
df8 = df7.copy()

# 9.0. Deploy to Production


In [78]:
# Start this section from a copy of df8 so df8 stays unchanged.
df9 = df8.copy()