In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

In [2]:
# NOTE: this is an absolute, machine-specific location; point it at your own
# copy of train.csv before running the notebook on another machine.
TRAIN_CSV_PATH = r'C:\Users\matte\OneDrive\Desktop\GitHub\data\kaggle bank chrun\train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

## Exploratory Data Analysis

Description Of Features
These are all the columns as I understand them.

- ID columns
    - **id:** Unique identifier.
    - **CustomerId:** Customer identifier.
    - **Surname:** Customer last name.

- Categorical features
    - **Geography:** Customer location.
    - **Gender:** Gender of customer.
    - **HasCrCard:** Whether or not the customer has a credit card with the bank.
    - **IsActiveMember:** Whether or not the customer has active transactions with the bank.

- Numerical features
    - **CreditScore:** Measure of how reliable a customer is as a borrower.
    - **Age:** Age of customer.
    - **Tenure:** Amount of time customer has been doing business with the bank.
    - **Balance:** Amount of cash the customer has with the bank.
    - **NumOfProducts:** Number of products purchased by the customer.
    - **EstimatedSalary:** Rough salary of the customer.
    
- Target variable (the value to predict)
    - **Exited:** Whether or not the customer has left the bank.

In [3]:
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


### Categorical Values

In [4]:
def get_perc(x, name, df_tot):
    """Return a row's share of its group's total count.

    Parameters
    ----------
    x : mapping-like row
        Must support ``x['Count']`` and ``x[name]``.
    name : hashable
        Key of ``x`` whose value identifies the group the row belongs to.
    df_tot : mapping-like
        Per-group totals, looked up as ``df_tot[x[name]]``.

    Returns
    -------
    The row's ``'Count'`` divided by the total of its group.
    """
    row_count = x['Count']
    group_total = df_tot[x[name]]
    return row_count / group_total

def get_label(x, var1):
    """Build a display label from the row's ``var1`` value and churn status.

    A row whose ``x['Exited']`` equals 0 is labelled ``'<value> Current C.'``;
    any other value is labelled ``'<value> Exited'``.
    """
    churned = not (x['Exited'] == 0)
    return f'{x[var1]} Exited' if churned else f'{x[var1]} Current C.'
    

def df_for_plotting(var1, var2, data=None):
    """Summarise customers by ``var1`` and ``var2`` for the categorical bar charts.

    Parameters
    ----------
    var1 : str
        Column to break the customers down by (e.g. ``'Geography'``).
    var2 : str
        Second grouping column. The ``Label`` column is derived from the
        ``'Exited'`` column of the summary, so the summary must contain an
        ``'Exited'`` column (in practice ``var2='Exited'``).
    data : pandas.DataFrame, optional
        Frame to summarise. When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per observed ``(var1, var2)`` combination with the columns
        ``var1``, ``var2``, ``'Count'``, ``f'{var1}2'``, ``'Perc'`` and ``'Label'``.
    """
    source = df if data is None else data

    # size() counts every row in a group. Counting one particular column
    # (e.g. 'Age') would silently skip rows where that column is missing.
    df_plot = source.groupby([var1, var2]).size().reset_index(name='Count')

    # Single string key combining var1 and var2, so each bar has a unique,
    # easy-to-plot category. The trailing 'a' is deliberate: per the original
    # author's note, plotly misbehaves when the key is made of numbers only.
    df_plot[f'{var1}2'] = df_plot[var1].astype(str) + df_plot[var2].astype(str) + 'a'

    # Share of each row within its var1 group (rows of one var1 value sum to 1).
    group_totals = df_plot.groupby(var1)['Count'].transform('sum')
    df_plot['Perc'] = df_plot['Count'] / group_totals

    # Human-readable label: "<var1 value> Current C." when Exited == 0,
    # "<var1 value> Exited" otherwise.
    status = (df_plot['Exited'] == 0).map({True: 'Current C.', False: 'Exited'})
    df_plot['Label'] = df_plot[var1].astype(str) + ' ' + status

    return df_plot

In [79]:
def plot_cat_var(df, var1, xtitle, ytitle, labels=None):
    """Show a horizontal bar chart of counts with the in-group share on each bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary frame with the columns ``'Count'``, ``'Perc'``, ``'Label'``
        and ``var1``.
    var1 : str
        Column holding the category key of each bar (used on the y axis).
    xtitle, ytitle : str
        Axis titles.
    labels : sequence of str, optional
        Tick labels, paired with the rows of ``df`` in order. When omitted
        (or empty), ``df['Label']`` is used.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    n_bars = df.shape[0]
    palette = ['#7FD4C1', '#F7C0BB']

    fig = go.Figure(go.Bar(
        x=df['Count'][::-1],  # reversed so the bars are shown in a good order
        y=df[var1][::-1],
        orientation='h',
        # Alternate the two colours across all bars; cycling (rather than
        # repeating the pair n // 2 times) also covers an odd number of rows.
        marker_color=[palette[i % 2] for i in range(n_bars)],
    ))

    # Percentage text centred at half the bar length. Values are taken by
    # position, so the frame does not need a 0..n-1 index.
    half_counts = df['Count'] // 2
    perc_text = (df['Perc'] * 100).round(1).astype(str) + '%'
    annotations = [
        dict(xref='x', yref='y',
             x=x_mid, y=key,
             text=text,
             font=dict(family='Arial Black', size=14, color='white'),
             showarrow=False)
        for x_mid, key, text in zip(half_counts, df[var1], perc_text)
    ]

    # Explicit None/empty check: `not labels` is ambiguous for numpy arrays.
    if labels is None or len(labels) == 0:
        labels = df['Label'].values

    fig.update_layout(
        annotations=annotations,
        xaxis_title=dict(text=xtitle, font=dict(size=20, color='black')),
        yaxis_title=dict(text=ytitle, font=dict(size=20, color='black')),

        autosize=False,
        width=960,
        height=540,
        margin=dict(l=20, r=20, t=60, b=20),
        # Replace the raw category keys on the y axis with readable labels.
        yaxis=dict(
            tickmode='array',
            tickvals=df[var1],
            ticktext=labels,
        ),
    )

    fig.show()

In [80]:
# Tick labels, one per row of the grouped frame, in row order.
# Assumes HasCrCard 0 means "no card" and rows come out sorted by
# (HasCrCard, Exited) -- TODO confirm against group_has_c.
has_c_labels = ['No Card<br>Current C.', 'No Card<br>Exited', 'Has Card<br>Current C.', 'Has Card<br>Exited']

group_has_c = df_for_plotting('HasCrCard', 'Exited')
plot_cat_var(group_has_c, 'HasCrCard2', 'Count','Customer Type',has_c_labels)

In [83]:
# Tick labels, one per (Geography, Exited) row of the grouped frame.
geo_labels = [
    'Current French<br>Customers',
    'Exited French<br>Customers',
    'Current German<br>Customers',
    'Exited German<br>Customers',
    'Current Spanish<br>Customers',
    'Exited Spanish<br>Customers',
]

group_geo = df_for_plotting('Geography', 'Exited')
plot_cat_var(
    group_geo,
    'Geography2',
    xtitle='Count',
    ytitle='Customer Nationality',
    labels=geo_labels,
)

In [85]:
# Tick labels, one per (Gender, Exited) row of the grouped frame.
gender_labels = [
    'Current Female<br>Customers',
    'Exited Female<br>Customers',
    'Current Male<br>Customers',
    'Exited Male<br>Customers',
]

group_gender = df_for_plotting('Gender', 'Exited')
plot_cat_var(
    group_gender,
    'Gender2',
    xtitle='Count',
    ytitle='Customer Gender',
    labels=gender_labels,
)

### Numerical Values

In [106]:
def plot_num_var(df, var1, annotat_position, xtitle,
                 round=1, colors=('rgba(26,150,65,0.5)', 'rgba(250,150,65,0.5)')):
    """Show filled density curves of ``var1`` for the two ``Exited`` groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``var1`` and an ``'Exited'`` column.
    var1 : str
        Numerical column whose distribution is plotted.
    annotat_position : list of tuples
        One ``(label, x_position, y_position, color)`` tuple per text
        annotation placed on the chart.
    xtitle : str
        Title of the x axis.
    round : int, default 1
        Number of decimals in the y-axis percentage tick format.
        (The name shadows the builtin; it is kept so existing
        ``round=...`` calls continue to work.)
    colors : sequence of two colour strings
        ``colors[0]`` styles the ``Exited == 0`` group and ``colors[1]``
        the ``Exited == 1`` group.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Missing values are dropped so they are not fed into the density estimate.
    current_values = df.loc[df['Exited'] == 0, var1].dropna().tolist()
    exited_values = df.loc[df['Exited'] == 1, var1].dropna().tolist()

    fig = ff.create_distplot([current_values, exited_values], ['Group1', 'Group2'],
                             show_hist=False, show_rug=False,
                             colors=[colors[0], colors[1]])

    # Re-draw each density curve as an area filled down to zero, in the same
    # colour as the curve. The slice is taken before any trace is added.
    for curve, fill in zip(fig.data[:2], (colors[0], colors[1])):
        fig.add_scatter(x=curve.x, y=curve.y, fill='tozeroy', mode='none',
                        fillcolor=fill)

    annotations = []
    for entry in annotat_position:
        # Only the first four fields are used: label, x, y, colour.
        text, x_pos, y_pos, text_color = entry[:4]
        annotations.append(dict(xref='x', yref='y',
                                x=x_pos,
                                y=y_pos,
                                text=text,
                                font=dict(family='Arial Black', size=14,
                                          color=text_color),
                                showarrow=False))

    fig.update_yaxes(
        tickformat=f'.{round}%',
    )

    fig.update_layout(
        showlegend=False,
        annotations=annotations,
        xaxis_title=dict(text=xtitle, font=dict(size=20, color='black')),
        yaxis_title=dict(text='Density', font=dict(size=20, color='black')),

        autosize=False,
        width=960,
        height=540,
        margin=dict(l=20, r=20, t=60, b=20),
    )

    fig.show()

In [107]:
# Each annotation is (label, x_position, y_position, color).
cred_labels = [
    ('Current Clients', 725, 0.0055, 'rgba(26,150,65,0.5)'),
    ('Former Clients', 780, 0.0032, 'rgba(250,150,65,0.5)'),
]

plot_num_var(df, 'CreditScore', annotat_position=cred_labels, xtitle='Credit Score')

In [108]:
# Each annotation is (label, x_position, y_position, color).
age_labels = [
    ('Current Clients', 45, 0.05, 'rgba(26,150,65,0.5)'),
    ('Former Clients', 55, 0.035, 'rgba(250,150,65,0.5)'),
]

plot_num_var(df, 'Age', annotat_position=age_labels, xtitle='Age')

In [114]:
# Each annotation is (label, x_position, y_position, color).
# plot_num_var always draws the Exited == 0 group (current clients) in
# colors[0] and the Exited == 1 group (former clients) in colors[1], so every
# label's text and colour must follow that mapping. The default colours are
# used here (green = current, orange = former), matching the other charts.
balance_labels = [('Current Clients', 25000, 0.00003, 'rgba(26,150,65,0.5)'),
                  ('Former Clients', 125000, 0.000013, 'rgba(250,150,65,0.5)')]

plot_num_var(df, 'Balance', balance_labels, 'Balance', round=4)