In [4]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Load train.csv into `df`, the notebook-level frame that later cells read.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# will not run elsewhere as written — consider a path relative to a
# configurable data directory.
df = pd.read_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\kaggle bank chrun\train.csv')

## Exploratory Data Analysis

Description Of Features
The dataset's columns, described as I understand them:

- ID columns
    - **id:** Unique identifier.
    - **CustomerId:** Customer identifier.
    - **Surname:** Customer last name.

- Categorical features
    - **Geography:** Customer location.
    - **Gender:** Gender of customer.
    - **HasCrCard:** Whether or not the customer has a credit card with the bank.
    - **IsActiveMember:** Whether or not the customer has active transactions with the bank.

- Numerical features
    - **CreditScore:** Measure of how reliable a customer is as a borrower.
    - **Age:** Age of customer.
    - **Tenure:** Amount of time the customer has been doing business with the bank.
    - **Balance:** Amount of cash the customer has with the bank.
    - **NumOfProducts:** Number of products purchased by the customer.
    - **EstimatedSalary:** Rough salary of the customer.
    
- Target variable (the value to predict)
    - **Exited:** Whether or not the customer has left the bank.

In [3]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [99]:
def get_perc(x, name, df_tot):
    """Return a row's count as a fraction of its group's total.

    Parameters
    ----------
    x : mapping-like row
        Must provide ``x['Count']`` and ``x[name]``.
    name : str
        Key in ``x`` whose value identifies the row's group.
    df_tot : mapping-like
        Group totals, looked up by the value of ``x[name]``.

    Returns
    -------
    The result of ``x['Count'] / df_tot[x[name]]``.
    """
    row_count = x['Count']
    group_total = df_tot[x[name]]
    return row_count / group_total

def get_label(x, var1):
    """Build the display label for one row.

    The label is the row's ``var1`` value followed by ``'Current C.'`` when
    ``x['Exited']`` equals 0, and by ``'Exited'`` for any other value.

    Parameters
    ----------
    x : mapping-like row
        Must provide ``x['Exited']`` and ``x[var1]``.
    var1 : str
        Key in ``x`` whose value prefixes the label.

    Returns
    -------
    str
    """
    still_customer = x['Exited'] == 0
    return f'{x[var1]} Current C.' if still_customer else f'{x[var1]} Exited'
    

def df_for_plotting(var1, var2, data=None):
    """Summarise how many rows fall in each (var1, var2) group, ready to plot.

    Parameters
    ----------
    var1 : str
        Primary grouping column; ``'Perc'`` is computed within each of its
        values.
    var2 : str
        Secondary grouping column.  The labels are derived from the
        ``'Exited'`` column of the summary, so ``var2`` is expected to be
        ``'Exited'`` (any other value raises ``KeyError``, as before).
    data : pandas.DataFrame, optional
        Frame to summarise.  Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per (var1, var2) pair with the columns ``var1``, ``var2``,
        ``'Count'``, ``f'{var1}2'`` (both keys concatenated as strings),
        ``'Perc'`` (the row's share of its ``var1`` group's total count) and
        ``'Label'`` (``'<var1 value> Current C.'`` when ``Exited == 0``,
        otherwise ``'<var1 value> Exited'``).
    """
    if data is None:
        data = df  # fall back to the notebook-level frame

    # size() counts the rows of each group directly.  Counting one particular
    # column (previously 'Age') drops rows where that column is null and
    # breaks when that column is itself one of the grouping keys.
    df_plot = data.groupby([var1, var2]).size().reset_index(name='Count')

    # Single string key combining var1 and var2, so each bar has a unique
    # position when plotted.
    df_plot[f'{var1}2'] = df_plot[var1].astype(str) + df_plot[var2].astype(str)

    # Share of each row within its var1 group (vectorised form of get_perc).
    group_totals = df_plot.groupby(var1)['Count'].transform('sum')
    df_plot['Perc'] = df_plot['Count'] / group_totals

    # Human-readable tick label (vectorised form of get_label).
    status = df_plot['Exited'].eq(0).map({True: 'Current C.', False: 'Exited'})
    df_plot['Label'] = df_plot[var1].astype(str) + ' ' + status

    return df_plot

In [98]:
def plot_cat_var(df, var1, x_title='Count', y_title='Group'):
    """Show a horizontal bar chart of group counts annotated with percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary frame providing the columns ``'Count'`` (bar length),
        ``'Perc'`` (fraction shown as a percentage inside each bar),
        ``'Label'`` (tick text) and the column named by ``var1``.
        Note that this parameter shadows the notebook-level ``df``.
    var1 : str
        Column whose values position the bars on the y axis.
    x_title, y_title : str, optional
        Axis titles.  The defaults replace the previous 'Date' / '7 day avg'
        titles, which did not describe the plotted data.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    palette = ['#7FD4C1', '#F7C0BB']
    # Alternate the two colours bar by bar; cycling by position also covers
    # an odd number of rows, where repeating the pair n // 2 times fell short.
    bar_colors = [palette[i % len(palette)] for i in range(df.shape[0])]

    fig = go.Figure(go.Bar(
        x=df['Count'][::-1],  # rows are drawn in reverse so the bars appear in the intended order
        y=df[var1][::-1],
        orientation='h',
        marker_color=bar_colors,
    ))

    # One percentage annotation per bar, centred at half the bar's length.
    # Iterating over the column values (rather than df.loc[n, ...]) does not
    # require the frame to have a default 0..n-1 index.
    annotations = [
        dict(
            xref='x', yref='y',
            x=count // 2, y=position,
            text=f'{perc * 100:.1f}%',
            font=dict(family='Arial', size=14, color='black'),
            showarrow=False,
        )
        for count, position, perc in zip(df['Count'], df[var1], df['Perc'])
    ]

    fig.update_layout(
        xaxis_title=dict(text=x_title, font=dict(size=16, color='black')),
        yaxis_title=dict(text=y_title, font=dict(size=16, color='black')),
        annotations=annotations,
        # Replace the raw var1 values on the axis with the readable labels.
        yaxis=dict(
            tickmode='array',
            tickvals=df[var1],
            ticktext=df['Label'].values,
        ),
    )

    fig.show()

In [102]:
# Display the summary frame.
# NOTE(review): within the visible notebook, `group_geo` is first assigned in
# the following code cell, so a fresh top-to-bottom run would presumably raise
# NameError here — TODO move this cell below the one that creates it.
group_geo

Unnamed: 0,IsActiveMember,Exited,Count,IsActiveMember2,Perc,Label
0,0.0,0,58261,0.0,0.702914,0.0 Current C.
1,0.0,1,24624,0.01,0.297086,0.0 Exited
2,1.0,0,71852,1.0,0.874655,1.0 Current C.
3,1.0,1,10297,1.01,0.125345,1.0 Exited


In [101]:
# Count rows by IsActiveMember x Exited, then plot the breakdown using the
# combined 'IsActiveMember2' key for the bar positions.
# NOTE(review): despite its name, `group_geo` holds the IsActiveMember
# breakdown here, not a Geography one.
group_geo = df_for_plotting('IsActiveMember', 'Exited')
plot_cat_var(group_geo, 'IsActiveMember2')