In [None]:
import pandas as pd
import numpy as np
import datetime
from datetime import date, timedelta
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots 
# Default renderer used when a figure is shown in this notebook.
pio.renderers.default='notebook'
# Default template applied to every figure created below.
pio.templates.default = "plotly_white"
import ipywidgets as widgets

In [None]:
# Load the raw customer records from the CSV file into a DataFrame.
customer = pd.read_csv(filepath_or_buffer='BankCustomersUK.csv')

In [None]:
# Print the frame summary, then preview the first five rows.
customer.info()
customer.head(n=5)

Right away, I see some issues with this dataset.

Firstly, the dtype of the date column is wrong.

Secondly, some columns are irrelevant to the analysis and will need to be deleted.

We will also need to check for null values. Apart from that, the data looks pretty neat and doesn't need much cleaning.

In [None]:
# Convert 'Date Joined' from text to a datetime column.
# dayfirst=True makes ambiguous numeric dates (e.g. 05/01/2015) resolve to
# day/month order instead of pandas' month-first default.
# NOTE(review): assumes the CSV stores dates day-first (UK data) - confirm against the raw file.
customer['Date Joined'] = pd.to_datetime(customer['Date Joined'], dayfirst=True)
customer['Date Joined']

In [None]:
# Re-print the frame summary to confirm that 'Date Joined' now has a datetime dtype.
customer.info()

In [None]:
# Shorten the 'Job Classification' header to 'Job Class', then show one row to check it.
customer = customer.rename(columns={"Job Classification": "Job Class"})
customer.head(n=1)

In [None]:
# Delete the identifier and name columns, which are not used in the analysis.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone, and a plain drop would raise KeyError.
customer = customer.drop(columns=['Customer ID', 'Name', 'Surname'], errors='ignore')

In [None]:
# Show a single row to verify that the dropped columns are gone.
customer.head(n=1)

Sorting the rows by age in ascending order (youngest customers first)

In [None]:
# Order the rows by 'Age', smallest first.
customer = customer.sort_values(by='Age', ascending=True)

Let's look at the relationship between account balance and job class

In [None]:
# Total balance held by each job class (groups come back sorted by class name).
customer2 = customer.groupby('Job Class')['Balance'].sum()
fig = px.bar(
    customer2.reset_index(),
    x='Job Class',
    y='Balance',
    title='Account balance per job class',
)
# One colour per bar, applied in bar order.
fig.update_traces(marker_color=['blue', 'aquamarine', 'silver'])
fig.show('notebook')

Observations: 

1. White collar: White collar workers have the highest bank balances with a total value of over £78M. 
2. Blue collar: The blue collars have the next highest bank balances with over £41M.
3. Other: The other group has the lowest total account balances with a little above £40M.


Just because a group holds the largest total balance doesn't mean it is the largest group by headcount; it could simply contain wealthier customers. Is that the case here? Next, I'll use a donut chart to check.

In [None]:
# Number of customers in each job class, most frequent first.
Job = customer['Job Class'].value_counts()
label, counts = Job.index, Job.values
# Slice colours, matched to the labels by position.
colors = ['silver', 'blue', 'aquamarine']
fig = go.Figure(
    data=[
        go.Pie(
            labels=label,
            values=counts,
            hole=.6,  # turns the pie into a donut
            hoverinfo='label+value',
            textinfo='percent',
            textfont_size=30,
            marker=dict(colors=colors, line=dict(color='black', width=3)),
        )
    ]
)
# Chart title plus a caption centred in the donut hole.
fig.update_layout(
    title_text='Job Class distribution',
    annotations=[dict(text='Group: Job Class', x=0.5, y=0.5, font_size=20, showarrow=False)],
)
fig.show('notebook')

Observations:

1. White collar: The largest segment of account openers is the white-collar group, which makes up almost half of all the accounts opened that year (48.7%).
2. Blue collar: The blue-collar group has the second-largest share of account openers, with 26.1%.
3. Other: The "Other" group follows the blue-collar group closely, with a share of 25.2% (a difference of only 0.9 percentage points).

From the above, it is clear that the biggest customer base is the white-collar group, so it would make sense to design financial products that cater to that demographic, who are most likely salary earners.

In [None]:
# Total balance held in each region (groups come back sorted by region name).
region = customer.groupby('Region')['Balance'].sum()
fig = px.bar(
    region.reset_index(),
    x='Region',
    y='Balance',
    title='Account balance per region',
)
# One colour per bar, applied in bar order.
fig.update_traces(marker_color=['teal', 'lime', 'skyblue', 'cyan'])
fig.show('notebook')

Observations:

England has the highest total account balance (£84.83M), followed by Scotland (£44.4M), Wales (£22.04M) and Northern Ireland (£8.3M).

As noted above, just because a region holds the largest total balance doesn't mean it has the most customers. A donut chart of account counts per region will confirm this.


In [None]:
# Number of customers in each region, most frequent first.
Region = customer['Region'].value_counts()
label, counts = Region.index, Region.values
# Slice colours, matched to the labels by position.
colors = ['teal','skyblue', 'cyan', 'lime']
fig = go.Figure(
    data=[
        go.Pie(
            labels=label,
            values=counts,
            hole=.6,  # turns the pie into a donut
            hoverinfo='label+value',
            textinfo='percent',
            textfont_size=30,
            marker=dict(colors=colors, line=dict(color='black', width=3)),
        )
    ]
)
# Chart title plus a caption centred in the donut hole.
fig.update_layout(
    title_text='Regions',
    annotations=[dict(text='Group: Regions', x=0.5, y=0.5, font_size=20, showarrow=False)],
)
fig.show('notebook')

Observations:

The donut chart confirms that England is also the region with the most accounts opened, followed by Scotland, Wales and Northern Ireland.

Next, the relationship between age and account balance.

In [None]:
# Total balance held by customers of each age.
age = customer.groupby('Age')['Balance'].sum()
fig = px.bar(
    age.reset_index(),
    x='Age',
    y='Balance',
    color='Age',  # shade each bar by its age value
    title='Account balance per age',
)
fig.show('notebook')

Observations

1. We can see that the segment of customers with the highest account balances is the Age group ranging from the late 20's to the mid 40's. Ages 17 to 27 and ages 48 and above are the group segments with a low account balance. This information is useful in knowing which age range to focus marketing efforts on. It can also help in the research stage of a product design.   

Next, let's check out the gender distribution

In [None]:
# Number of customers of each gender, most frequent first.
Gender = customer['Gender'].value_counts()
label, counts = Gender.index, Gender.values
# Slice colours, matched to the labels by position.
colors = ['coral', 'cornsilk']
fig = go.Figure(
    data=[
        go.Pie(
            labels=label,
            values=counts,
            hole=.6,  # turns the pie into a donut
            hoverinfo='label+value',
            textinfo='percent',
            textfont_size=30,
            marker=dict(colors=colors, line=dict(color='black', width=3)),
        )
    ]
)
# Chart title plus a caption centred in the donut hole.
fig.update_layout(
    title_text='Gender distribution',
    annotations=[dict(text='Group: Gender', x=0.5, y=0.5, font_size=20, showarrow=False)],
)
fig.show('notebook')

The gender split is close to even, with only a 7.8 percentage-point difference between the two groups.

In [None]:
# Number of accounts opened on each joining date.
# size() counts rows per group; the previous ['Balance'].count() skipped rows
# whose Balance is missing and would under-report accounts on those dates.
date1 = (
    customer.groupby('Date Joined')
    .size()
    .rename('No. of accounts opened')
    .reset_index()
)
fig = px.line(date1, x= 'Date Joined', y='No. of accounts opened', title= 'Date joined distribution')
fig.show('notebook')

We can see that account opening was slow at the beginning of the year but gradually increased as the year progressed.