# The Data Source Bank
------



In [2]:
# Imports and file upload
import pandas as pd
import numpy as np

# Read the raw transactions export into the working DataFrame.
input_path = "/content/PD 2023 Wk 1 Input.csv"
df = pd.read_csv(input_path)

## Initial exploration
-----

In [3]:
# Preview the first rows (head() defaults to five) to sanity-check the load.
df.head()

Unnamed: 0,Transaction Code,Value,Customer Code,Online or In-Person,Transaction Date
0,DTB-716-679-576,1448,100001,2,20/03/2023 00:00:00
1,DS-795-814-303,7839,100001,2,15/11/2023 00:00:00
2,DSB-807-592-406,5520,100005,1,14/07/2023 00:00:00
3,DS-367-545-264,7957,100007,2,18/08/2023 00:00:00
4,DSB-474-374-857,5375,100000,2,26/08/2023 00:00:00


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(365, 5)

In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,Value,Customer Code,Online or In-Person
count,365.0,365.0,365.0
mean,4938.813699,100004.961644,1.528767
std,2940.973461,3.277237,0.499857
min,13.0,100000.0,1.0
25%,2321.0,100002.0,1.0
50%,4743.0,100005.0,2.0
75%,7427.0,100008.0,2.0
max,9967.0,100010.0,2.0


Notes:
- 365 records and 5 columns in this CSV
- Average transaction value is about 4,939, roughly 200 more than the median (4,743)
- Standard deviation is quite large
- Not all bank codes are the same length
- Online / in person is discrete not continuous


In [6]:
# Column dtypes: shows which fields were read as integers and which as text (object).
df.dtypes

Transaction Code       object
Value                   int64
Customer Code           int64
Online or In-Person     int64
Transaction Date       object
dtype: object

In [7]:
# Count the missing values in each column.
df.isna().sum()

Transaction Code       0
Value                  0
Customer Code          0
Online or In-Person    0
Transaction Date       0
dtype: int64

## Task 1: Split Transaction Code
-----
- Split the Transaction Code to extract the letters at the start of the transaction code. These identify the bank that processes the transaction
- Rename the new field with the Bank code 'Bank'


In [9]:
# Split each code once, at the first hyphen: column 0 holds the leading
# letters (the bank prefix) and column 1 holds the remainder of the code.
newcols = df['Transaction Code'].str.split(pat="-", n=1, expand=True)
newcols

Unnamed: 0,0,1
0,DTB,716-679-576
1,DS,795-814-303
2,DSB,807-592-406
3,DS,367-545-264
4,DSB,474-374-857
...,...,...
360,DTB,116-439-102
361,DS,849-981-514
362,DS,726-686-279
363,DS,551-937-380


In [10]:
# Add the bank prefix as a new 'Bank' column (appended at the end) and keep
# only the remainder of the code in the existing 'Transaction Code' column.
df = df.assign(**{"Bank": newcols[0], "Transaction Code": newcols[1]})
df

Unnamed: 0,Transaction Code,Value,Customer Code,Online or In-Person,Transaction Date,Bank
0,716-679-576,1448,100001,2,20/03/2023 00:00:00,DTB
1,795-814-303,7839,100001,2,15/11/2023 00:00:00,DS
2,807-592-406,5520,100005,1,14/07/2023 00:00:00,DSB
3,367-545-264,7957,100007,2,18/08/2023 00:00:00,DS
4,474-374-857,5375,100000,2,26/08/2023 00:00:00,DSB
...,...,...,...,...,...,...
360,116-439-102,6708,100001,1,29/01/2023 00:00:00,DTB
361,849-981-514,8500,100000,2,29/10/2023 00:00:00,DS
362,726-686-279,9455,100006,2,10/08/2023 00:00:00,DS
363,551-937-380,475,100002,1,11/10/2023 00:00:00,DS


## Task 2: Rename Online/In-Person
-----
- Rename the values in the Online or In-Person field: Online for the 1 values and In-Person for the 2 values.

To do this task we can use the map function on the specific column.
We use a dictionary for the mapping with the key being the original value and the value being the new/ updated version.

In [11]:
# Relabel the numeric channel codes (1 -> "Online", 2 -> "In-Person").
# Values that are not keys of the lookup are passed through unchanged, so
# re-running this cell on an already-relabelled column leaves it intact.
# A plain dict-based .map() would instead turn every unmatched value into NaN.
channel_labels = {1: "Online", 2: "In-Person"}
df["Online or In-Person"] = df["Online or In-Person"].map(
    lambda code: channel_labels.get(code, code)
)
df

Unnamed: 0,Transaction Code,Value,Customer Code,Online or In-Person,Transaction Date,Bank
0,716-679-576,1448,100001,In-Person,20/03/2023 00:00:00,DTB
1,795-814-303,7839,100001,In-Person,15/11/2023 00:00:00,DS
2,807-592-406,5520,100005,Online,14/07/2023 00:00:00,DSB
3,367-545-264,7957,100007,In-Person,18/08/2023 00:00:00,DS
4,474-374-857,5375,100000,In-Person,26/08/2023 00:00:00,DSB
...,...,...,...,...,...,...
360,116-439-102,6708,100001,Online,29/01/2023 00:00:00,DTB
361,849-981-514,8500,100000,In-Person,29/10/2023 00:00:00,DS
362,726-686-279,9455,100006,In-Person,10/08/2023 00:00:00,DS
363,551-937-380,475,100002,Online,11/10/2023 00:00:00,DS


## Task 3: Changing to day of the week
-----
- Change the date to be the day of the week

I did this by the following steps:

- Splitting the date column on the space, as the time value is not required.
- Putting the first of the new columns created by the split in place of the Transaction Date column.
- Converting that column to a datetime datatype and using `dt.day_name()` to turn the dates into day names.

In [15]:
# Separate the date from the time at the first space; column 0 is the
# "dd/mm/YYYY" text, which is all that is needed here.
datecols = df['Transaction Date'].str.split(" ", n=1, expand=True)

# Parse the date text with an explicit day-first format and store it back
# in the 'Transaction Date' column.
df['Transaction Date'] = pd.to_datetime(datecols[0], format="%d/%m/%Y")
df

Unnamed: 0,Transaction Code,Value,Customer Code,Online or In-Person,Transaction Date,Bank
0,716-679-576,1448,100001,In-Person,2023-03-20,DTB
1,795-814-303,7839,100001,In-Person,2023-11-15,DS
2,807-592-406,5520,100005,Online,2023-07-14,DSB
3,367-545-264,7957,100007,In-Person,2023-08-18,DS
4,474-374-857,5375,100000,In-Person,2023-08-26,DSB
...,...,...,...,...,...,...
360,116-439-102,6708,100001,Online,2023-01-29,DTB
361,849-981-514,8500,100000,In-Person,2023-10-29,DS
362,726-686-279,9455,100006,In-Person,2023-08-10,DS
363,551-937-380,475,100002,Online,2023-10-11,DS


In [16]:
# Replace each parsed date with the name of its weekday (e.g. "Monday").
# Requires the column to hold datetimes, i.e. the parsing cell must have run first.
df['Transaction Date'] = df['Transaction Date'].dt.day_name()

In [17]:
# Display the frame to confirm 'Transaction Date' now holds weekday names.
df

Unnamed: 0,Transaction Code,Value,Customer Code,Online or In-Person,Transaction Date,Bank
0,716-679-576,1448,100001,In-Person,Monday,DTB
1,795-814-303,7839,100001,In-Person,Wednesday,DS
2,807-592-406,5520,100005,Online,Friday,DSB
3,367-545-264,7957,100007,In-Person,Friday,DS
4,474-374-857,5375,100000,In-Person,Saturday,DSB
...,...,...,...,...,...,...
360,116-439-102,6708,100001,Online,Sunday,DTB
361,849-981-514,8500,100000,In-Person,Sunday,DS
362,726-686-279,9455,100006,In-Person,Thursday,DS
363,551-937-380,475,100002,Online,Wednesday,DS


## Final Outputs
------

### Total Values of Transactions by each bank

Must contain 2 data fields and 3 rows of data.

In [26]:
# Total transaction value per bank.
# 'Value' is selected before aggregating so that only that column is summed.
# Calling .sum() on the whole grouped frame also aggregates the text columns,
# which is what pandas warns about (presumably the numeric_only deprecation;
# confirm against the installed pandas version).
# The result is unchanged: a single 'Value' column indexed by 'Bank'.
# Append .reset_index() if 'Bank' is needed as a regular column.
output1 = df.groupby(['Bank'])[['Value']].sum()
output1

  output1 = df.groupby(['Bank']).sum().filter(['Value'])


Unnamed: 0_level_0,Value
Bank,Unnamed: 1_level_1
DS,653940
DSB,530489
DTB,618238


### Total Values by Bank, Day of the Week and Type of Transaction

In [34]:
# Total transaction value per bank, weekday and transaction type.
# 'Value' is selected before aggregating so that only that column is summed.
# Calling .sum() on the whole grouped frame also aggregates the remaining
# columns, which is what pandas warns about (presumably the numeric_only
# deprecation; confirm against the installed pandas version).
# The result is unchanged: a single 'Value' column with the three grouping
# keys as a MultiIndex. Append .reset_index() to get them as regular columns.
output2 = df.groupby(['Bank', 'Transaction Date', 'Online or In-Person'])[['Value']].sum()
output2


  output2 = df.groupby(['Bank', 'Transaction Date', 'Online or In-Person']).sum().filter(['Value'])


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Value
Bank,Transaction Date,Online or In-Person,Unnamed: 3_level_1
DS,Friday,In-Person,58599
DS,Friday,Online,58731
DS,Monday,In-Person,42806
DS,Monday,Online,33563
DS,Saturday,In-Person,34867
DS,Saturday,Online,71357
DS,Sunday,In-Person,51301
DS,Sunday,Online,21761
DS,Thursday,In-Person,75582
DS,Thursday,Online,13337


In [35]:
# (rows, columns) of the summary; the grouping keys are in the index, not counted as columns.
output2.shape

(42, 1)

### Total Values by Bank and Customer Code
-----

In [38]:
# Total transaction value per bank and customer.
# 'Value' is selected before aggregating so that only that column is summed.
# Calling .sum() on the whole grouped frame also aggregates the text columns,
# which is what pandas warns about (presumably the numeric_only deprecation;
# confirm against the installed pandas version).
# The result is unchanged: a single 'Value' column indexed by
# ('Bank', 'Customer Code'). Append .reset_index() to get regular columns.
output3 = df.groupby(['Bank', 'Customer Code'])[['Value']].sum()
output3

  output3 = df.groupby(['Bank', 'Customer Code']).sum().filter(['Value'])


Unnamed: 0_level_0,Unnamed: 1_level_0,Value
Bank,Customer Code,Unnamed: 2_level_1
DS,100000,57909
DS,100001,53063
DS,100002,69803
DS,100003,25482
DS,100004,63315
DS,100005,39668
DS,100006,77636
DS,100007,76190
DS,100008,56400
DS,100009,56581


In [39]:
# (rows, columns) of the summary; the grouping keys are in the index, not counted as columns.
output3.shape

(33, 1)