### 2 Clean Data

In [2]:
import pandas as pd
import numpy as np
import statistics


In [3]:
# Load the raw transactions and keep only the rows that have no missing value.
bt = pd.read_csv("bank_transactions.csv").dropna()

In [4]:
# Convert the two date columns to datetime values.
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the
# layout of the date strings — confirm it matches the CSV.
bt["CustomerDOB"] = pd.to_datetime(bt["CustomerDOB"])
bt["TransactionDate"] = pd.to_datetime(bt["TransactionDate"])

In [5]:
def zeros(x):
    """Left-pad the string ``x`` with ``'0'`` characters up to a width of 6.

    Strings that already contain six or more characters are returned unchanged.
    """
    return x.rjust(6, "0")

In [6]:
# Turn each raw time value into a string, zero-pad it to six characters with
# `zeros`, insert the colons (HH:MM:SS) and parse the result with that format.
bt["TransactionTime"] = pd.to_datetime(
    bt["TransactionTime"]
    .map(str)
    .map(zeros)
    .map(lambda hhmmss: f"{hhmmss[:2]}:{hhmmss[2:4]}:{hhmmss[4:]}"),
    format="%H:%M:%S",
)

In [7]:
# Birth dates whose year is after 2000 are moved back by exactly one century.
# NOTE(review): presumably these are two-digit years parsed into the wrong
# century — confirm against the raw CSV.
dob_after_2000 = bt["CustomerDOB"].dt.year > 2000
bt.loc[dob_after_2000, "CustomerDOB"] = (
    bt.loc[dob_after_2000, "CustomerDOB"] - pd.DateOffset(years=100)
)

# Remove every row whose birth year is 1800.
# NOTE(review): looks like a placeholder date in the source data — confirm.
bt.drop(index=bt.loc[bt["CustomerDOB"].dt.year == 1800].index, inplace=True)

In [8]:
# Calculate the customer age in years (rounded to the nearest integer),
# measured against today's date.
# One year is taken as 365.2425 days (31,556,952 seconds), the same duration
# np.timedelta64(1, 'Y') used to be converted to. Dividing by the 'Y' unit
# itself is rejected by pandas 2.x, which no longer accepts year/month
# timedelta units because they are not fixed durations.
# NOTE(review): using 'today' makes the ages depend on the day the notebook
# is run — confirm this is intended.
bt['CustomerAge'] = (
    ((pd.to_datetime('today') - bt.CustomerDOB) / pd.Timedelta(seconds=31556952))
    .round(0)
    .astype(int)
)

In [9]:
# Store the transaction amounts as integers (a float column loses its
# fractional part in this cast).
bt["TransactionAmount (INR)"] = bt["TransactionAmount (INR)"].astype(int)
# Utilisation: account balance minus the integer transaction amount.
bt["Utilisation"] = bt["CustAccountBalance"].sub(bt["TransactionAmount (INR)"])

In [10]:
# Persist the cleaned transactions DataFrame to "bank.pkl".
bt.to_pickle("bank.pkl")

In [11]:
# Reload the transactions DataFrame from "bank.pkl".
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that were produced by this notebook.
bt = pd.read_pickle("bank.pkl")


### 2.1 Getting your data + feature engineering

In [12]:
# Per-customer feature table: one row per CustomerID, one column per feature.
# It starts empty; the first assigned Series provides the CustomerID index.
Features = pd.DataFrame()

# Utilisation = difference between the account balance and the transaction amount.
pd.options.mode.chained_assignment = None      # Avoid the red warning banner
bt["Utilisation"] = bt["CustAccountBalance"] - bt["TransactionAmount (INR)"]

# Group the transactions by customer once and reuse the grouping below.
# Each feature is taken as a Series (a single selected column), so the
# assignment does not rely on a one-column DataFrame being coerced.
by_customer = bt.groupby("CustomerID")

# Number of transactions
Features["Num_of_Transaction"] = by_customer["TransactionID"].count()

# Number of transactions made with an account balance above 100
# (vectorised: the boolean flags are summed per customer).
# NOTE(review): the column name says USD while the amount column is labelled
# INR — confirm the currency of CustAccountBalance.
Features["Balance>100_USD"] = (
    bt["CustAccountBalance"].gt(100).groupby(bt["CustomerID"]).sum()
)

# Average amount of the transactions
# (.mean() instead of agg(np.mean), which emits a FutureWarning on pandas >= 2.1)
Features["Average_Amount"] = by_customer["TransactionAmount (INR)"].mean()

# Average balance
Features["Average_Balance"] = by_customer["CustAccountBalance"].mean()

# Average utilisation
Features["Utilisation"] = by_customer["Utilisation"].mean()

# Gender of the customer, taken from the customer's last row
Features["gender"] = by_customer["CustGender"].last()

# Most frequent location of the customer
Features["Most_Frequent_Location"] = by_customer["CustLocation"].agg(statistics.mode)


In [13]:
# Display the feature table built so far (one row per CustomerID).
Features

Unnamed: 0_level_0,Num_of_Transaction,Balance>100_USD,Average_Amount,Average_Balance,Utilisation,gender,Most_Frequent_Location
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C1010011,2,2,2553.0,76340.635,73787.635,M,NOIDA
C1010012,1,1,1499.0,24204.490,22705.490,M,MUMBAI
C1010014,2,2,727.5,100112.950,99385.450,M,MUMBAI
C1010018,1,1,30.0,496.180,466.180,F,CHAMPARAN
C1010024,1,1,5000.0,87058.650,82058.650,M,KOLKATA
...,...,...,...,...,...,...,...
C9099836,1,1,691.0,133067.230,132376.230,M,BHIWANDI
C9099877,1,1,222.0,96063.460,95841.460,M,BANGALORE
C9099919,1,1,126.0,5559.750,5433.750,M,GUNTUR
C9099941,1,1,50.0,35295.920,35245.920,M,CHENNAI


### Adding 20 additional features

* 1 Customer age
* 2 Maximum transaction amount of the customer
* 3 Minimum transaction amount of the customer
* 4 Important customer (defined as an Average_Balance above the 90th percentile of the average balances of all customers)
* 5 Teenager (defined as a customer born less than 20 years before the last registered transaction)


In [14]:
# Age of each customer, taken from the last row recorded for that customer.
Features["Costumer_age"] = bt.groupby("CustomerID")["CustomerAge"].last()

In [16]:
# Largest single transaction amount of each customer.
Features["Max_transaction"] = bt.groupby("CustomerID")["TransactionAmount (INR)"].max()

In [17]:
# Smallest single transaction amount of each customer.
Features["Min_transaction"] = bt.groupby("CustomerID")["TransactionAmount (INR)"].min()

In [29]:
# Flag the customers whose average balance lies above the 90th percentile
# of the average balances of all customers.
balance_p90 = Features["Average_Balance"].quantile(0.9)
Features["Important_costumer"] = Features["Average_Balance"].gt(balance_p90)

In [152]:
# Cut-off date: 20 years before the most recent transaction in the data.
# Series.max() with a DateOffset replaces datetime.datetime(...): the datetime
# module is not imported in the notebook's import cell, the built-in max()
# walked the whole column three times, and rebuilding the date from its parts
# fails when the shifted day does not exist (29 February).
d1 = bt["TransactionDate"].max().normalize() - pd.DateOffset(years=20)

# A customer born after the cut-off was younger than 20 at the last transaction.
bt["teen"] = bt["CustomerDOB"] > d1
Features["teen_ager"] = bt.groupby("CustomerID")["teen"].last()



In [161]:
# Show only the customers flagged as teenagers.
Features.loc[Features["teen_ager"].eq(True)]

Unnamed: 0_level_0,Num_of_Transaction,Balance>100_USD,Average_Amount,Average_Balance,Utilisation,gender,Most_Frequent_Location,Costumer_age,Max_transaction,Min_transaction,Important_costumer,teen_ager
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
C1010039,1,1,915.0,11027.180,10112.180,M,CHANDIGARH,24,915,915,False,True
C1010129,2,2,513.0,21771.040,21258.040,M,KOLKATA,25,1000,26,False,True
C1010486,1,1,56.0,499.750,443.750,M,BOISAR,25,56,56,False,True
C1010626,1,1,1000.0,41880.910,40880.910,F,VISAKHAPATNAM,25,1000,1000,False,True
C1010666,2,2,498.5,23165.925,22667.425,F,NANDURBAR,25,500,497,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
C9068860,1,0,189.0,15.010,-173.990,M,NALANDA,26,189,189,False,True
C9071451,1,1,500.0,35218.020,34718.020,M,THANE,25,500,500,False,True
C9081450,1,1,223.0,1394.880,1171.880,M,SILIGURI,25,223,223,False,True
C9081614,1,1,2623.0,8676.890,6053.890,F,WOKHA,25,2623,2623,False,True
