# EDA

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

DEBUG = False
SEED = 666

## Load all the datasets

### Accounts dataset

In [2]:
df_accounts = pd.read_csv(f"orig/accounts_train.csv")
print(df_accounts.shape)
df_accounts.head(5)

(25012, 2)


Unnamed: 0,Customer,Account
0,828247016,Investment
1,828247016,Current
2,828247016,Credit Card
3,828247016,On Demand Deposit
4,828247016,Mortgage


### Customers Dataset

In [3]:
df_customers = pd.read_csv(f"orig/customers_train.csv")
print(df_customers.shape)
df_customers.head(5)

(6577, 15)


Unnamed: 0,Customer,Churn,Card,Start_Date,Customer_Service_Calls,Credit_Limit,Total_Revolving_Balance,Average_Open_To_Buy,Average_Utilisation_Ratio,Age,Gender,Education,Marital_Status,Dependents,Income
0,797197508,No,Silver,2020-06-01,1,2315.0,1565,750.0,0.676,48.0,F,Graduate,Married,2,Less than €30K
1,812854728,No,Silver,2020-01-01,1,7645.0,2076,5569.0,0.272,58.0,F,Second level,Single,5,Less than €30K
2,768000743,No,Silver,2020-01-01,2,6394.0,0,6394.0,0.0,55.0,F,Second level,Single,2,Less than €30K
3,722161439,No,Silver,2020-08-01,3,4663.0,0,4663.0,0.0,41.0,F,Graduate,,2,€30K - €50K
4,759029725,No,Silver,2021-02-01,4,1879.0,1486,393.0,0.791,37.0,F,Second level,Single,1,Less than €30K


### Transactions Dataset

In [4]:
df_transaction = pd.read_csv(f"orig/transactions_train.csv")
print(df_transaction.shape)
df_transaction.head(5)

(514611, 3)


Unnamed: 0,Customer,Date,Amount
0,828247016,2022-03-01,70.94
1,828247016,2022-02-21,52.16
2,828247016,2022-02-19,67.03
3,828247016,2022-02-19,48.24
4,828247016,2022-05-20,10.97


## Merge Datasets

In [5]:
df = pd.merge(df_accounts, df_customers, on="Customer")
df = pd.merge(df, df_transaction, on="Customer")
df.head(5)

Unnamed: 0,Customer,Account,Churn,Card,Start_Date,Customer_Service_Calls,Credit_Limit,Total_Revolving_Balance,Average_Open_To_Buy,Average_Utilisation_Ratio,Age,Gender,Education,Marital_Status,Dependents,Income,Date,Amount
0,828247016,Investment,No,Silver,2019-05-01,2,8256.0,864,7392.0,0.105,56.0,F,Graduate,Single,5,Less than €30K,2022-03-01,70.94
1,828247016,Investment,No,Silver,2019-05-01,2,8256.0,864,7392.0,0.105,56.0,F,Graduate,Single,5,Less than €30K,2022-02-21,52.16
2,828247016,Investment,No,Silver,2019-05-01,2,8256.0,864,7392.0,0.105,56.0,F,Graduate,Single,5,Less than €30K,2022-02-19,67.03
3,828247016,Investment,No,Silver,2019-05-01,2,8256.0,864,7392.0,0.105,56.0,F,Graduate,Single,5,Less than €30K,2022-02-19,48.24
4,828247016,Investment,No,Silver,2019-05-01,2,8256.0,864,7392.0,0.105,56.0,F,Graduate,Single,5,Less than €30K,2022-05-20,10.97


## Clean

### Summarise dataset

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1572916 entries, 0 to 1572915
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Customer                   1572916 non-null  int64  
 1   Account                    1572916 non-null  object 
 2   Churn                      1568019 non-null  object 
 3   Card                       1572916 non-null  object 
 4   Start_Date                 1572916 non-null  object 
 5   Customer_Service_Calls     1572916 non-null  int64  
 6   Credit_Limit               1572916 non-null  float64
 7   Total_Revolving_Balance    1572916 non-null  int64  
 8   Average_Open_To_Buy        1572916 non-null  float64
 9   Average_Utilisation_Ratio  1572916 non-null  float64
 10  Age                        1508903 non-null  float64
 11  Gender                     1572916 non-null  object 
 12  Education                  1572916 non-null  object 
 13  Marital_Stat

#### Comments
Issues:
 - Account,Card,Gender,Education,Marital_Status,Income can be encoded to catgerical values
 - Churn (target) can be encoded to a boolean
 - Start_Date and Date can be encoded to DateTime type

### Encoding Object Values

#### All features with object data types

In [7]:
for c in df.columns:
    if df[c].dtype == "object":
        df[c] = pd.Categorical(df[c])

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1572916 entries, 0 to 1572915
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype   
---  ------                     --------------    -----   
 0   Customer                   1572916 non-null  int64   
 1   Account                    1572916 non-null  category
 2   Churn                      1568019 non-null  category
 3   Card                       1572916 non-null  category
 4   Start_Date                 1572916 non-null  category
 5   Customer_Service_Calls     1572916 non-null  int64   
 6   Credit_Limit               1572916 non-null  float64 
 7   Total_Revolving_Balance    1572916 non-null  int64   
 8   Average_Open_To_Buy        1572916 non-null  float64 
 9   Average_Utilisation_Ratio  1572916 non-null  float64 
 10  Age                        1508903 non-null  float64 
 11  Gender                     1572916 non-null  category
 12  Education                  1572916 non-null  category
 1

In [9]:
df.Account.unique()

['Investment', 'Current', 'Credit Card', 'On Demand Deposit', 'Mortgage', 'Joint', 'Deposit', 'Loan']
Categories (8, object): ['Credit Card', 'Current', 'Deposit', 'Investment', 'Joint', 'Loan', 'Mortgage', 'On Demand Deposit']

In [11]:
df.to_csv("./orig/data/churn.csv", index=False)