### Importing the required packages

In [None]:
# Packages / libraries
import os #provides functions for interacting with the operating system
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline

# To install scikit-learn, run "pip install numpy scipy scikit-learn" in the Anaconda terminal

# Print NumPy floats in fixed-point notation instead of scientific notation
np.set_printoptions(
    formatter={'float_kind': '{:f}'.format},
)

# Default figure size for seaborn plots
sns.set(rc={'figure.figsize': (10, 8)})


# Suppress all warnings from this point on
warnings.filterwarnings(action="ignore")


### Load the raw data from csv file

In [None]:

# Read the raw records from the CSV file (path is relative to the working directory)
raw_data = pd.read_csv("BankChurners.csv")

# Report (rows, columns) as loaded, before any column is dropped
print(raw_data.shape)

# Discard the two 'Naive_Bayes_Classifier_...' columns
raw_data = raw_data.drop(
    columns=[
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    ]
)

# Display the resulting frame
raw_data

In [None]:
# Show the dtype of every column
raw_data.dtypes

In [None]:
# Inspect a single row, selected by index label 10125
raw_data.loc[10125]

### Data Preprocessing
#### Exploratory Data Analysis & Data Cleaning

In [None]:
# Percentage of rows whose Attrition_Flag is 'Attrited Customer'
churn_rate = (
    100
    * len(raw_data[raw_data['Attrition_Flag'] == 'Attrited Customer'])
    / len(raw_data)
)
churn_rate

In [None]:
# Investigate all the elements within each feature:
# print how many distinct values each column has and, for columns with
# fewer than 10 distinct values, the sorted values themselves.

for column in raw_data:
    # Drop missing values before sorting. np.unique() sorts its input and
    # raises TypeError on an object column that mixes strings with NaN, and
    # this cell runs before the null check. As a result, missing values are
    # not counted as a distinct value here.
    unique_vals = np.sort(raw_data[column].dropna().unique())
    nr_values = len(unique_vals)
    if nr_values < 10:
        print('The number of values for feature {} :{} -- {}'.format(column, nr_values,unique_vals))
    else:
        print('The number of values for feature {} :{}'.format(column, nr_values))

In [None]:
# Show the dtype of every column
raw_data.dtypes

In [None]:
# Count the missing values in each column

raw_data.isna().sum()

In [None]:
# List all column names currently in raw_data

raw_data.columns

In [None]:
# Columns of interest for the correlation check.
# NOTE(review): this list is defined but not applied below; the correlation
# is computed over every numeric column of raw_data.
n_variables = [ 'Attrition_Flag', 'Customer_Age', 'Gender',
       'Dependent_count', 'Education_Level', 'Marital_Status','Income_Category', 'Card_Category', 'Months_on_book',
    'Total_Relationship_Count', 'Months_Inactive_12_mon','Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal','Avg_Open_To_Buy', 
               'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt','Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

# Pearson (linear) correlation matrix.
# Restrict to numeric columns first: in pandas >= 2.0, DataFrame.corr()
# defaults to numeric_only=False and raises ValueError on string columns.
# select_dtypes works on older pandas versions as well.
pc = raw_data.select_dtypes(include='number').corr(method='pearson')
pc


In [None]:
# Draw one count plot per listed feature to see how its values are distributed

features = [
    'Attrition_Flag',
    'Gender',
    'Customer_Age',
    'Dependent_count',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
]

for f in features:
    sns.countplot(data=raw_data, x=f, palette='Set3')
    # Rotate the x tick labels by 45 degrees
    plt.xticks(rotation=45)
    plt.show()
    

### Feature Engineering - Numeric Representation


In [None]:
# Keep every column whose dtype is not 'object'
numerical = raw_data.select_dtypes(exclude=['object'])

In [None]:
# Keep only the columns whose dtype is 'object'
categorical = raw_data.select_dtypes(include=['object'])

In [None]:
# Choose the features: only the original numerical columns.
# This binds a second name to the same DataFrame; it is not a copy.
new_raw_data = numerical

In [None]:
# Report (rows, columns) of the selected numerical data
print(new_raw_data.shape)

#### Save the numerical data

In [None]:
# Write the numerical data to CSV without the index column.
# The header row is still written; only index=False is passed.
new_raw_data.to_csv(
    r"C:\Users\user\Desktop\AI projects\Churn-prediction/bank_model_data.csv",
    index=False,
)