## Problem Statement

Build a machine learning model that classifies customers into high-revenue and low-revenue groups.

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle ## helps storing data in pickle files
%matplotlib inline

## Setting Display options 

In [None]:
# Lift pandas' truncation limits so wide/long frames are printed in full
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Warning Suppression 

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by the
# libraries used below - consider narrowing the filter.
warnings.filterwarnings('ignore')

## Import Data for ML Project

In [None]:
# Reading the data
# NOTE(review): absolute, user-specific path - the notebook only runs as-is on a machine
# where the CSV exists at exactly this location.
data = pd.read_csv('/Users/priyankac/Downloads/Projects/Existing_Base.csv')

In [None]:
# Check the first few records
data.head()

In [None]:
# Check the number of rows and columns
data.shape

In [None]:
# Check information about the data
data.info()

## Insight##
# Out of 32 columns, around 18 are of numeric type and 14 are of string type.
# There do not appear to be any missing values in the columns; however, we will verify this again later.

In [None]:
#### Visualizing numeric columns ######

# Plot every numeric column against the row index to eyeball outliers, skew and
# placeholder values.
numeric_cols = data.select_dtypes(include = np.number) ### selects numeric columns

column_names = list(numeric_cols.columns)

plot_cols = 3
# Derive the row count from the data instead of hard-coding 6, so the grid neither
# raises IndexError (fewer than 18 numeric columns) nor silently skips columns
# (more than 18). `-(-a // b)` is ceiling division.
plot_rows = max(1, -(-len(column_names) // plot_cols))

# squeeze=False keeps `ax` two-dimensional even when there is only one row.
fig, ax = plt.subplots(nrows = plot_rows, ncols = plot_cols, figsize = (20,20), squeeze = False)

col_index = 0
for axis in ax.flat:
    if col_index < len(column_names):
        axis.scatter(x = numeric_cols.index, y = numeric_cols[column_names[col_index]])
        axis.set_ylabel(column_names[col_index])
        col_index = col_index + 1
    else:
        # Hide panels left over when the column count is not a multiple of plot_cols.
        axis.set_visible(False)
        
###### Insights #####
#####################

## Ref Number column just contains the index so can be removed.
## year_last_moved column seem to have many values close to 0 or 0, these seem to be missing values, investigate them.
## Average_credit_card_transaction,Balance Transfer seems to be highly right skewed, check if the skewness could be reduced.
## A single customer with very high value of balance transfer/life insurance etc stands out, this row could be considered outlier and be removed,
## Capping could be done to limit the impact of outliers.
## Scaling could be done since the scale is different for most of the variables.

## Drop ID type of feature('REF_NO')

In [None]:
# REF_NO is only a row identifier, so it is removed from the frame
data = data.drop(columns = ['REF_NO'])

In [None]:
data.dtypes

## Label the Target Feature to 0/1

In [None]:
# Distribution of the Target Feature
data['Revenue Grid'].value_counts()

# There is class imbalance in my dataset(9069 customers are low revenue as compared to 1086 high revenue customers)
# The distribution is between 80%-20% and 95%-5% -  area of slight concern..


In [None]:
# Encode the target as 0/1: 'Revenue Grid' == 2 becomes 0, any other value becomes 1
data['target'] = (data['Revenue Grid'] != 2).astype(int)

## Drop the 'Revenue Grid' Feature to retain 'Target' Feature

In [None]:
# The raw label column is redundant once 'target' exists
data = data.drop(columns = ['Revenue Grid'])

In [None]:
data['target'].value_counts()


In [None]:
data.dtypes

## Defining the Target and Independent Features

In [None]:
# Separate the label (kept as a one-column DataFrame) from the predictors
X = data.drop(columns = ['target'])
Y = data['target'].to_frame()

In [None]:
X.shape

In [None]:
Y.shape

In [None]:
Y.mean()

## Split the features into Numerical and Categorical

In [None]:
# Partition the predictors by dtype: numeric columns vs. object (string) columns
char = X.select_dtypes(include = ['object'])
num = X.select_dtypes(include = [np.number])

In [None]:
num.shape


In [None]:
char.shape

In [None]:
num.head()

In [None]:
char.head()

## Check Distribution of Numeric Features

In [None]:
sns.distplot(data['Average Credit Card Transaction'],hist = False)

In [None]:
sns.distplot(data['Balance Transfer'],hist = False)

In [None]:
sns.distplot(data['Term Deposit'],hist = False)

In [None]:
sns.distplot(data['Life Insurance'],hist = False)

In [None]:
sns.distplot(data['Medical Insurance'],hist = False)

In [None]:
sns.distplot(data['Average A/C Balance'],hist = False)

In [None]:
sns.distplot(data['Personal Loan'],hist = False)

In [None]:
sns.distplot(data['Investment in Mutual Fund'],hist = False)

In [None]:
sns.distplot(data['Investment Tax Saving Bond'],hist = False)

In [None]:
sns.distplot(data['Home Loan'],hist = False)

In [None]:
sns.distplot(data['Online Purchase Amount'],hist = False)

In [None]:
sns.distplot(data['Portfolio Balance'],hist = False)

In [None]:
sns.distplot(data['Investment in Commudity'],hist = False)

In [None]:
sns.distplot(data['Investment in Equity'],hist = False)

In [None]:
sns.distplot(data['Investment in Derivative'],hist = False)

In [None]:
# All the numeric columns show skewness

In [None]:
# Count the distinct (non-null) values held by a column
def unique_levels(x):
    """Return how many distinct non-null values the Series ``x`` contains."""
    return x.nunique()

# One row per numeric feature holding its distinct-value count
df_value_counts = num.apply(unique_levels).to_frame()
df_value_counts

In [None]:
df_value_counts.columns = ['feature_levels']
df_value_counts

# It is observed that the number of feature levels in each feature is more than 25, hence no action is to be taken

## Outlier Analysis

In [None]:
num.describe(percentiles = [0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.9,0.99])

# There seems to be huge difference between the 99th percentile and maximum value in most of the features
# showing presence of outliers, Capping could be done to limit the impact of outliers.

## Flooring and Capping of Outliers

In [None]:
def outlier_cap(x, lower_q = 0.01, upper_q = 0.99):
    """Floor and cap a numeric Series at two of its own quantiles.

    Both bounds are taken from the original values of ``x``. Previously the upper
    bound was computed after the lower clip had already been applied, so the floor
    could shift the cap on very short series.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to winsorise.
    lower_q : float, default 0.01
        Quantile used as the floor.
    upper_q : float, default 0.99
        Quantile used as the cap.

    Returns
    -------
    pandas.Series
        Copy of ``x`` with values limited to ``[x.quantile(lower_q), x.quantile(upper_q)]``.
    """
    lower_bound = x.quantile(lower_q)
    upper_bound = x.quantile(upper_q)
    return x.clip(lower = lower_bound, upper = upper_bound)

In [None]:
# Winsorise every numeric column independently
num = num.apply(outlier_cap)

In [None]:
num.describe(percentiles = [0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.9,0.99])


## Missing Values Analysis

In [None]:
num.isnull().sum()

In [None]:
char.isnull().sum()

In [None]:
# There is no missing values in this dataset

## Feature Selection - Numerical Features

### Part 1 : Remove Features with 0 Variance

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit a zero-variance filter; only the fitted support mask is needed afterwards
varselector = VarianceThreshold(threshold = 0)
varselector.fit(num)

# Positions of the surviving columns, used to subset the original frame
cols = varselector.get_support(indices = True)
num_1 = num.iloc[:, cols]

In [None]:
num_1.iloc[0]

### Part 2 : Bi Variate Analysis(KBinsDiscretizer)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Bin every numeric feature into up to 10 quantile-based ordinal buckets
discrete = KBinsDiscretizer(n_bins = 10, encode = 'ordinal', strategy = 'quantile')
binned_values = discrete.fit_transform(num_1)
num_binned = pd.DataFrame(binned_values, index = num_1.index, columns = num_1.columns)
num_binned = num_binned.add_suffix('_Rank')
num_binned.head()

In [None]:
# Check whether each feature shows a slope at all.
# If it does, are some deciles below the population average and others above it?
# If so, the slope is strong.

# Conclusion: a strong slope indicates the feature can discriminate the event from
#             the non-event, making it a good predictor.

X_bin_combined = pd.concat([Y, num_binned], axis = 1, join = 'inner')

from numpy import mean

# Overall event rate, computed once and drawn as a flat reference line.
overall_event_rate = X_bin_combined['target'].mean()

for cols in (num_binned.columns):
    plt.figure()
    sns.barplot(x = cols, y = 'target', data = X_bin_combined, estimator = mean)
    # axhline spans the whole axis regardless of the bar positions and avoids running
    # sns.lineplot's per-x aggregation on a value that is constant by construction.
    plt.axhline(y = overall_event_rate, color = 'red')
plt.show()

In [None]:
# Same comparison, drawn with points for the overall rate and a line for the per-bin rate
for rank_feature in num_binned.columns:
    plt.figure()
    overall_rate = X_bin_combined['target'].mean()
    sns.scatterplot(data = X_bin_combined, x = rank_feature, y = overall_rate, color = 'red')
    sns.lineplot(data = X_bin_combined, x = rank_feature, y = 'target', estimator = mean)
plt.show()

In [None]:
# year_last_moved is excluded from the numeric feature set

num = num.drop(columns = ['year_last_moved'])

In [None]:
num.dtypes

In [None]:
# Keep every remaining numeric feature that also survived the zero-variance filter.
# `num` was never restricted to the VarianceThreshold result (`num_1`), so a constant
# column would otherwise slip back into the model input; intersect the two here.
# When no column was filtered out this is identical to using `num` directly.
select_features_df_num = num.loc[:, [col for col in num.columns if col in num_1.columns]]

In [None]:
select_features_df_num.shape

## Feature Selection - Categorical Features

### Part 1 : Bi Variate Analysis

In [None]:
X_char_merged = pd.concat([Y, char], axis = 1, join = 'inner')

from numpy import mean

# Overall event rate, computed once and drawn as a flat reference line.
overall_event_rate_char = X_char_merged['target'].mean()

for col in (char.columns):
    plt.figure()
    sns.barplot(x = col, y = 'target', data = X_char_merged, estimator = mean)
    # axhline replaces sns.lineplot for the constant reference: lineplot aggregates per
    # distinct x value, which is wasted work on columns with many levels, and it orders
    # the x axis independently of the bar plot.
    plt.axhline(y = overall_event_rate_char, color = 'red')
plt.show()

In [None]:
# Remove the categorical features whose bar plots show no visible slope
char = char.drop(columns = ['TVarea', 'post_code', 'post_area', 'region'])

In [None]:
char.dtypes

In [None]:
# Create dummy features with n-1 levels
X_char_dum = pd.get_dummies(char, drop_first = True)
X_char_dum.shape

### Part 2 : Select KBest

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Ask for at most 52 dummy features. A fixed k larger than the number of available
# columns is rejected (or warned about, depending on the scikit-learn version), so
# cap it at the width of X_char_dum.
k_best = min(52, X_char_dum.shape[1])
selector = SelectKBest(chi2, k = k_best)
# Y is a one-column DataFrame; pass the column itself so the target is 1-D.
selector.fit(X_char_dum, Y['target'])

# Get the columns to keep and create new dataframe with those only
cols = selector.get_support(indices = True)
select_features_df_char = X_char_dum.iloc[:, cols]

In [None]:
select_features_df_char.dtypes

## Creating the Master Feature Set for Model Development

In [None]:
# Side-by-side join of the selected categorical dummies and numeric features on shared row labels
selected_blocks = [select_features_df_char, select_features_df_num]
X_all = pd.concat(objs = selected_blocks, axis = 'columns', join = 'inner')

In [None]:
X_all.shape

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Stratify on the target so the minority-class share is the same in both partitions;
# with an imbalanced label a purely random 80/20 split can skew the event rate.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, Y, test_size = 0.2, random_state = 20, stratify = Y['target'])

In [None]:
# Report partition sizes and the share of target == 1 in each partition
print('Shape of Training data : ',X_train.shape)
print('Shape of Testing data : ',X_test.shape)  # message typo fixed ("pf" -> "of")
print('Revenue Rate in Training data : ',y_train.mean())
print('Revenue Rate in Testing data : ',y_test.mean())

## Model Building 

In [None]:
# We start by fitting a logistic regression model, which serves as the benchmark model.
# Since logistic regression does not have a lot of hyperparameters, no validation set is
# created for hyperparameter tuning.
# class_weight = 'balanced' is passed because the target is imbalanced.

from sklearn.linear_model import LogisticRegression
LR_model = LogisticRegression(class_weight = 'balanced',max_iter=200,random_state=20)  

In [None]:
LR_model

In [None]:
# Fitting the model
# y_train is a one-column DataFrame; pass the column so the estimator receives a 1-D
# target instead of a column vector.
LR_model.fit(X_train, y_train['target'])

In [None]:
# Score every row of X_all and attach the class-1 probability to `data`.
y_pred_prob = LR_model.predict_proba(X_all)[:, 1]
# Build the column as a Series indexed like X_all so each probability lands on its own
# row; wrapping the array in a DataFrame only lined up because both frames happened to
# share a default 0..n-1 index.
data['pred_prob_logreg'] = pd.Series(y_pred_prob, index = X_all.index)

In [None]:
y_pred_prob

In [None]:
prediction_train = LR_model.predict_proba(X_train)

In [None]:
pd.DataFrame(prediction_train)

In [None]:
pd.Series(LR_model.predict(X_train)) ### applies a threshold of 0.5

In [None]:
# Plotting the roc curve for the model fit 

from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score ## model evaluation metrics

# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2. Fall back to a
# thin wrapper around RocCurveDisplay so this cell and the later ones keep working.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay

    def plot_roc_curve(estimator, X, y, **kwargs):
        """Draw the ROC curve of a fitted ``estimator`` evaluated on ``(X, y)``."""
        return RocCurveDisplay.from_estimator(estimator, X, y, **kwargs)

plot_roc_curve(LR_model, X = X_train, y = y_train)

In [None]:
### Getting confusion matrix, F-score on the train data 

train_predictions = LR_model.predict(X_train)  # hard labels from the model's default decision rule

print(confusion_matrix(y_true = y_train, y_pred = train_predictions)) ### confusion matrix for pred on train set

# No sample_weight here: weighting by y_train gave every negative row a weight of 0,
# which hid all false positives and inflated the reported F1 score.
print ('The F1-SCORE on the train set prediction ',f1_score(y_true=y_train,y_pred = train_predictions))

In [None]:
# Let's evaluate the performance on the test set 


plot_roc_curve(LR_model,X =X_test, y= y_test)

predicted_val = LR_model.predict(X_test)

print (confusion_matrix(y_true = y_test, y_pred = predicted_val))

# No sample_weight here: weighting by y_test gave every negative row a weight of 0,
# which hid all false positives and inflated the reported F1 score.
print ('The F1-SCORE on the test set prediction ',f1_score(y_true=y_test,y_pred = predicted_val))

In [None]:
# Building a Decision Tree Model
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier(criterion = 'gini', random_state =20)

In [None]:
# Using GridSearchCV to find the best parameters

np.random.seed(44)
from sklearn.model_selection import GridSearchCV
param_dist = {'max_depth': [6,7,8,9], 'min_samples_split': [50, 100, 150, 200,250]}
# Score candidates with F1, the metric this notebook reports for every model. The default
# scorer (accuracy) is dominated by the majority class on an imbalanced target.
tree_grid = GridSearchCV(dtree, cv = 10, param_grid = param_dist, scoring = 'f1', n_jobs = -1)
# y_train is a one-column DataFrame; pass the column so the target is 1-D.
tree_grid.fit(X_train, y_train['target'])
print('Best parameters using grid search: \n', tree_grid.best_params_)

In [None]:
# Refit with the parameters the grid search actually selected instead of hand-copied
# values, so the model stays in sync if the data or the grid changes.
dtree = DecisionTreeClassifier(criterion = 'gini', random_state =20, **tree_grid.best_params_)
# y_train is a one-column DataFrame; pass the column so the target is 1-D.
dtree.fit(X_train, y_train['target'])

In [None]:
# Evaluating on the train and the test set 

predicted_train = dtree.predict(X_train)

plot_roc_curve(dtree,X =X_train, y= y_train)


print(confusion_matrix(y_true = y_train, y_pred = predicted_train))

# No sample_weight here: weighting by y_train gave every negative row a weight of 0,
# which hid all false positives and inflated the reported F1 score.
print ('The F1-SCORE on the train set prediction ',f1_score(y_true=y_train,y_pred = predicted_train))

In [None]:
# Decision tree performance on the held-out test set
plot_roc_curve(dtree,X =X_test, y= y_test)

predicted_test = dtree.predict(X_test)

print (confusion_matrix(y_true = y_test, y_pred = predicted_test))

# No sample_weight here: weighting by y_test gave every negative row a weight of 0,
# which hid all false positives and inflated the reported F1 score.
print ('The F1-SCORE on the test set prediction ',f1_score(y_true=y_test,y_pred = predicted_test))

In [None]:
# Calculating the feature importances 

importances = dtree.feature_importances_

columns = list(X_train.columns)

# Pair each feature name with its importance, then order from most to least important
ranked_pairs = sorted(zip(columns, importances), key = lambda pair: pair[1], reverse = True)

importances_dict = dict(ranked_pairs)

In [None]:
# Decision-tree feature importances as a one-column table, largest first
import pandas as pd
feature_importances = (
    pd.Series(dtree.feature_importances_, index = X_train.columns, name = 'importance')
    .sort_values(ascending = False)
    .to_frame()
)
feature_importances

In [None]:
from sklearn.tree import plot_tree

# A single large canvas so the node labels of the fitted tree stay legible
plot_rows = plot_cols = 1
fig, ax1 = plt.subplots(nrows = plot_rows, ncols = plot_cols, figsize = (20,20))
plot_tree(dtree, ax = ax1, fontsize = 10)
plt.show()

In [None]:
# Building Random Forest Model
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(criterion = 'gini', random_state = 20, max_depth = 6, min_samples_split = 50)
# y_train is a one-column DataFrame; pass the column so the estimator receives a 1-D
# target instead of a column vector.
rf.fit(X_train, y_train['target'])

In [None]:
# Evaluating on the train and the test set 

predicted_train = rf.predict(X_train)

plot_roc_curve(rf,X =X_train, y= y_train)

# ROC AUC is a ranking metric, so it is computed from the class-1 probability rather
# than from hard labels (which reduce the curve to a single operating point).
# The message previously said "DT model" although the estimator is the random forest.
print ('The score for the RF model ', roc_auc_score(y_train, rf.predict_proba(X_train)[:, 1]))

print(confusion_matrix(y_true = y_train, y_pred = predicted_train))

# No sample_weight here: weighting by y_train gave every negative row a weight of 0,
# which hid all false positives and inflated the reported F1 score.
print ('The F1-SCORE on the train set prediction ',f1_score(y_true=y_train,y_pred = predicted_train))

In [None]:
# Random forest performance on the held-out test set
plot_roc_curve(rf,X =X_test, y= y_test)

predicted_test = rf.predict(X_test)

print (confusion_matrix(y_true = y_test, y_pred = predicted_test))

# No sample_weight here: weighting by y_test gave every negative row a weight of 0,
# which hid all false positives and inflated the reported F1 score.
print ('The F1-SCORE on the test set prediction ',f1_score(y_true=y_test,y_pred = predicted_test))

In [None]:
# Random-forest feature importances as a one-column table, largest first
import pandas as pd
feature_importances = (
    pd.Series(rf.feature_importances_, index = X_train.columns, name = 'importance')
    .sort_values(ascending = False)
    .to_frame()
)
feature_importances

## Conclusion

Logistic Regression Model seemed to give the best f1 score on both train and test data