## Problem Statement

Develop a predictive framework to gauge each customer's overall credit card spend capacity.

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Setting display options to ensure feature name visibility

In [None]:
# Lift the display limits on both columns and rows so that no feature name is truncated
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Importing the Data for ML

In [None]:
# Load the customer data into a DataFrame; the first row of the file holds the column names
# NOTE(review): this is an absolute, machine-specific path - adjust it before running elsewhere
data_path = '/Users/priyankac/Downloads/Projects/Data.xlsx - customer_dbase.csv'
df = pd.read_csv(data_path, header = 0)

In [None]:
# Preview the first few rows to sanity-check that the file was parsed as expected
df.head()

In [None]:
# Size of the dataset as (number of rows, number of columns)
df.shape

In [None]:
# Data type of every column in the dataset
df.dtypes


## Drop ID-like Features

In [None]:
df = df.drop(['custid'], axis = 1)

In [None]:
df.head()

## Defining Independent and Target Feature

In [None]:
# Joining the features 'cardspent' and 'card2spent' to give a feature 'totspend'
df['totspend'] = df['cardspent'] + df['card2spent']

In [None]:
# Dropping 'cardspent' and 'card2spent' from our dataframe
df = df.drop(['cardspent', 'card2spent'], axis = 1)

In [None]:
# Plot a histogram to check the distribution of the column 'totspend'
df['totspend'].plot(kind = 'hist')

# Observations from the histogram:
# - the data is skewed
# - there are outliers
# - the data is not normally distributed

In [None]:
# Apply a natural-log transformation to 'totspend' and plot the transformed distribution
# NOTE(review): np.log returns -inf for any row where totspend is 0 - confirm there are none
df['logtotspend'] = np.log(df['totspend'])
df['logtotspend'].plot(kind = 'hist')


In [None]:
# Drop the parent feature 'totspend' as log transformation looks more normally distributed
df = df.drop(['totspend'], axis = 1)

In [None]:
# Creating dataframes containing the Independent and Dependent features
X = df.drop(['logtotspend'], axis = 1)
Y = df[['logtotspend']]

In [None]:
Y.mean()

## Split the features into Numerical and Categorical

In [None]:
# Names of the categorical variables, taken from the provided data dictionary.
# Every column of X that is NOT in this list is treated as numerical further below.
cat_var = ['region','townsize','gender','agecat','birthmonth','edcat','jobcat','union',
'employ','empcat','retire','inccat','default','jobsat','marital','spousedcat',
'homeown','hometype','address','addresscat','cars','carown','cartype',
'carcatvalue','carbought','carbuy','commute','commutecat','commutecar','commutemotorcycle',
'commutecarpool','commutebus','commuterail','commutepublic','commutebike','commutewalk',
'commutenonmotor','telecommute','reason','polview','polparty','polcontrib','vote','card',
'cardtype','cardbenefit','cardfee','cardtenure','cardtenurecat','card2','card2type',
'card2benefit','card2fee','card2tenure','card2tenurecat','active','bfast','churn','tollfree',
'equip','callcard','wireless','multline','voice','pager','internet','callid','callwait','forward',
'confer','ebill','owntv','ownvcr','owndvd','owncd','ownpda','ownpc','ownipod','owngame','ownfax','news',
'response_01','response_02','response_03']

In [None]:
# Every column of X that is not listed as categorical is treated as a numerical feature
numerical_var = [column for column in X.columns if column not in cat_var]
print(numerical_var)

In [None]:
# Creating the numerical and categorical dataframes
num = X.drop(cat_var, axis = 1)
char = X.drop(numerical_var, axis = 1)

In [None]:
# Converting all the categorical columns into type 'object' as they look like numbers
all_columns = list(char) # creates list of all column headers
char[all_columns] = char[all_columns].astype('object')

In [None]:
print('Shape of numerical features : ', num.shape)
print('Shape of categorical features : ', char.shape)

## Check for Descriptive Statistics

In [None]:
# Summary statistics of the numerical features, including the tail percentiles (1%, 5%, 95%, 99%)
num.describe(percentiles = [0.01,0.05,0.10,0.25,0.50,0.75,0.95,0.99])

## Removal of Extreme Values and Outliers from the Numerical features


In [None]:
def outlier_cap(x, lower_q = 0.01, upper_q = 0.99):
    """Cap the extreme values of a numeric Series at two of its own quantiles.

    Values below the ``lower_q`` quantile are raised to that quantile and values
    above the ``upper_q`` quantile are lowered to it. Missing values are left
    untouched. Both bounds are computed from the original data, so the upper
    bound does not depend on the lower capping.

    Parameters
    ----------
    x : pandas.Series
        Numeric column to cap.
    lower_q : float, default 0.01
        Quantile (between 0 and 1) used as the lower cap.
    upper_q : float, default 0.99
        Quantile (between 0 and 1) used as the upper cap.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``x`` and the capped values.

    Raises
    ------
    ValueError
        If the quantiles do not satisfy ``0 <= lower_q <= upper_q <= 1``.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError('quantiles must satisfy 0 <= lower_q <= upper_q <= 1')
    lower_bound = x.quantile(lower_q)
    upper_bound = x.quantile(upper_q)
    return x.clip(lower = lower_bound, upper = upper_bound)

In [None]:
num = num.apply(lambda x: outlier_cap(x))

In [None]:
num.describe(percentiles = [0.01,0.05,0.10,0.25,0.50,0.75,0.95,0.99])

## Missing Values handling - Numerical Features

In [None]:
num.isnull().mean()

In [None]:
# Keep only those columns that have >= 75% data populated
num = num.loc[:, num.isnull().mean() <= 0.25]
num.shape

## Missing Value Imputation - Numerical Features

In [None]:
# Replace the remaining missing values in each numerical column with that column's mean
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputed_values = imputer.fit_transform(num)
# fit_transform returns a plain array, so restore the original row index and column names
num_1 = pd.DataFrame(imputed_values, columns = num.columns, index = num.index)

## Missing Values Handling - Categorical Features

In [None]:
# Sum of the per-column missing-value shares; a result of 0 means no categorical value is missing
char.isnull().mean().sum()

## Encode Categorical variables

In [None]:
# One-hot encode the categorical features; drop_first = True drops the first level of each
# feature so that the remaining dummy columns are not perfectly collinear
char_encode = pd.get_dummies(char, drop_first = True)
char_encode.head()

## Build the complete feature set

In [None]:
X_all = pd.concat([char_encode, num_1], axis = 1, join = 'inner')

In [None]:
X_all.shape

In [None]:
# The dataset has a large number of features
# Using the Random Forest Regressor to find important features
# NOTE(review): the forest is fitted on the full dataset, i.e. before the train/test split,
# so the feature selection also sees the rows that are later used for testing
from sklearn.ensemble import RandomForestRegressor
# A fixed seed keeps the importance ranking, and therefore the selected features, reproducible
clf_rf = RandomForestRegressor(n_estimators = 20, random_state = 20)
# Y is a single-column DataFrame; flatten it to the 1-D target that fit() expects
clf_rf.fit(X_all, Y.values.ravel())

In [None]:
feature_importances = pd.DataFrame(clf_rf.feature_importances_,
                                  index = X_all.columns,
                                  columns = ['importance']).sort_values('importance', ascending = False)
feature_importances

In [None]:
# Slicing the top 40 important faetures
feature_list = feature_importances.iloc[0:40]

In [None]:
top_features = list(feature_list.index)
top_features

In [None]:
# Retaining the top 40 important features in the X_all dataframe
X_all = X_all[top_features]
X_all.head()

In [None]:
X_all.shape

## Perform Variable Clustering on the data to eliminate correlation among features

In [None]:
# Install VarClusHi, pinned to version 0.1.0 ('!' runs the command in a shell from the notebook)
!pip install varclushi==0.1.0

In [None]:
# Run variable clustering on the retained features
# NOTE(review): maxeigval2 and maxclus are passed straight to VarClusHi - presumably the
# second-eigenvalue split threshold and the maximum number of clusters; confirm in its docs
from varclushi import VarClusHi
vc = VarClusHi(X_all, maxeigval2 = 1, maxclus = 8)
vc.varclus()

In [None]:
vc.info

In [None]:
# 'check' holds vc.rsquare; the following cells use its 'Cluster', 'RS_Ratio' and 'Variable' columns
check = vc.rsquare
check

In [None]:
# Selecting the features from each cluster having the lowest RS_ratio
# A variable selected from each cluster should have a high correlation with
# its own cluster and a low correlation with the other clusters
temp = check.groupby('Cluster')['RS_Ratio'].agg(['min'])
temp.columns = ['RS_Ratio']
temp

In [None]:
# Join the 'RS_Ratio' with the 'check' dataframe to get the feature names
filter = temp.merge(check, how = 'left', on = 'RS_Ratio')
filter

In [None]:
final_features = filter['Variable']


In [None]:
# Using the final_features to create the final dataframe
X_final = X_all[final_features]

In [None]:
# Checking the shape of final dataset
X_final.shape

In [None]:
X_final.dtypes

In [None]:
# Check how well a feature discriminates the target: for every distinct value of the
# feature, show the min / mean / max of 'logtotspend'
# NOTE: this rebinds 'check', which previously held vc.rsquare
# NOTE(review): 'pets' is hard-coded and must be one of the selected features - verify
check = pd.concat([X_final,Y], axis = 1, join = 'inner')
check.groupby('pets')['logtotspend'].agg(['min', 'mean', 'max'])

In [None]:
# Same check for the dummy column 'card_3.0'
# NOTE(review): hard-coded column name - verify that it is among the selected features
check = pd.concat([X_final,Y], axis = 1, join = 'inner')
check.groupby('card_3.0')['logtotspend'].agg(['min', 'mean', 'max'])

## Splitting the data into Train and Test Split

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, Y, test_size = 0.2, random_state = 20)

In [None]:
print('Shape of Training Data : ', X_train.shape)
print('Shape of Testing Data : ', X_test.shape)
print('Average Salary in Training Data : ', y_train.mean())
print('Aversge Salary in Testing Data : ', y_test.mean())

## Model Building Step

In [None]:
# Building Decision Tree Model
from sklearn.tree import DecisionTreeRegressor
dtree = DecisionTreeRegressor(random_state = 20)

In [None]:
# Min_Samples_Split starting from 5% of training base
from sklearn.model_selection import GridSearchCV
param_dist = {'max_depth': [3,4, 5, 6, 7,8], 'min_samples_split': [175,200,225,250,275,300] }
tree_grid = GridSearchCV(dtree, cv = 10, param_grid=param_dist,n_jobs = -1)
tree_grid.fit(X_train,y_train) 
print('Best Parameters using grid search: \n', tree_grid.best_params_)

In [None]:
dtree=DecisionTreeRegressor(random_state = 20, max_depth = 4, min_samples_split = 225)
dtree.fit(X_train,y_train)

In [None]:
# Building Random Forest Model
from sklearn.ensemble import RandomForestRegressor
# A fixed seed makes the forest, and every metric computed from it, reproducible
rf=RandomForestRegressor(n_estimators=20, random_state=20)
# y_train is a single-column DataFrame; flatten it to the 1-D target that fit() expects
rf.fit(X_train,y_train.values.ravel())


## Model Evaluation

### R Square Metric Between Training and Testing Sets

In [None]:
dtree_pred_train = dtree.predict(X_train)
dtree_pred_test = dtree.predict(X_test)
dtree_pred_final = dtree.predict(X_final)

X_final['pred_totspend_tree'] = pd.DataFrame(dtree_pred_final, index = X_final.index)

In [None]:
rf_pred_train = rf.predict(X_train)
rf_pred_test = rf.predict(X_test)
rf_pred_final = rf.predict(X_final)

X_final['pred_totspend_rf'] = pd.DataFrame(rf_pred_final, index = X_final.index)

In [None]:
# R Sqaure for Decision Tree
from sklearn.metrics import r2_score
r_sq_train=r2_score(dtree_pred_train,y_train)
r_sq_test

In [None]:
from sklearn.metrics import r2_score
r_sq_test=r2_score(dtree_pred_test,y_test)
r_sq_test

In [None]:
# R Square for Random Forest
from sklearn.metrics import r2_score
r_sq_train=r2_score(rf_pred_train,y_train)
r_sq_train

In [None]:
from sklearn.metrics import r2_score
r_sq_test=r2_score(rf_pred_test,y_test)
r_sq_test

In [None]:
# Random Forest performs better than Decision Tree, but the R square on the testing data is still not good

In [None]:
# Mean squared error of the Random Forest predictions on the test and training data
from sklearn import metrics
mse_test = metrics.mean_squared_error(rf_pred_test,y_test)
mse_train = metrics.mean_squared_error(rf_pred_train,y_train)
print('MSE for Test:',mse_test)
print('MSE for Train:',mse_train)

## Visualizing the Model Performance

In [None]:
# Combine X_final (features plus the stored model predictions) with the actual target Y,
# keeping only the rows whose index is present in both
data_eval = pd.concat([X_final, Y], axis = 1, join = 'inner')

## Create Buckets of Data Observations

In [None]:
# Rank the actual target (ties broken by order of appearance), cut the ranks into
# 50 quantile buckets and number the buckets starting from 1
spend_ranks = data_eval['logtotspend'].rank(method='first').values
rank_buckets = pd.qcut(spend_ranks,50,duplicates='drop')
data_eval['totspend_rank']=rank_buckets.codes+1

## Plot the Actuals versus Predicted across those buckets

In [None]:
# Actual values (blue points) and Random Forest predictions (red line) across the rank buckets
ax = sns.scatterplot(x='totspend_rank', y='logtotspend', data=data_eval, color='Blue')
ax = sns.lineplot(x='totspend_rank', y='pred_totspend_rf', data=data_eval, color='Red')

# Although the model did not perform very well on the test data, the graph shows that
# its predictions are very close for the high-spending customers.
# The exact predicted values cannot be relied on, but the directional segmentation of
# the predicted values can.

## Error Cluster Capture

In [None]:
# Using the Error Cluster Capture Analysis to find the extent to which the model is performing the best 

In [None]:
# Get error percentage between Predicted and Actual Values
data_eval['error'] = data_eval['pred_totspend_rf']-data_eval['logtotspend']
data_eval['error_percentage'] = (data_eval['error']/df['logtotspend']).abs()
error_df = data_eval[['error_percentage']]

In [None]:
error_df.head(10)

In [None]:
# Group the observations into 10 error clusters; the bin edges come from k-means on the
# error percentages and each row receives the ordinal number of its bin
from sklearn.preprocessing import KBinsDiscretizer
bins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')
bin_codes = bins.fit_transform(error_df)
bin_columns = [f'{column}_bin' for column in error_df.columns]
error_df_bin = pd.DataFrame(bin_codes, index = error_df.index, columns = bin_columns)
# Attach the bin number next to the error percentage it was derived from
error_df = pd.concat([error_df, error_df_bin], axis = 1, join = 'inner')
error_df.head()

In [None]:
# Use the k-means error clusters to find how many rows in the data have the lowest error percentage

In [None]:
# Analyse the error clusters for observation capture
model_eval= error_df.groupby('error_percentage_bin')['error_percentage'].agg(['min','max','mean','count'])
model_eval['cum_count'] = model_eval['count'].cumsum()
model_eval['cum_count_prop'] = model_eval['cum_count']/max(model_eval['cum_count'])

In [None]:
model_eval

In [None]:
# Visualize the cumulative Observation Capture by Error Clusters
ax=sns.lineplot(x=model_eval.index,y='cum_count_prop',data=model_eval)

In [None]:
ax=sns.barplot(x=model_eval.index,y='mean',data=model_eval)