In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import scipy.stats as stats

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

In [2]:
df = pd.read_csv('Insurance_Marketing-Customer-Value-Analysis.csv')
df.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,Location Code,Marital Status,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/2011,Employed,F,56274,Suburban,Married,69,32,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/2011,Unemployed,F,0,Suburban,Single,94,13,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/2011,Employed,F,48767,Suburban,Married,108,18,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/2011,Unemployed,M,0,Suburban,Married,106,18,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,02-03-11,Employed,M,43836,Rural,Single,73,12,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [3]:
df_9 = df[['Response', 'EmploymentStatus', 'Number of Open Complaints',
       'Number of Policies', 'Policy Type', 'Renew Offer Type',
       'Vehicle Class', 'Customer Lifetime Value', 'Monthly Premium Auto']]
df_9.head()

Unnamed: 0,Response,EmploymentStatus,Number of Open Complaints,Number of Policies,Policy Type,Renew Offer Type,Vehicle Class,Customer Lifetime Value,Monthly Premium Auto
0,No,Employed,0,1,Corporate Auto,Offer1,Two-Door Car,2763.519279,69
1,No,Unemployed,0,8,Personal Auto,Offer3,Four-Door Car,6979.535903,94
2,No,Employed,0,2,Personal Auto,Offer1,Two-Door Car,12887.43165,108
3,No,Unemployed,0,7,Corporate Auto,Offer1,SUV,7645.861827,106
4,No,Employed,0,1,Personal Auto,Offer1,Four-Door Car,2813.692575,73


In [4]:
#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self, feature_names ):
        self._feature_names = feature_names 
    
    #Return self nothing else to do here    
    def fit( self, X, y =None):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None):
        return X[ self._feature_names ] 

In [5]:
LE = LabelEncoder()
class CategoricalTransformer( BaseEstimator, TransformerMixin ):
    #Class constructor method that takes in a list of values as its argument
    def __init__(self, cat_cols = ['Response', 'EmploymentStatus', 'Number of Open Complaints',
       'Number of Policies', 'Policy Type', 'Renew Offer Type',
       'Vehicle Class']):
        self._cat_cols = cat_cols
        
    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        return self
    
    #Helper function that converts values to Binary depending on input 
    #def label_encoder(self, obj):
        #LabelEncoder(obj)
    
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):
        
       #Depending on constructor argument break dates column into specified units
       #using the helper functions written above 
       if self._cat_cols:
        for i in X[cat_cols]:
            X[i]= LE.fit_transform(X[i])
        
       return X.values 

In [6]:
#Custom transformer we wrote to engineer features ( bathrooms per bedroom and/or how old the house is in 2019  ) 
#passed as boolen arguements to its constructor
class NumericalTransformer(BaseEstimator, TransformerMixin):
    #Class Constructor
    def __init__( self, MPA_log = True):
        self._MPA_log = MPA_log
        
    #Return self, nothing else to do here
    def fit( self, X, y = None):
        return self 
    
    #Custom transform method we wrote that creates aformentioned features and drops redundant ones 
    def transform(self, X, y = None):
        if self._MPA_log:
            X.loc[:,'MPA_log'] = np.log(X['Monthly Premium Auto'])
            X.drop(['Monthly Premium Auto'], axis =1)
        
        return X.values
    

In [7]:
#Categrical features to pass down the categorical pipeline 
categorical_features = ['Response', 'EmploymentStatus', 'Number of Open Complaints',
       'Number of Policies', 'Policy Type', 'Renew Offer Type',
       'Vehicle Class']


#Defining the steps in the categorical pipeline 
categorical_pipeline = Pipeline( steps = [ ( 'cat_selector', FeatureSelector(categorical_features) ),( 'cat_transformer', CategoricalTransformer() )])
                                  
         

#Numerical features to pass down the numerical pipeline 
numerical_features = ['Monthly Premium Auto']

    
#Defining the steps in the numerical pipeline     
numerical_pipeline = Pipeline( steps = [ ( 'num_selector', FeatureSelector(numerical_features) ),
                                  
                                  ( 'num_transformer', NumericalTransformer() ) ] )

full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), ( 'numerical_pipeline', numerical_pipeline ) ] )


In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = df_9.drop(['Customer Lifetime Value'], axis = 1)
y = df_9['Customer Lifetime Value']

X_train, X_test, y_train, y_test = train_test_split( X, y , test_size = 0.2 , random_state = 0 )


In [9]:
y_train = np.log(y_train)

In [10]:
y_test = np.log(y_test)

In [11]:
#The full pipeline as a step in another pipeline with an estimator as the final step

full_pipeline_LR = Pipeline( steps = [('full_pipeline', full_pipeline),
                                  
                                  ('model', LinearRegression())])


full_pipeline_DT = Pipeline( steps = [('full_pipeline', full_pipeline),
                                  
                                  ('model', DecisionTreeRegressor(max_depth=21, min_samples_leaf= 8, random_state=0))])


full_pipeline_RF = Pipeline( steps = [('full_pipeline', full_pipeline),
                                  
                                  ('model', RandomForestRegressor(max_depth=21, min_samples_leaf= 8, random_state=0))])

In [12]:
y_train.shape

(7307,)

In [13]:
cat_cols = ['Response', 'EmploymentStatus', 'Number of Open Complaints',
       'Number of Policies', 'Policy Type', 'Renew Offer Type',
       'Vehicle Class']

In [14]:
#Can call fit on it just like any other pipeline
full_pipeline_LR.fit(X_train, y_train)

full_pipeline_DT.fit(X_train, y_train)

full_pipeline_RF.fit(X_train, y_train)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs

Pipeline(memory=None,
         steps=[('full_pipeline',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('categorical_pipeline',
                                                 Pipeline(memory=None,
                                                          steps=[('cat_selector',
                                                                  FeatureSelector(feature_names=None)),
                                                                 ('cat_transformer',
                                                                  CategoricalTransformer(cat_cols=None))],
                                                          verbose=False)),
                                                ('numerical_pipeline',
                                                 Pipeline(memory=None,
                                                          steps=[('num_selector',
                                                                  FeatureSelecto

In [15]:
from sklearn.metrics import r2_score , mean_squared_error , mean_absolute_error
#Can predict with it like any other pipeline
y_pred_LR = full_pipeline_LR.predict(X_test) 
print('LR')
print ( "R2 Squared:" , r2_score ( y_test , y_pred_LR ) )
print ( "Root Mean Squared Error:" , np.sqrt ( mean_squared_error ( y_test , y_pred_LR ) ) )
print ( "Mean Absolute Error:" , np.sqrt ( mean_absolute_error ( y_test , y_pred_LR ) ))


y_pred_DT = full_pipeline_DT.predict(X_test) 
print('DT')
print ( "R2 Squared:" , r2_score ( y_test , y_pred_DT ) )
print ( "Root Mean Squared Error:" , np.sqrt ( mean_squared_error ( y_test , y_pred_DT ) ) )
print ( "Mean Absolute Error:" , np.sqrt ( mean_absolute_error ( y_test , y_pred_DT ) ))


y_pred_RF = full_pipeline_RF.predict(X_test) 
print('RF')
print ( "R2 Squared:" , r2_score ( y_test , y_pred_RF ) )
print ( "Root Mean Squared Error:" , np.sqrt ( mean_squared_error ( y_test , y_pred_RF ) ) )
print ( "Mean Absolute Error:" , np.sqrt ( mean_absolute_error ( y_test , y_pred_RF ) ))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs

LR
R2 Squared: 0.25069358448830326
Root Mean Squared Error: 0.5491187788354082
Mean Absolute Error: 0.6514614444409706


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


DT
R2 Squared: 0.8853190563230513
Root Mean Squared Error: 0.21482370353778715
Mean Absolute Error: 0.32364202412999177


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


RF
R2 Squared: 0.899300412718651
Root Mean Squared Error: 0.20130307816086196
Mean Absolute Error: 0.3186084799587713


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [16]:
y_test

2182     8.629174
7823     7.748150
1651     9.163346
888      9.057338
3844     8.349939
2000     8.367177
3631     9.086494
4895     8.236597
4651     8.176776
6984     8.708084
6747     8.547677
4576     8.929358
6916     7.674191
3784     9.011411
154      7.988256
1691     7.952067
4834     7.780686
1044     7.825822
7850     8.040124
4592     9.679025
5032     8.857237
4539     9.452647
4149     7.742860
8903     9.097885
33       8.520963
7584     9.002458
272      8.322534
1270     8.660097
6923     8.248482
406      8.461969
7897     8.834140
9115     9.364261
7168     8.787822
8290     8.463022
2669     8.651423
8481     9.115497
4066     8.430032
4565     9.747055
3037     8.835625
2092     9.728796
3535     8.937289
2843     8.921023
6775     7.932901
325     10.003492
4463     8.605061
7052     8.343474
5099     8.643674
134      8.990489
3740     8.203213
7618     7.950389
831      8.607369
4115     9.446487
5003     8.958952
8743     7.958731
242      8.434956
2630     8

In [17]:
y_pred_RF

array([8.63108527, 7.81654712, 9.15240221, ..., 8.26414478, 8.57865236,
       9.4924372 ])

In [18]:
y_pred_DT

array([8.6299679 , 7.83229597, 9.15084485, ..., 8.26208315, 8.58235442,
       9.56382273])

In [19]:
pred_log9 = pd.DataFrame(y_test)
pred_log9['RF'] = y_pred_RF
pred_log9['boolean'] = pred_log9['Customer Lifetime Value']==pred_log9['RF']
pred_log9['Accuracy %'] = (pred_log9['RF']/pred_log9['Customer Lifetime Value'])*100
pred_log9['Error %'] = 100-pred_log9['Accuracy %']

pred_log9['Actual CLV'] = df['Customer Lifetime Value']
pred_log9['Predicted CLV'] = np.exp(pred_log9['RF'])
pred_log9['Actual Accuracy %'] = (pred_log9['Predicted CLV']/pred_log9['Actual CLV'])*100
pred_log9['Actual Error %'] = 100-pred_log9['Actual Accuracy %']
pred_log9.head()

Unnamed: 0,Customer Lifetime Value,RF,boolean,Accuracy %,Error %,Actual CLV,Predicted CLV,Actual Accuracy %,Actual Error %
2182,8.629174,8.631085,False,100.022152,-0.022152,5592.455266,5603.155866,100.19134,-0.19134
7823,7.74815,7.816547,False,100.882751,-0.882751,2317.282049,2481.322886,107.079019,-7.079019
1651,9.163346,9.152402,False,99.880575,0.119425,9540.923395,9437.082999,98.911632,1.088368
888,9.057338,9.106226,False,100.539756,-0.539756,8581.278701,9011.219927,105.010223,-5.010223
3844,8.349939,8.424638,False,100.894608,-0.894608,4229.923123,4557.995805,107.755996,-7.755996


In [20]:
pred_log9.to_csv('No data leak Log9.csv')