# Car Insurance Claim Linear Regression Model



In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

pd.options.display.max_rows = 100
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import statsmodels.api as sm

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def fill_whitespace_with_underscore(df):
    """Replace spaces and hyphens in a DataFrame's column labels with underscores.

    The labels are rewritten on the frame that is passed in (no copy is made)
    and that same frame is returned, so the call can be used inline.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose (string) column labels should be normalised.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``; every ' ' and '-' in its column labels
        has been replaced by '_'.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"expected a pandas DataFrame, got {type(df).__name__}")
    # regex=False pins literal matching: the default value of `regex` is not
    # the same across pandas releases, and ' ' / '-' are meant literally here.
    df.columns = (
        df.columns
        .str.replace(' ', '_', regex=False)
        .str.replace('-', '_', regex=False)
    )
    return df

In [3]:
def Remove_Outlier_Indices(df, lower_q=0.15, upper_q=0.85, k=1.5):
    """Build a boolean mask that is False wherever a value is an outlier.

    For every column the fences are
    ``[Q_low - k * (Q_high - Q_low), Q_high + k * (Q_high - Q_low)]``,
    where ``Q_low`` / ``Q_high`` are the ``lower_q`` / ``upper_q`` quantiles
    of that column.

    Note that the defaults (0.15 / 0.85) are wider than the classic
    inter-quartile range; pass ``lower_q=0.25, upper_q=0.75`` for the usual
    1.5*IQR rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns to screen.
    lower_q, upper_q : float, optional
        Quantiles used as the lower and upper anchors of the fences.
    k : float, optional
        Multiplier applied to the quantile spread on each side.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the same shape as ``df``: True where the value lies
        inside the fences, False where it lies outside. Use
        ``mask.all(axis=1)`` to select rows with no outlying value.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"expected a pandas DataFrame, got {type(df).__name__}")

    low = df.quantile(lower_q)
    high = df.quantile(upper_q)
    spread = high - low

    # Comparisons broadcast the per-column fences across all rows.
    is_outlier = (df < (low - k * spread)) | (df > (high + k * spread))
    return ~is_outlier

In [4]:
# Source CSV for the analysis.
# NOTE(review): this is an absolute path on one machine - the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
csv_path = "C:/Users/pedro/Documents/GitHub/IronHackLabs/Week2/Data_Marketing_Customer_Analysis_Round3.csv"

# Load the file into a DataFrame; the bare name on the last line displays it.
c2_df = pd.read_csv(filepath_or_buffer=csv_path)
c2_df

Unnamed: 0,region,customer_lifetime_value,response,coverage,education,effective_to_date,month,employment_status,gender,income,...,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size
0,central,4809,no,basic,college,2/18/11,feb,employed,m,48029,...,52,0,9,corporate auto,corporate l3,offer3,agent,292,four-door car,medsize
1,west region,2228,no,basic,college,1/18/11,jan,unemployed,f,92260,...,26,0,1,personal auto,personal l3,offer4,call center,744,four-door car,medsize
2,east,14947,no,basic,bachelor,2/10/11,feb,employed,m,22139,...,31,0,2,personal auto,personal l3,offer3,call center,480,suv,medsize
3,north west,22332,yes,extended,college,1/11/11,jan,employed,m,49078,...,3,0,2,corporate auto,corporate l3,offer2,branch,484,four-door car,medsize
4,north west,9025,no,premium,bachelor,1/17/11,jan,medical leave,f,23675,...,31,0,7,personal auto,personal l2,offer1,branch,707,four-door car,medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10684,central,15563,no,premium,bachelor,1/19/11,jan,unemployed,f,61541,...,40,0,7,personal auto,personal l1,offer3,web,1214,luxury car,medsize
10685,north west,5259,no,basic,college,1/6/11,jan,employed,f,61146,...,68,0,6,personal auto,personal l3,offer2,branch,273,four-door car,medsize
10686,central,23893,no,extended,bachelor,2/6/11,feb,employed,f,39837,...,63,0,2,corporate auto,corporate l3,offer1,web,381,luxury suv,medsize
10687,west region,11971,no,premium,college,2/13/11,feb,employed,f,64195,...,27,4,6,personal auto,personal l1,offer1,branch,618,suv,medsize


In [6]:
# Drop columns that should not reach the model:
#   * 'Unnamed: 0'        - present in the freshly loaded frame but absent from
#                           the column listing that follows, and no remaining
#                           cell removes it; presumably a saved row index.
#                           Left in, it would be picked up as a numeric feature.
#   * 'effective_to_date' - not used by this analysis.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
c2_df = c2_df.drop(columns=['Unnamed: 0', 'effective_to_date'], errors='ignore')

In [7]:
# List the column labels that remain in c2_df at this point.
c2_df.columns

Index(['region', 'customer_lifetime_value', 'response', 'coverage',
       'education', 'month', 'employment_status', 'gender', 'income',
       'location_code', 'marital_status', 'monthly_premium_auto',
       'months_since_last_claim', 'months_since_policy_inception',
       'number_of_open_complaints', 'number_of_policies', 'policy_type',
       'policy', 'renew_offer_type', 'sales_channel', 'total_claim_amount',
       'vehicle_class', 'vehicle_size'],
      dtype='object')

In [8]:
# Separate the predictors from the target.
X = c2_df.drop(columns=['total_claim_amount'])  # predictors (independent variables)
y = c2_df['total_claim_amount']                 # target (dependent variable)

# Numeric predictors are used as they are; object-typed predictors are one-hot
# encoded, with drop_first=True dropping one level per source column.
# dtype=int keeps the dummy columns numeric: recent pandas versions otherwise
# produce bool columns, which turn the combined frame into an object array
# when it is handed to statsmodels further down.
# NOTE(review): a single shared prefix means two source columns with an
# identical level would yield duplicate dummy names - confirm none exist.
dM_n = X.select_dtypes(exclude=object)
dM_c = pd.get_dummies(
    X.select_dtypes(include=object),
    prefix="dmy",
    drop_first=True,
    dtype=int,
)

# Boolean frame: True where a numeric value lies inside the outlier fences.
nonOutlierList = Remove_Outlier_Indices(dM_n)

# A row is kept only if every numeric column is inside its fences. The row
# mask is computed once so all three objects are filtered identically.
keep_rows = nonOutlierList.all(axis=1)

dM_c = dM_c[keep_rows]
dM_n = dM_n[keep_rows]

X = pd.concat([dM_n, dM_c], axis=1)
y = y[keep_rows]

In [None]:
# Display the design matrix: the filtered numeric columns followed by the dummy columns.
X

In [None]:
# Fraction of the rows held out for evaluation.
TEST_SIZE = 0.25
# Fixed seed passed to the splitter so the same split is produced on every run.
RANDOM_STATE = 123

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [None]:
# Check the dtype of every training column before scaling.
X_train.dtypes

In [None]:
# Show only the numeric (non-dummy) training columns - the ones the scaler below is fitted on.
X_train[dM_n.columns]

In [None]:
# The frames returned by train_test_split are derived from X; take explicit
# copies so the column assignments below write to independent frames and do
# not raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Only the numeric columns are standardised; the dummy columns stay as they are.
numeric_cols = dM_n.columns

# Fit on the training rows only, then apply the same scaling to both splits,
# so no statistics from the test rows enter the transformation.
transformer = StandardScaler()
transformer.fit(X_train[numeric_cols])

X_test[numeric_cols] = transformer.transform(X_test[numeric_cols])
X_train[numeric_cols] = transformer.transform(X_train[numeric_cols])

### Apply linear regression model

In [None]:
# Linear model: y = intercept + sum(coefficient_i * x_i).
# fit() estimates the intercept and coefficients from the training split.
model = LinearRegression().fit(X_train, y_train)

# Predictions on the held-out rows and on the rows the model was fitted on.
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(14, 4))

# Test-set residuals, shared by the second and third panels.
residuals = y_test - y_pred

# Panel 1 - predictions (x axis) against actual values (y axis).
# The labels follow the argument order of plot(): x first, then y.
ax[0].plot(y_pred, y_test, 'o', color='red')
ax[0].set_xlabel("y_pred")
ax[0].set_ylabel("y_test")
ax[0].set_title("test set predictions vs. actuals")

# Panel 2 - histogram of the residuals (check: roughly normal?).
ax[1].hist(residuals, color='blue')
ax[1].set_xlabel("Test y-y_pred")
ax[1].set_title("test set residuals")

# Panel 3 - residuals against predictions (check: constant spread?),
# with a dashed zero line for reference.
ax[2].plot(y_pred, residuals, "o", color='green')
ax[2].set_xlabel("predicted")
ax[2].set_ylabel("residuals")
ax[2].set_title("residual homoscedasticity")
ax[2].plot(y_pred, np.zeros(len(y_pred)), linestyle='dashed', color='black', linewidth=2)

In [None]:
# Predicted vs. actual test values as a scatter plot with a fitted line drawn over it.
pred_vs_actual = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
sns.regplot(
    data=pred_vs_actual,
    x='y_pred',
    y='y_test',
    scatter_kws={"color": "red"},
    line_kws={"color": "black"},
)

In [None]:
# R^2 on the test split and its adjusted form, which penalises the number of predictors:
#   Adj_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
# with n = number of test rows and p = number of predictor columns.
n_obs = len(y_test)
n_predictors = X_test.shape[1]

R2 = r2_score(y_test, y_pred)
Adj_R2 = 1 - (1 - R2) * (n_obs - 1) / (n_obs - n_predictors - 1)

R2, Adj_R2

In [None]:
# sm.OLS fits exactly the columns it is given, so the intercept is supplied as
# an explicit constant column.
#   * has_constant='add' always adds that column; the default ('skip') leaves
#     it out when the frame already contains a constant column, which could
#     make the train and test designs disagree.
#   * astype(float) gives statsmodels a purely numeric frame even when the
#     dummy columns are bool or small-int typed.
X_train_const_ct = sm.add_constant(X_train.astype(float), has_constant='add')

# NOTE: `model` is rebound here from the scikit-learn estimator to the
# statsmodels results object; the cells below rely on the latter.
model = sm.OLS(y_train, X_train_const_ct).fit()
predictions_train = model.predict(X_train_const_ct)

X_test_const_ct = sm.add_constant(X_test.astype(float), has_constant='add')
predictions_test = model.predict(X_test_const_ct)

# Print the summary table for the fitted model.
print_model = model.summary()
print(print_model)


In [None]:
# Fitted coefficients of the statsmodels OLS model, one per design-matrix column (including the constant).
model.params

In [None]:
# Names of the terms whose p-value is below the significance level.
# A boolean mask on the p-values selects them by label; this avoids indexing a
# label-indexed Series with a list of integer positions, which pandas deprecates.
SIGNIFICANCE_LEVEL = 0.05
sigparams = model.pvalues[model.pvalues < SIGNIFICANCE_LEVEL].index.tolist()
sigparams

In [None]:
from statsmodels.formula.api import ols

In [None]:
# fill_whitespace_with_underscore rewrites the column labels on the frame it
# receives, so it is given an explicit copy: X_train keeps its original labels.
X_train_df = fill_whitespace_with_underscore(X_train.copy())
y_train_df = y_train.to_frame()

# Predictors first, target last - the next cells rely on the target being the final column.
data = pd.concat([X_train_df, y_train_df], axis=1)

In [None]:
# Every column of `data` except the last one becomes a term of the regression formula.
headers = data.columns[:-1].tolist()
headers

In [None]:
# Formula string for the statsmodels formula API:
# the target on the left of '~', the predictor names joined by '+' on the right.
rhs_terms = '+'.join(headers)
eqn = 'total_claim_amount~' + rhs_terms
eqn

In [None]:
# Fit the same regression through the formula interface; `model` is rebound to this new results object.
model = ols(formula=eqn,data=data).fit()

In [None]:
# Display the summary table of the formula-based fit.
model.summary()