In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

In [2]:
# Load the preprocessed customer table; the first CSV column is used as the row index.
final_customer_df = pd.read_csv(
    'final_customer_df.csv',
    index_col=0,
)

In [3]:
# Preview the first five rows of the loaded table.
final_customer_df.head(n=5)

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,total_claim_amount,number_of_open_complaints,number_of_policies,coverage,education,...,Single,Four-Door Car,Luxury Car,Luxury SUV,SUV,Sports Car,Two-Door Car,month,day,week
0,14.759257,10.937988,0.970325,3.465736,1.609438,108.2186,0.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,24.0,8.0
1,17.849027,9.214034,0.973908,2.564949,3.73767,118.407894,0.0,8.0,2.0,2.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,31.0,5.0
2,20.132246,10.794809,0.975185,2.890372,3.637586,144.401037,0.0,2.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,19.0,7.0
3,18.175915,9.214034,0.975024,2.890372,4.174387,137.392989,0.0,7.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,20.0,3.0
4,14.815434,10.688211,0.971065,2.484907,3.78419,50.092051,0.0,1.0,1.0,2.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,5.0


In [4]:
# Check the table's dimensions, reported as (rows, columns).
final_customer_df.shape

(9134, 32)

In [5]:
# 1. In this final lab, we will model our data. Import sklearn train_test_split and separate the data.

# The target is the claim amount; every remaining column is used as a feature.
y = final_customer_df['total_claim_amount']
X = final_customer_df.drop(columns=['total_claim_amount'])

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=100,
)

In [6]:
# 2. Try a simple linear regression with all the data to see whether we are getting good results.

# Baseline: ordinary least squares fitted on the training split.
lm = linear_model.LinearRegression()
model = lm.fit(X=X_train, y=y_train)

# Score the baseline on the held-out rows; the R^2 value is the cell's output.
predictions = lm.predict(X=X_test)
r2_score(y_true=y_test, y_pred=predictions)

0.7593593493274404

In [7]:
# 3. Great! Now define a function that takes a list of models and trains (and tests) them, so we can try a lot of them
# without repeating code.
# 4. Use the function to check LinearRegression and KNeighborsRegressor.
# 5. You can also check the MLPRegressor for this task!

In [8]:
def All_Regressions(X, y,
                    test_sizes=(0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8),
                    num_neighbors=(4, 5, 6, 7, 8, 9, 10, 15, 20),
                    random_state=100):
    """Compare linear, k-nearest-neighbours and MLP regressors across test sizes.

    For every entry of ``test_sizes`` the data is split once, then each model
    is fitted on the training part and scored with R^2 on the test part.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and the estimators.
    y : target values aligned with ``X``.
    test_sizes : iterable of float, optional
        ``test_size`` values passed to ``train_test_split``; one result row each.
    num_neighbors : iterable of int, optional
        ``n_neighbors`` values to try; each one yields a column named ``k<value>``.
    random_state : int, optional
        Seed used for the train/test split and for the MLP, so that repeated
        calls return the same scores.

    Returns
    -------
    pandas.DataFrame
        One row per test size with columns ``test_size``, ``Linear``,
        one ``k<value>`` column per neighbour count, and ``MLP``.
    """
    neighbor_counts = list(num_neighbors)
    rows = []

    for test_size in test_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        row = {'test_size': test_size}

        # Linear Regression
        linear = linear_model.LinearRegression()
        linear.fit(X_train, y_train)
        row['Linear'] = r2_score(y_test, linear.predict(X_test))

        # K Neighbors Regression: one column per neighbour count
        for n in neighbor_counts:
            knn = KNeighborsRegressor(n_neighbors=n)
            knn.fit(X_train, y_train)
            row['k{}'.format(n)] = r2_score(y_test, knn.predict(X_test))

        # Multi-Layer Perceptron (MLP) Regression.
        # Seeded explicitly: without random_state the MLP scores change on every run.
        mlp = MLPRegressor(hidden_layer_sizes=(50, 50), random_state=random_state)
        mlp.fit(X_train, y_train)
        row['MLP'] = r2_score(y_test, mlp.predict(X_test))

        rows.append(row)

    # Column labels are derived from the neighbour counts, so they always match the data.
    columns = (
        ['test_size', 'Linear']
        + ['k{}'.format(n) for n in neighbor_counts]
        + ['MLP']
    )
    return pd.DataFrame(rows, columns=columns)

In [9]:
# Score every model on every train/test split size; the resulting table is the cell's output.
All_Regressions(X, y)



Unnamed: 0,test_size,Linear,k4,k5,k6,k7,k8,k9,k10,k15,k20,MLP
0,0.2,0.761309,0.459071,0.45943,0.465142,0.471889,0.474265,0.474389,0.469903,0.457013,0.438297,0.797518
1,0.3,0.762916,0.445271,0.452534,0.458815,0.464108,0.464267,0.462858,0.460611,0.439678,0.428375,0.79419
2,0.4,0.759359,0.435785,0.446272,0.452296,0.455845,0.457145,0.451397,0.451833,0.42897,0.413039,0.785418
3,0.5,0.756894,0.429035,0.443111,0.44613,0.445232,0.440804,0.439515,0.432823,0.412926,0.391337,0.78419
4,0.6,0.756748,0.404605,0.420848,0.419975,0.419563,0.416851,0.419084,0.414648,0.39047,0.365799,0.758376
5,0.7,0.75868,0.373717,0.381292,0.382218,0.385004,0.385548,0.382971,0.379816,0.356346,0.32806,0.761153
6,0.8,0.759162,0.315237,0.32967,0.338971,0.342492,0.338963,0.33345,0.326048,0.296716,0.273556,0.743427


In [10]:
# 6. Check and discuss the results.

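KNeighbors yields the least accurate results for every k considered (its R² never exceeds about 0.47). The Linear and MLP models are the most reliable. Linear's results are nearly constant across all test sizes (R² between about 0.757 and 0.763), while MLP's vary slightly more, falling from about 0.80 to 0.74 as the test size grows; the spread is roughly 0.05.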