# Comparing regression models
### lab 4.05

In [16]:
# Base libraries
import pandas as pd
import numpy as np

#pd.set_option('display.max_columns', None)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns


# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

In [17]:
# Load the numerical and the categorical feature tables from their CSV files.
numerical = pd.read_csv('numerical_cleaned.csv')
categorical = pd.read_csv('clean_categorical.csv')

# Shapes recorded when this cell was last run:
#   categorical.shape -> (9134, 14)
#   numerical.shape   -> (9134, 8)
categorical.head()


Unnamed: 0,state,response,coverage,education,employmentstatus,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
0,Washington,No,0.0,0.5,2.0,F,0.5,Married,Corporate Auto,Corporate L3,Offer1,Agent,Two-Door Car,0.5
1,Arizona,No,0.5,0.5,0.0,F,0.5,Single,Personal Auto,Personal L3,Offer3,Agent,Four-Door Car,0.5
2,Nevada,No,1.0,0.5,2.0,F,0.5,Married,Personal Auto,Personal L3,Offer1,Agent,Two-Door Car,0.5
3,California,No,0.0,0.5,0.0,M,0.5,Married,Corporate Auto,Corporate L2,Offer1,Call Center,SUV,0.5
4,Washington,No,0.0,0.5,2.0,M,0.0,Single,Personal Auto,Personal L1,Offer1,Agent,Four-Door Car,0.5


## Concat

In [18]:
# A column-wise concat aligns rows on the index. Both frames are expected to
# share the same index; if they did not, pandas would silently pad the
# unmatched rows with NaN, so verify the assumption before combining.
assert categorical.index.equals(numerical.index), \
    "categorical and numerical frames must share the same index"
df = pd.concat([categorical, numerical], axis=1)
# Optional sanity checks:
#   df.describe().T
#   df.isna().sum() / len(categorical)

# Model

## X Y Split

In [19]:

# Predictors: every column except the regression target.
X = df.drop(columns='total_claim_amount')
# Target: the claim amount the models will learn to predict.
y = df['total_claim_amount']

## Train Test split

In [22]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
from sklearn.model_selection import train_test_split  # also imported in the first cell

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

# Report the size of each part of the split, one line per part.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

X_train shape: (7307, 21)
X_test shape: (1827, 21)
y_train shape: (7307,)
y_test shape: (1827,)


## Numericals

In [21]:
# Keep only the numeric columns of each split. As the preview below shows,
# this also picks up the categorical columns that already hold numeric codes
# (coverage, education, employmentstatus, location_code, vehicle_size).
numericals_test = X_test.select_dtypes(include=np.number)
numericals_train = X_train.select_dtypes(include=np.number)
numericals_train.head()

Unnamed: 0,coverage,education,employmentstatus,location_code,vehicle_size,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
7706,0.5,0.5,2.0,0.5,0.5,3265.156348,25820,82,10,69,0,1
779,0.5,0.0,2.0,0.5,0.5,11318.13083,79270,95,28,61,3,2
8189,0.5,0.0,0.0,0.5,0.0,6274.447705,13662,85,29,37,0,3
1942,1.0,0.0,2.0,0.5,1.0,4297.189057,66331,107,0,74,0,1
459,0.5,0.5,0.0,1.0,1.0,10110.77818,15752,90,27,37,0,2


### Fit Numericals

In [30]:
from sklearn.preprocessing import StandardScaler

# StandardScaler rescales each column using its mean and standard deviation.
# Those statistics are learned from the training rows only and then reused
# unchanged for the test rows, so nothing about the test set leaks into the fit.
transformer = StandardScaler()
transformer.fit(numericals_train)

numericals_test_standardized = transformer.transform(numericals_test)
numericals_train_standardized = transformer.transform(numericals_train)

## Categoricals

In [31]:
# The object-dtype columns are the categoricals that still need encoding;
# one-hot encoding is the only encoding step left for them.
# Encoding is done after the train/test split, as generally recommended.
categoricals_test = X_test.select_dtypes(include=object)
categoricals_train = X_train.select_dtypes(include=object)
categoricals_train.head()

Unnamed: 0,state,response,gender,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class
7706,California,Yes,F,Divorced,Personal Auto,Personal L2,Offer1,Agent,Four-Door Car
779,California,No,F,Married,Personal Auto,Personal L1,Offer2,Branch,Four-Door Car
8189,Oregon,No,F,Single,Personal Auto,Personal L1,Offer1,Agent,Four-Door Car
1942,Nevada,No,F,Single,Personal Auto,Personal L3,Offer4,Branch,Two-Door Car
459,California,No,F,Married,Corporate Auto,Corporate L3,Offer1,Branch,Two-Door Car


In [32]:

#X_cat_ordinal= df[['coverage','employmentstatus','location_code','vehicle_size','education']]
#X_cat_onehot=df[['state','marital_status','policy_type','policy','renew_offer_type','sales_channel','vehicle_class','response','gender']]

### Encoding categoricals
Column | Preparation | Encoder type
-------|-------------|-------------
state | only 5 values - leave as is | One-hot
response | imbalanced; map to 1/0 | One-hot
coverage | ordinal encode | Ordinal
education | combine master and doctor | Ordinal
employment status | group the 3 smallest categories into "other" | Ordinal
gender | one-hot encode | One-hot
location | ordinal encode as is | Ordinal
marital status | one-hot encode as is | One-hot
policy type | combine corporate/special | One-hot
policy | drop column | One-hot
renew offer | encode as is | One-hot
sales channel | one-hot encode as is | One-hot
vehicle class | one-hot encode; combine luxury/sports | One-hot
vehicle size | ordinal encode | Ordinal

In [33]:
from sklearn.preprocessing import OneHotEncoder

# Learn the category levels from the TRAIN split only, never from the test split.
# drop='first' omits one level per column; handle_unknown='error' makes the
# transform fail if the test split contains a level that was not seen in training.
encoder = OneHotEncoder(drop='first', handle_unknown='error')
encoder.fit(categoricals_train)

# Apply the fitted encoder to both splits and densify the sparse result.
categoricals_train_encoded, categoricals_test_encoded = (
    encoder.transform(split).toarray()
    for split in (categoricals_train, categoricals_test)
)

In [34]:
# Put the standardized numeric block and the one-hot block side by side to
# form the final feature matrices.
# NOTE: this rebinds X_train / X_test from DataFrames to plain NumPy arrays, so
# the earlier cells that call X_train.select_dtypes(...) cannot be re-run
# afterwards without re-running the train/test split first.
X_train = np.hstack([numericals_train_standardized, categoricals_train_encoded])
X_test = np.hstack([numericals_test_standardized, categoricals_test_encoded])
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,0.788110,0.270934,0.778580,0.030374,0.168956,-0.689028,-0.385792,-0.325793,-0.503413,0.755270,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.788110,-1.365195,0.778580,0.030374,0.168956,0.484415,1.371370,0.050504,1.288859,0.467992,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.788110,-1.365195,-1.316654,0.030374,-1.712997,-0.250528,-0.785485,-0.238955,1.388429,-0.393843,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.311985,-1.365195,0.778580,0.030374,2.050909,-0.538645,0.946002,0.397854,-1.499119,0.934819,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.788110,0.270934,-1.316654,1.686677,2.050909,0.308485,-0.716777,-0.094226,1.189288,-0.393843,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7302,2.311985,0.270934,-1.316654,-1.625929,0.168956,-0.033682,-0.451312,0.368908,0.293152,1.042548,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7303,-0.735764,0.270934,0.778580,-1.625929,-1.712997,7.310954,0.781299,2.684579,-1.499119,-0.645211,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7304,-0.735764,1.907064,-1.316654,0.030374,-1.712997,-0.352059,-0.869809,-0.586306,1.089717,0.072984,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
7305,-0.735764,0.270934,-1.316654,0.030374,2.050909,-0.569899,-0.471727,0.397854,1.587571,0.432082,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## KNN and LM function

Define a function that takes a list of models and trains (and tests) them.

In [41]:
#from sklearn.neighbors import KNeighborsRegressor
#from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

#If 'X_train' and 'X_test' are NumPy arrays, you don't need to use '.values' at all.
def knn_linear_scores(X_train, X_test, y_train, y_test, models=None):
    """Fit each regressor on the training split and report its test-set R2.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for evaluation. NumPy arrays
        can be passed directly; no ``.values`` conversion is needed.
    y_train, y_test : array-like
        Targets matching ``X_train`` and ``X_test``.
    models : mapping or iterable of (label, estimator) pairs, optional
        Estimators to compare. Each must provide ``fit(X, y)`` and
        ``predict(X)``. When omitted, a default ``LinearRegression``
        (label "Linear") and a default ``KNeighborsRegressor`` (label "KNN")
        are used.

    Returns
    -------
    tuple
        One R2 score per model, in the order the models were given. With the
        default models this is ``(linear_r2, knn_r2)``.
    """
    if models is None:
        models = {"Linear": LinearRegression(), "KNN": KNeighborsRegressor()}

    scores = []
    # dict() accepts both a mapping and a list of (label, estimator) pairs.
    for label, model in dict(models).items():
        model.fit(X_train, y_train)
        # Score every model through r2_score on its predictions so that all
        # reported numbers come from the same computation.
        score = r2_score(y_test, model.predict(X_test))
        print(f"{label} R2 score is:", score)
        scores.append(score)
    return tuple(scores)

In [42]:
# Evaluate on the preprocessed feature matrices; the returned tuple of scores
# is the cell's displayed value.
knn_linear_scores(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Linear R2 score is: 0.6099862973935695
KNN R2 score is: 0.735077287266806


(0.6099862973935695, 0.735077287266806)