In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE  ## recursive feature elemination technique
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


# setup data

In [4]:
#df = pd.read_csv("Data_Marketing_Customer_Analysis_Round3.csv")
df = pd.read_csv("dum_num_df.csv")

## X, y split

In [5]:
X = df.drop("total_claim_amount", axis=1)
y = df["total_claim_amount"]

## test-train split

In [6]:
def split_the_shit(X, y):
    # split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
    # select numerical columns
    X_train = pd.DataFrame(X_train, columns=X.columns)
    X_test  = pd.DataFrame(X_test, columns=X.columns)

    # now transform with StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_scaled = scaler.transform(X_train)

    return X_train, X_test, y_train, y_test

In [7]:
X_train, X_test, y_train, y_test = split_the_shit(X, y)

# train model

## OLS

In [8]:
model=LinearRegression()
results = model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

LinearRegression: Train -> 0.7652324987539796, Test -> 0.7831151759025022


## lasso model - could drop features and make it a feature selection technique

In [12]:
model=Lasso(alpha=0.05)

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Lasso: Train -> 0.39672285194698953, Test -> 0.39875160127895637


## Ridge

In [25]:
model=Ridge(alpha=0.05)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Ridge: Train -> 0.7652324970887974, Test -> 0.7831158119273929


## ElasticNet

In [27]:
model=ElasticNet(alpha=0.005)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

ElasticNet: Train -> 0.7650323794369853, Test -> 0.7830956916869393


# function to train different models

In [28]:
model_lst = [LinearRegression(), Lasso(alpha=0.05), Ridge(alpha=0.05), ElasticNet(alpha=0.05)]

def train_models(model_lst, X_train, y_train):

    for i, item in enumerate(model_lst):
        model = item
        model.fit(X_train, y_train)
        print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")


In [29]:
train_models(model_lst, X_train, y_train)

LinearRegression: Train -> 0.7652324987539796, Test -> 0.7831151759025022
Lasso: Train -> 0.765208084512232, Test -> 0.7831917511167968
Ridge: Train -> 0.7652324970887974, Test -> 0.7831158119273929
ElasticNet: Train -> 0.754667521673609, Test -> 0.7733685892249207


# apply feature selection

## RFE

In [61]:
#X_train.isna().sum()
nulls = pd.DataFrame(X_train.isna().sum()).reset_index()
#nulls.head()
nulls.columns = ['Column','nas']
#nulls.head()
#nulls[nulls['nas'] > 0].head()
cols_to_drop = nulls[nulls['nas'] > 0]['Column'] # Too drastic, but made on purpose for quick filtering (don't do this in production!!)

X_train.drop(columns=cols_to_drop, axis=1, inplace = True)
X_test.drop(columns=cols_to_drop, axis=1, inplace = True)

#display(X_train)

lm = LinearRegression()

selector = RFE(lm, n_features_to_select= 24, step = 1, verbose = 1) # Step is how many features to add or drop everytime
selector.fit(X_train, y_train)

kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final selected features: ")
display(X_train)

Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Final selected features: 


Unnamed: 0,coverage,education,response_yes,employment_status_employed,employment_status_medical leave,employment_status_retired,employment_status_unemployed,gender_m,location_code_suburban,location_code_urban,...,policy_special l1,policy_special l2,policy_special l3,renew_offer_type_offer2,renew_offer_type_offer3,renew_offer_type_offer4,vehicle_class_luxury suv,vehicle_class_sports car,vehicle_class_suv,monthly_premium_auto
0,1,1,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,83
1,2,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,109
2,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,63
3,0,1,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,70
4,1,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7477,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,94
7478,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,98
7479,0,2,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,64
7480,0,0,0,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,106


In [56]:
train_models(model_lst, X_train, y_train)

LinearRegression: Train -> 0.6285664420581314, Test -> 0.6309069406281262
Lasso: Train -> 0.6285490164403559, Test -> 0.6310293789146353
Ridge: Train -> 0.6285664297155302, Test -> 0.6309085259364577
ElasticNet: Train -> 0.5861822332567612, Test -> 0.5936891901514362


In [36]:
X_train

Unnamed: 0,coverage,education,response_yes,employment_status_employed,employment_status_retired,employment_status_unemployed,gender_m,location_code_suburban,location_code_urban,marital_status_single,policy_type_special auto,policy_corporate l3,policy_personal l1,policy_special l1,policy_special l2,policy_special l3,renew_offer_type_offer4,vehicle_class_luxury suv,vehicle_class_sports car,vehicle_class_suv
0,1,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0
4,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7477,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
7478,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
7479,0,2,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
7480,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1
