In [51]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE  ## recursive feature elemination technique
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


# setup data

In [98]:
df = pd.read_csv("Data_Marketing_Customer_Analysis_Round3.csv")

## X, y split

In [99]:
X = df.drop("total_claim_amount", axis=1)
y = df["total_claim_amount"]

ValueError: could not convert string to float: 'central'

## test-train split

In [81]:
def split_the_shit(X, y):

    X_num = X.select_dtypes(include=np.number)
    X_cat = X.select_dtypes(include=np.object)

    X_cat = pd.get_dummies(X_cat)

    X = pd.concat([X_num, X_cat], axis=1)

    scaler = StandardScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)

    # split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
    # select numerical columns
    X_train = pd.DataFrame(X_train, columns=X.columns)
    X_test  = pd.DataFrame(X_test, columns=X.columns)
    return X_train, X_test, y_train, y_test

In [100]:
X_train, X_test, y_train, y_test = split_the_shit(X, y)

# train model

## OLS

In [83]:
model=LinearRegression()
results = model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

LinearRegression: Train -> 0.7682940698473952, Test -> 0.7820756820640806


## lasso model - could drop features and make it a feature selection technique

In [107]:
model=Lasso(alpha=0.05)

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Lasso: Train -> 0.7681890723396153, Test -> 0.7825854326242125


## Ridge

In [78]:
model=Ridge(alpha=10000)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Ridge: Train -> 0.1366520450129276, Test -> 0.1404892132821094


## ElasticNet

In [64]:
model=ElasticNet(alpha=0.1)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

ElasticNet: Train -> 0.7603123416377466, Test -> 0.7774050392611072


# function to train different models

In [42]:
model_lst = [LinearRegression(), Lasso(alpha=0.05), Ridge(alpha=10000), ElasticNet(alpha=0.1)]

def train_models(model_lst, X_train, y_train):

    for i, item in enumerate(model_lst):
        model = item
        model.fit(X_train, y_train)
        print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")


In [43]:
train_models(model_lst, X_train, y_train)

LinearRegression: Train -> 0.4086926440650056, Test -> 0.4114006217055389
Lasso: Train -> 0.4086926032425868, Test -> 0.41141620918570554
Ridge: Train -> 0.40867692596088767, Test -> 0.41168523050262207
ElasticNet: Train -> 0.40869238511331907, Test -> 0.4114386909519817


# apply feature selection

## RFE

In [96]:
#X_train.isna().sum()
nulls = pd.DataFrame(X_train.isna().sum()).reset_index()
#nulls.head()
nulls.columns = ['Column','nas']
#nulls.head()
#nulls[nulls['nas'] > 0].head()
cols_to_drop = nulls[nulls['nas'] > 0]['Column'] # Too drastic, but made on pourpose for quick filtering (don't do this in production!!)

X_train.drop(columns=cols_to_drop, axis=1, inplace = True)
X_test.drop(columns=cols_to_drop, axis=1, inplace = True)

#display(X_train)

lm = LinearRegression()

selector = RFE(lm, n_features_to_select= 30, step = 1, verbose = 1) # Step is how many features to add or drop everytime
selector.fit(X_train, y_train)

kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final selected features: ")
display(X_train)

Fitting estimator with 124 features.
Fitting estimator with 123 features.
Fitting estimator with 122 features.
Fitting estimator with 121 features.
Fitting estimator with 120 features.
Fitting estimator with 119 features.
Fitting estimator with 118 features.
Fitting estimator with 117 features.
Fitting estimator with 116 features.
Fitting estimator with 115 features.
Fitting estimator with 114 features.
Fitting estimator with 113 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
Fitting estimator with 105 features.
Fitting estimator with 104 features.
Fitting estimator with 103 features.
Fitting estimator with 102 features.
Fitting estimator with 101 features.
Fitting estimator with 100 features.
Fitting estimator with 99 features.
Fitting estimator with 98 features.
Fit

Unnamed: 0,region_central,region_east,region_north west,region_west region,response_no,response_yes,coverage_basic,coverage_extended,coverage_premium,education_bachelor,...,policy_corporate l1,policy_corporate l2,policy_corporate l3,policy_personal l1,policy_personal l2,policy_personal l3,sales_channel_agent,sales_channel_branch,sales_channel_call center,sales_channel_web
0,1,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
1,1,0,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,0,1,0,0,1,1,0,0,1,...,0,0,0,0,1,0,0,0,0,1
3,1,0,0,0,1,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,1
4,0,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7477,1,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
7478,0,0,0,1,1,0,0,1,0,1,...,0,0,1,0,0,0,1,0,0,0
7479,0,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0
7480,0,0,1,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [101]:
train_models(model_lst, X_train, y_train)

LinearRegression: Train -> 0.7682940698473952, Test -> 0.7820756820640806
Lasso: Train -> 0.7681890723396153, Test -> 0.7825854326242125
Ridge: Train -> 0.5661154492997775, Test -> 0.5771757082581657
ElasticNet: Train -> 0.7603123416377466, Test -> 0.7774050392611072


In [49]:
X_train

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,8663,42169,83,18,90,1,2
1,4213,12160,109,5,34,0,1
2,2359,19864,63,22,96,0,1
3,19511,40625,70,28,26,0,2
4,3576,24959,89,19,13,0,1
...,...,...,...,...,...,...,...
7477,7610,98701,94,22,66,0,3
7478,35186,86134,98,17,78,0,2
7479,4241,19834,64,26,8,4,8
7480,12941,77060,106,23,90,0,2
