In [42]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold # It only works with numerical features
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer



In [2]:
df=pd.read_csv("Data_Marketing_Customer_Analysis_Round3.csv")

In [5]:
df.head()

Unnamed: 0,region,customer_lifetime_value,response,coverage,education,effective_to_date,month,employment_status,gender,income,...,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size
0,central,4809,no,basic,college,2/18/11,feb,employed,m,48029,...,52,0,9,corporate auto,corporate l3,offer3,agent,292,four-door car,medsize
1,west region,2228,no,basic,college,1/18/11,jan,unemployed,f,92260,...,26,0,1,personal auto,personal l3,offer4,call center,744,four-door car,medsize
2,east,14947,no,basic,bachelor,2/10/11,feb,employed,m,22139,...,31,0,2,personal auto,personal l3,offer3,call center,480,suv,medsize
3,north west,22332,yes,extended,college,1/11/11,jan,employed,m,49078,...,3,0,2,corporate auto,corporate l3,offer2,branch,484,four-door car,medsize
4,north west,9025,no,premium,bachelor,1/17/11,jan,medical leave,f,23675,...,31,0,7,personal auto,personal l2,offer1,branch,707,four-door car,medsize


In [7]:
X = df.drop(columns=['total_claim_amount'], axis = 1)
y = np.log(df['total_claim_amount'])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

In [12]:
X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

print("Initial number of numerical columns: ",X_train.shape)
print()


selector = VarianceThreshold(threshold=100) # Default threshold value is 0
# Features with a training-set variance lower than this threshold will be removed.
selector.fit(X_train)

kept_features_indexes = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features_indexes].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final number of numerical columns: ",X_train.shape)
print()
X_train

Initial number of numerical columns:  (8551, 7)

Final number of numerical columns:  (8551, 5)



Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception
0,21423,22379,65,9,31
1,8391,40211,106,5,98
2,3969,49544,101,3,29
3,14914,45963,63,3,73
4,18060,57882,115,1,61
...,...,...,...,...,...
8546,7610,98701,94,22,66
8547,35186,86134,98,17,78
8548,4241,19834,64,26,8
8549,12941,77060,106,23,90


In [15]:
c = abs(df.corr())


#fig, ax = plt.subplots(figsize=(14,14))
#sns.heatmap(c, annot=True);

c_last = c['total_claim_amount'].sort_values(ascending=False)

c_thr = .3
cols_to_keep = list(c_last[c_last > c_thr].index)[1:] + [list(c_last[c_last > c_thr].index)[0]]
print(cols_to_keep)

df[cols_to_keep]

['monthly_premium_auto', 'total_claim_amount']


Unnamed: 0,monthly_premium_auto,total_claim_amount
0,61,292
1,64,744
2,100,480
3,97,484
4,117,707
...,...,...
10684,253,1214
10685,65,273
10686,201,381
10687,158,618


In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

nulls = pd.DataFrame(X_train.isna().sum()).reset_index()

nulls.columns = ['Column','nas']

cols_to_drop = nulls[nulls['nas'] > 0]['Column'] # Too drastic, but made on pourpose for quick filtering (don't do this in production!!)


In [89]:
nulls = pd.DataFrame(X_train.isna().sum()).reset_index()

In [90]:
X_train.drop(columns=list(cols_to_drop), axis=1, inplace = True)
X_test.drop(columns=list(cols_to_drop), axis=1, inplace = True)


In [70]:
#We need to remove the dumb infinities

In [None]:
y_train[y_train==float('-inf')]

In [103]:
y_train=y_train.drop([10317,4052,2759,6113,5853,7447,4485,6865,3628],axis=0)

In [104]:
X_train=X_train.drop([10317,4052,2759,6113,5853,7447,4485,6865,3628],axis=0)

In [107]:
lm = LinearRegression()

selector = RFE(lm, n_features_to_select= 5, step = 1, verbose = 1) # Step is how many features to add or drop everytime
selector.fit(X_train, y_train)

kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final selected features: ")
display(X_train)

Fitting estimator with 7 features.
Fitting estimator with 6 features.
Final selected features: 


Unnamed: 0,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,65,9,31,0,2
1,106,5,98,2,6
2,63,3,73,2,2
3,115,1,61,0,2
4,73,35,14,0,2
...,...,...,...,...,...
8537,94,22,66,0,3
8538,98,17,78,0,2
8539,64,26,8,4,8
8540,106,23,90,0,2


In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

In [109]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train=imp_mean.fit_transform(X_train)

In [110]:
X_train

array([[8.6630e+03, 4.2169e+04, 8.3000e+01, ..., 9.0000e+01, 1.0000e+00,
        2.0000e+00],
       [4.2130e+03, 1.2160e+04, 1.0900e+02, ..., 3.4000e+01, 0.0000e+00,
        1.0000e+00],
       [2.3590e+03, 1.9864e+04, 6.3000e+01, ..., 9.6000e+01, 0.0000e+00,
        1.0000e+00],
       ...,
       [4.2410e+03, 1.9834e+04, 6.4000e+01, ..., 8.0000e+00, 4.0000e+00,
        8.0000e+00],
       [1.2941e+04, 7.7060e+04, 1.0600e+02, ..., 9.0000e+01, 0.0000e+00,
        2.0000e+00],
       [6.9470e+03, 6.3406e+04, 1.0000e+02, ..., 5.4000e+01, 0.0000e+00,
        7.0000e+00]])

In [124]:
X_train=X_train.drop([4052,2759,6113,5853,7447,4485,6865,3628],axis=0)

In [114]:
X_train=pd.DataFrame(X_train)
y_train=pd.DataFrame(y_train)

y_train=y_train.drop([10317,4052,2759,6113,5853,7447,4485,6865,3628],axis=0)
X_train=X_train.drop([10317,4052,2759,6113,5853,7447,4485,6865,3628],axis=0)

KeyError: '[10317, 4052, 2759, 6113, 5853, 7447, 4485, 6865, 3628] not found in axis'

In [151]:
model=LinearRegression()
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
df['score'].value_counts()[77]

In [139]:
X_train

Unnamed: 0,0,1,2,3,4,5,6
0,8663.0,42169.0,83.0,18.0,90.0,1.0,2.0
1,4213.0,12160.0,109.0,5.0,34.0,0.0,1.0
2,2359.0,19864.0,63.0,22.0,96.0,0.0,1.0
3,19511.0,40625.0,70.0,28.0,26.0,0.0,2.0
4,3576.0,24959.0,89.0,19.0,13.0,0.0,1.0
...,...,...,...,...,...,...,...
7477,7610.0,98701.0,94.0,22.0,66.0,0.0,3.0
7478,35186.0,86134.0,98.0,17.0,78.0,0.0,2.0
7479,4241.0,19834.0,64.0,26.0,8.0,4.0,8.0
7480,12941.0,77060.0,106.0,23.0,90.0,0.0,2.0


In [137]:
y_train["total_claim_amount"].unique()

array([5.98645201, 6.19236249, 5.86363118, ..., 7.92262357, 6.94793707,
       6.82979374])

In [142]:
y_train

Unnamed: 0,total_claim_amount
10237,5.986452
2689,6.192362
4489,5.863631
10401,6.196444
749,6.461468
...,...
5734,4.369448
5191,1.098612
5390,6.131226
860,6.148468
