In [1]:
import pandas as pd


In [2]:
# Read the numeric feature table and the cleaned categorical feature table.
# Both paths are relative to the notebook's working directory.
numerical_df, categorical_df_clean = (
    pd.read_csv(csv_name)
    for csv_name in ("numerical_df.csv", "categorical_df_clean.csv")
)


In [3]:
# Put the two tables side by side. Column-wise concat aligns rows on the index,
# so both frames are expected to share the same row index.
data = pd.concat([numerical_df, categorical_df_clean], axis="columns")


In [4]:
# Quick sanity check of the combined frame: dimensions first, then the first rows.
print(data.shape)
data.head(n=5)


(9134, 20)


Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,state,response,coverage,education,employment_status,gender,location_code,marital_status,policy_type,renew_offer_type,sales_channel,vehicle_class
0,2763.519279,56274,69,32,5,0,1,384.811147,Washington,No,Basic,Bachelor,Employed,F,Suburban,Married,Corporate Auto,Offer1,Agent,Two-Door Car
1,6979.535903,0,94,13,42,0,8,1131.464935,Arizona,No,Extended,Bachelor,Unemployed,F,Suburban,Single,Personal Auto,Offer3,Agent,Four-Door Car
2,12887.43165,48767,108,18,38,0,2,566.472247,Nevada,No,Premium,Bachelor,Employed,F,Suburban,Married,Personal Auto,Offer1,Agent,Two-Door Car
3,7645.861827,0,106,18,65,0,7,529.881344,California,No,Basic,Bachelor,Unemployed,M,Suburban,Married,Corporate Auto,Offer1,Call Center,SUV
4,2813.692575,43836,73,12,44,0,1,138.130879,Washington,No,Basic,Bachelor,Employed,M,Rural,Single,Personal Auto,Offer1,Agent,Four-Door Car


In [5]:
# Round the two monetary columns to two decimals; passing a dict to
# DataFrame.round leaves every other column untouched.
data = data.round({"customer_lifetime_value": 2, "total_claim_amount": 2})


In [6]:
# List the distinct education labels (in order of first appearance) before encoding.
pd.unique(data["education"])


array(['Bachelor', 'College', 'Master', 'High School or Below', 'Doctor'],
      dtype=object)

In [7]:
# Ordinal-encode education: the levels have a natural order, so they are mapped
# to ascending integers.
# The dtype guard makes this cell safe to re-run: mapping an already-encoded
# column would match none of the string keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data["education"]):
    education_codes = data["education"].map(
        {"High School or Below": 1, "College": 2, "Bachelor": 3, "Master": 4, "Doctor": 5}
    )
    # Series.map yields NaN for any label missing from the dict; fail loudly
    # instead of letting NaN flow silently into the later processing steps.
    unknown_levels = sorted(
        data.loc[education_codes.isna(), "education"].astype(str).unique()
    )
    if unknown_levels:
        raise ValueError(f"Unmapped education levels: {unknown_levels}")
    data["education"] = education_codes


In [8]:
# Confirm the education column now holds only the integer codes.
pd.unique(data["education"])

array([3, 2, 4, 1, 5], dtype=int64)

In [9]:
# List the distinct coverage labels (in order of first appearance) before encoding.
pd.unique(data["coverage"])


array(['Basic', 'Extended', 'Premium'], dtype=object)

In [10]:
# Ordinal-encode coverage: Basic < Extended < Premium, mapped to ascending integers.
# The dtype guard makes this cell safe to re-run: mapping an already-encoded
# column would match none of the string keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data["coverage"]):
    coverage_codes = data["coverage"].map({"Basic": 1, "Extended": 2, "Premium": 3})
    # Series.map yields NaN for any label missing from the dict; fail loudly
    # instead of letting NaN flow silently into the later processing steps.
    unknown_tiers = sorted(
        data.loc[coverage_codes.isna(), "coverage"].astype(str).unique()
    )
    if unknown_tiers:
        raise ValueError(f"Unmapped coverage levels: {unknown_tiers}")
    data["coverage"] = coverage_codes


In [11]:
# Confirm the coverage column now holds only the integer codes.
pd.unique(data["coverage"])


array([1, 2, 3], dtype=int64)

In [12]:
# Separate the regression target from the features.
# drop() returns a new frame, so `data` itself keeps the target column.
y = data["total_claim_amount"]
X = data.drop(columns=["total_claim_amount"])


In [19]:
# Re-check the full frame's dimensions: X was built with a non-inplace drop(),
# so `data` should still contain every column, including the target.
data.shape


(9134, 20)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)


In [14]:
import numpy as np

# Split each partition by dtype. Numeric columns (including the ordinal-encoded
# education and coverage) are scaled later; object (string) columns are
# one-hot encoded later.
X_train_num, X_test_num = (
    part.select_dtypes(include="number") for part in (X_train, X_test)
)
X_train_cat, X_test_cat = (
    part.select_dtypes(include="object") for part in (X_train, X_test)
)


In [21]:
# Print the shape of every partition, one per line: the train and test column
# counts must agree within each dtype group.
print(
    *(part.shape for part in (X_train_num, X_test_num, X_train_cat, X_test_cat)),
    sep="\n",
)


(7307, 9)
(1827, 9)
(7307, 10)
(1827, 10)


In [15]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features. The scaler is fitted on the training
# partition only and then reused on the test partition, so test statistics
# never influence the scaling.
transformer = StandardScaler()
X_train_num_transformed = transformer.fit_transform(X_train_num)
X_test_num_transformed = transformer.transform(X_test_num)


In [16]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the string columns. drop="first" omits the first level of each
# feature, and handle_unknown="error" raises if the test partition contains a
# category that was not seen during fitting.
encoder = OneHotEncoder(drop="first", handle_unknown="error")
# The encoder is fitted on the training partition only; .toarray() converts the
# sparse result into a dense array.
X_train_cat_encoded = encoder.fit_transform(X_train_cat).toarray()
X_test_cat_encoded = encoder.transform(X_test_cat).toarray()


In [17]:
# Build the final design matrices: scaled numeric columns first, followed by
# the one-hot columns, stacked horizontally (row order is unchanged).
X_train_concatenated = np.hstack((X_train_num_transformed, X_train_cat_encoded))
X_test_concatenated = np.hstack((X_test_num_transformed, X_test_cat_encoded))


In [18]:
# Wrap the training matrix in a DataFrame purely for display; columns are
# positional (0..n-1) because the array carries no feature names.
pd.DataFrame(data=X_train_concatenated)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,-0.393751,1.430320,-0.674132,0.487904,-1.571541,4.007612,0.014021,-0.729204,-0.266720,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.793486,0.140556,-0.791335,-0.007017,0.036557,0.685590,-0.825766,-0.729204,1.591166,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.461832,-0.009575,0.849512,0.784856,-0.356534,0.685590,-0.825766,0.801475,0.662223,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.001747,-1.243270,0.409999,-1.194827,0.608325,-0.421750,0.014021,-0.729204,-0.266720,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.781660,1.404364,0.556503,-0.897874,-0.285063,0.685590,-0.405872,-0.729204,0.662223,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7302,-0.462069,-0.028275,-0.937839,0.685872,1.608920,-0.421750,0.014021,-0.729204,-1.195663,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7303,-0.687850,1.078938,-0.293221,0.982824,-1.249922,-0.421750,-0.825766,0.801475,0.662223,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7304,-0.304788,1.733714,-0.556928,0.190951,-0.428005,-0.421750,2.113491,-0.729204,-0.266720,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
7305,-0.445663,1.251562,-0.820636,-0.996858,0.715532,2.900271,2.533384,-0.729204,0.662223,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
