In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv('/Users/noachmeged/Documents/Ironhack/Labs/lab-customer-analysis-round-7/files_for_lab/csv_files/marketing_customer_analysis.csv')
data

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.431650,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,LA72316,California,23405.987980,No,Basic,Bachelor,2/10/11,Employed,M,71941,...,89,0,2,Personal Auto,Personal L1,Offer2,Web,198.234764,Four-Door Car,Medsize
9130,PK87824,California,3096.511217,Yes,Extended,College,2/12/11,Employed,F,21604,...,28,0,1,Corporate Auto,Corporate L3,Offer1,Branch,379.200000,Four-Door Car,Medsize
9131,TD14365,California,8163.890428,No,Extended,Bachelor,2/6/11,Unemployed,M,0,...,37,3,2,Corporate Auto,Corporate L2,Offer1,Branch,790.784983,Four-Door Car,Medsize
9132,UP19263,California,7524.442436,No,Extended,College,2/3/11,Employed,M,21941,...,3,0,3,Personal Auto,Personal L2,Offer3,Branch,691.200000,Four-Door Car,Large


In [4]:
# Standerdize
data.columns = data.columns.str.lower().str.replace(" ", "_")
data.columns

Index(['customer', 'state', 'customer_lifetime_value', 'response', 'coverage',
       'education', 'effective_to_date', 'employmentstatus', 'gender',
       'income', 'location_code', 'marital_status', 'monthly_premium_auto',
       'months_since_last_claim', 'months_since_policy_inception',
       'number_of_open_complaints', 'number_of_policies', 'policy_type',
       'policy', 'renew_offer_type', 'sales_channel', 'total_claim_amount',
       'vehicle_class', 'vehicle_size'],
      dtype='object')

In [7]:
numerical_data = data.select_dtypes("number")

categorical_data = data.select_dtypes("object")

In [10]:
X = data.drop(["customer", "total_claim_amount"], axis=1)
y = data["total_claim_amount"]

# Train-test split:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train


Unnamed: 0,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,income,location_code,...,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
434,Washington,5015.009472,Yes,Extended,Bachelor,2/3/11,Employed,F,48567,Suburban,...,12,15,0,1,Corporate Auto,Corporate L2,Offer2,Agent,SUV,Medsize
4641,Oregon,5149.301306,No,Extended,Bachelor,1/22/11,Employed,F,26877,Suburban,...,5,2,0,1,Personal Auto,Personal L2,Offer4,Agent,SUV,Small
4952,California,4904.894731,Yes,Extended,College,2/14/11,Retired,F,12902,Suburban,...,3,51,0,1,Personal Auto,Personal L3,Offer1,Agent,Sports Car,Medsize
1489,Arizona,8510.525936,No,Extended,College,1/8/11,Unemployed,F,0,Suburban,...,5,94,0,8,Personal Auto,Personal L2,Offer2,Branch,Sports Car,Medsize
812,Arizona,3278.531880,No,Extended,Doctor,2/19/11,Employed,M,70247,Rural,...,13,19,1,1,Personal Auto,Personal L2,Offer4,Call Center,Four-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,California,7334.328083,No,Basic,College,1/29/11,Employed,F,87957,Suburban,...,31,63,0,2,Personal Auto,Personal L3,Offer3,Agent,Four-Door Car,Small
5191,California,5498.940679,Yes,Basic,College,2/5/11,Medical Leave,M,22520,Suburban,...,17,64,0,3,Personal Auto,Personal L3,Offer1,Branch,Four-Door Car,Medsize
5390,California,8992.779137,No,Extended,High School or Below,1/27/11,Unemployed,F,0,Suburban,...,13,4,0,7,Personal Auto,Personal L1,Offer1,Call Center,SUV,Medsize
860,Arizona,14635.451580,Yes,Extended,Bachelor,2/27/11,Unemployed,F,0,Suburban,...,5,56,0,2,Personal Auto,Personal L3,Offer1,Call Center,SUV,Medsize


In [33]:
# One Hot Encoding: mostly used for production-ready scripts

# Start the encoder object:
encoder = OneHotEncoder(drop='first')
# drop='first' parameter: with it we have the option to drop one of the columns,
# this is a good option since it would be an unecessary column for the model,
# but it can get tricky to put it back as a dataframe, so we won't do it for now.

# Fit the encoder object to the categorical part of the DataFrame:
encoder.fit(data[["state", "gender"]]) # notice that the input needs to be 2 dimentional, so if we used a Serie instead of a DataFrame we would have a shape related error
# Now that the encoder is fitted, you could export it, using the *pickle* library for example, and load it into other scripts.

In [34]:
encoder.transform(data[["state", "gender"]]).todense()


matrix([[0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        ...,
        [1., 0., 0., 0., 1.],
        [1., 0., 0., 0., 1.],
        [1., 0., 0., 0., 1.]])

In [35]:
}

SyntaxError: unmatched '}' (70646322.py, line 1)