In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

In [2]:
def wrangle(filepath):
    df = pd.read_csv(filepath)
    
    return df

In [3]:
df = wrangle("./Datasets/train.csv")

In [4]:
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [5]:
df.select_dtypes("object").head()

Unnamed: 0,Gender,Vehicle_Age,Vehicle_Damage
0,Male,> 2 Years,Yes
1,Male,1-2 Year,No
2,Male,> 2 Years,Yes
3,Male,< 1 Year,No
4,Female,< 1 Year,No


In [6]:
map_col = {"Gender": {"Male": 1, "Female": 0},
          "Vehicle_Age": {"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2},
          "Vehicle_Damage": {"Yes": 1, "No": 0}}

In [7]:
df.replace(map_col, inplace=True)

In [8]:
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,1,44,1,28.0,0,2,1,40454.0,26.0,217,1
1,2,1,76,1,3.0,0,1,0,33536.0,26.0,183,0
2,3,1,47,1,28.0,0,2,1,38294.0,26.0,27,1
3,4,1,21,1,11.0,1,0,0,28619.0,152.0,203,0
4,5,0,29,1,41.0,1,0,0,27496.0,152.0,39,0


In [9]:
df.isnull().sum()

id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

In [10]:
df["Vehicle_Age"].unique()

array([2, 1, 0], dtype=int64)

# Splitting

In [11]:
target = "Response"
X = df.drop(columns=[target, "id"])
y = df[target]

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

In [13]:
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(285831, 10)
(95278, 10)
(285831,)
(95278,)


# Model Training

In [14]:
dataframe_model = pd.DataFrame(columns=["Model Name","Accuracy Score"])

# Logistic Regression Model

In [15]:
lr_model = LogisticRegression()

In [16]:
lr_model.fit(X_train, y_train)

LogisticRegression()

In [17]:
prediction = lr_model.predict(X_train)

In [18]:
score = accuracy_score(y_train, prediction)
print(score)

0.8734846815076042


In [19]:
new_row = {"Model Name":"Logistic Regression", "Accuracy Score":score}
dataframe_model = dataframe_model.append(new_row, ignore_index=True)

  dataframe_model = dataframe_model.append(new_row, ignore_index=True)


# Decision Tree Classifier

In [20]:
dtc_model = DecisionTreeClassifier()

In [21]:
dtc_model.fit(X_train, y_train)

DecisionTreeClassifier()

In [22]:
prediction = dtc_model.predict(X_train)

In [23]:
score = accuracy_score(y_train, prediction)
print(score)

0.9998950428749855


In [24]:
new_row = {"Model Name":"Decision Tree Classifier", "Accuracy Score":score}
dataframe_model = dataframe_model.append(new_row, ignore_index=True)

  dataframe_model = dataframe_model.append(new_row, ignore_index=True)


# Support Vector Classifier

In [25]:
svc_model = SVC()

In [None]:
svc_model.fit(X_train, y_train)

In [None]:
prediction = svc_model.predict(X_train)