In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('synthetic_insurance_data.csv')
df.head()

Unnamed: 0,Age,Is_Senior,Marital_Status,Married_Premium_Discount,Prior_Insurance,Prior_Insurance_Premium_Adjustment,Claims_Frequency,Claims_Severity,Claims_Adjustment,Policy_Type,...,Time_Since_First_Contact,Conversion_Status,Website_Visits,Inquiries,Quotes_Requested,Time_to_Conversion,Credit_Score,Premium_Adjustment_Credit,Region,Premium_Adjustment_Region
0,47,0,Married,86,1-5 years,50,0,Low,0,Full Coverage,...,10,0,5,1,2,99,704,-50,Suburban,50
1,37,0,Married,86,1-5 years,50,0,Low,0,Full Coverage,...,22,0,5,1,2,99,726,-50,Urban,100
2,49,0,Married,86,1-5 years,50,1,Low,50,Full Coverage,...,28,0,4,4,1,99,772,-50,Urban,100
3,62,1,Married,86,>5 years,0,1,Low,50,Full Coverage,...,4,1,6,2,2,2,809,-50,Urban,100
4,36,0,Single,0,>5 years,0,2,Low,100,Full Coverage,...,14,1,8,4,2,10,662,50,Suburban,50


In [4]:
df.shape

(10000, 27)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 27 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 10000 non-null  int64 
 1   Is_Senior                           10000 non-null  int64 
 2   Marital_Status                      10000 non-null  object
 3   Married_Premium_Discount            10000 non-null  int64 
 4   Prior_Insurance                     10000 non-null  object
 5   Prior_Insurance_Premium_Adjustment  10000 non-null  int64 
 6   Claims_Frequency                    10000 non-null  int64 
 7   Claims_Severity                     10000 non-null  object
 8   Claims_Adjustment                   10000 non-null  int64 
 9   Policy_Type                         10000 non-null  object
 10  Policy_Adjustment                   10000 non-null  int64 
 11  Premium_Amount                      10000 non-null  int

In [7]:
df["Conversion_Status"]

0       0
1       0
2       0
3       1
4       1
       ..
9995    1
9996    1
9997    1
9998    1
9999    1
Name: Conversion_Status, Length: 10000, dtype: int64

In [16]:
int_columns = df.select_dtypes(include='int64').columns
int_columns

Index(['Age', 'Is_Senior', 'Married_Premium_Discount',
       'Prior_Insurance_Premium_Adjustment', 'Claims_Frequency',
       'Claims_Adjustment', 'Policy_Adjustment', 'Premium_Amount',
       'Safe_Driver_Discount', 'Multi_Policy_Discount', 'Bundling_Discount',
       'Total_Discounts', 'Time_Since_First_Contact', 'Conversion_Status',
       'Website_Visits', 'Inquiries', 'Quotes_Requested', 'Time_to_Conversion',
       'Credit_Score', 'Premium_Adjustment_Credit',
       'Premium_Adjustment_Region'],
      dtype='object')

Index(['Age', 'Is_Senior', 'Marital_Status', 'Married_Premium_Discount',
       'Prior_Insurance', 'Prior_Insurance_Premium_Adjustment',
       'Claims_Frequency', 'Claims_Severity', 'Claims_Adjustment',
       'Policy_Type', 'Policy_Adjustment', 'Premium_Amount',
       'Safe_Driver_Discount', 'Multi_Policy_Discount', 'Bundling_Discount',
       'Total_Discounts', 'Source_of_Lead', 'Time_Since_First_Contact',
       'Conversion_Status', 'Website_Visits', 'Inquiries', 'Quotes_Requested',
       'Time_to_Conversion', 'Credit_Score', 'Premium_Adjustment_Credit',
       'Region', 'Premium_Adjustment_Region'],
      dtype='object')

In [11]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 27 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 10000 non-null  int64 
 1   Is_Senior                           10000 non-null  int64 
 2   Marital_Status                      10000 non-null  object
 3   Married_Premium_Discount            10000 non-null  int64 
 4   Prior_Insurance                     10000 non-null  object
 5   Prior_Insurance_Premium_Adjustment  10000 non-null  int64 
 6   Claims_Frequency                    10000 non-null  int64 
 7   Claims_Severity                     10000 non-null  object
 8   Claims_Adjustment                   10000 non-null  int64 
 9   Policy_Type                         10000 non-null  object
 10  Policy_Adjustment                   10000 non-null  int64 
 11  Premium_Amount                      10000 non-null  int

In [4]:
obj_columns = df.select_dtypes(include='object').columns
obj_columns

Index(['Marital_Status', 'Prior_Insurance', 'Claims_Severity', 'Policy_Type',
       'Source_of_Lead', 'Region'],
      dtype='object')

In [10]:
df["Source_of_Lead"].value_counts()

Source_of_Lead
Online      6035
Agent       3004
Referral     961
Name: count, dtype: int64

In [7]:
df["Prior_Insurance"].value_counts()

Prior_Insurance
1-5 years    5257
>5 years     2609
<1 year      2134
Name: count, dtype: int64

In [8]:
df["Claims_Severity"].value_counts()

Claims_Severity
Low       7003
Medium    2038
High       959
Name: count, dtype: int64

In [9]:
df["Policy_Type"].value_counts()

Policy_Type
Full Coverage     6007
Liability-Only    3993
Name: count, dtype: int64

In [26]:
data = ['Age', 'Is_Senior', 'Credit_Score', 'Prior_Insurance', 'Claims_Frequency', 'Policy_Type', 'Premium_Amount', 'Total_Discounts', 'Source_of_Lead', 'Website_Visits', 'Inquiries', 'Quotes_Requested', 'Time_Since_First_Contact']
encoder = LabelEncoder()

df['Prior_Insurance'] = encoder.fit_transform(df['Prior_Insurance'])
df["Policy_Type"] = encoder.fit_transform(df['Policy_Type'])
df["Source_of_Lead"] = encoder.fit_transform(df['Source_of_Lead'])


In [27]:
X = df[data]
y = df['Conversion_Status']

In [28]:
model = KNeighborsClassifier(n_neighbors=3)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [30]:
model.fit(X_train, y_train)

In [31]:
y_pred = model.predict(X_test)

In [32]:
# Print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')

# Print the classification report
class_report = classification_report(y_test, y_pred)
print(f'Classification Report:\n{class_report}')

Accuracy: 0.517
Confusion Matrix:
[[ 504  779]
 [ 670 1047]]
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.39      0.41      1283
           1       0.57      0.61      0.59      1717

    accuracy                           0.52      3000
   macro avg       0.50      0.50      0.50      3000
weighted avg       0.51      0.52      0.51      3000



In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 27 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 10000 non-null  int64 
 1   Is_Senior                           10000 non-null  int64 
 2   Marital_Status                      10000 non-null  object
 3   Married_Premium_Discount            10000 non-null  int64 
 4   Prior_Insurance                     10000 non-null  object
 5   Prior_Insurance_Premium_Adjustment  10000 non-null  int64 
 6   Claims_Frequency                    10000 non-null  int64 
 7   Claims_Severity                     10000 non-null  object
 8   Claims_Adjustment                   10000 non-null  int64 
 9   Policy_Type                         10000 non-null  object
 10  Policy_Adjustment                   10000 non-null  int64 
 11  Premium_Amount                      10000 non-null  int