In [1]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.set_option('display.max_colwidth', None)
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings('ignore')

In [82]:
df = pd.read_csv('insurancePrediction.csv')

In [46]:
df.shape

(278860, 20)

In [47]:
df.isna().sum()

Age                      4685
Gender                      0
Annual Income           13955
Marital Status           5019
Number of Dependents    27886
Education Level             0
Occupation              81288
Health Score            10597
Location                    0
Policy Type                 0
Previous Claims         81288
Vehicle Age                 0
Credit Score            27886
Insurance Duration          0
Premium Amount           1841
Policy Start Date           0
Customer Feedback       18349
Smoking Status              0
Exercise Frequency          0
Property Type               0
dtype: int64

In [48]:
df['Previous Claims'].value_counts()

Previous Claims
1.0    72925
0.0    72793
2.0    35983
3.0    12065
4.0     3063
5.0      625
6.0       94
7.0       21
9.0        2
8.0        1
Name: count, dtype: int64

In [43]:
df['Previous Claims'].isna().sum()

81288

In [44]:
df.shape[0]

278860

In [15]:
81/280

0.2892857142857143

In [None]:
# Age                      4685
# Annual Income           13955
# Marital Status           5019
# Number of Dependents    27886
# Occupation              81288
# Health Score            10597
# Previous Claims         81288
# Credit Score            27886
# Premium Amount           1841  <- target column: rows missing it are dropped, not imputed
# Customer Feedback       18349

In [39]:
df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,56.0,Male,99990.0,Married,1.0,Master's,,31.074627,Urban,Comprehensive,,13,320.0,5,308.0,2022-12-10 15:21:39.078837,Poor,Yes,Daily,Condo
1,46.0,Male,2867.0,Single,1.0,Bachelor's,,50.271335,Urban,Comprehensive,,3,694.0,4,517.0,2023-01-31 15:21:39.078837,Good,Yes,Monthly,House
2,32.0,Female,30154.0,Divorced,3.0,Bachelor's,,14.714909,Suburban,Comprehensive,2.0,16,652.0,8,849.0,2023-11-26 15:21:39.078837,Poor,No,Monthly,House
3,60.0,Female,48371.0,Divorced,0.0,PhD,Self-Employed,25.346926,Rural,Comprehensive,1.0,11,330.0,7,927.0,2023-02-27 15:21:39.078837,Poor,No,Rarely,Condo
4,25.0,Female,54174.0,Divorced,0.0,High School,Self-Employed,6.659499,Urban,Comprehensive,,9,,8,303.0,2020-11-25 15:21:39.078837,Poor,No,Rarely,Condo


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 196271 entries, 3 to 278859
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Age                   192986 non-null  float64
 1   Gender                196271 non-null  object 
 2   Annual Income         186421 non-null  float64
 3   Marital Status        192712 non-null  object 
 4   Number of Dependents  176719 non-null  float64
 5   Education Level       196271 non-null  object 
 6   Occupation            196271 non-null  object 
 7   Health Score          188839 non-null  float64
 8   Location              196271 non-null  object 
 9   Policy Type           196271 non-null  object 
 10  Previous Claims       139068 non-null  float64
 11  Vehicle Age           196271 non-null  int64  
 12  Credit Score          176602 non-null  float64
 13  Insurance Duration    196271 non-null  int64  
 14  Premium Amount        196271 non-null  float64
 15  Polic

In [83]:
class InsuranceDataCleaning:
    """Clean the raw insurance frame: drop null targets, fix dtypes, handle missing values.

    The instance works on its own copy of the frame it is given, so the
    caller's DataFrame is never modified.
    """

    def __init__(self, df):
        """Store a copy of ``df`` and record the name of the target column."""
        self.df = df.copy()
        self.target = 'Premium Amount'

    def remove_nulls(self, column_name):
        """Drop rows that have a null in ``column_name`` and renumber the index.

        Args:
            column_name: A column label or list of labels, forwarded to
                ``DataFrame.dropna(subset=...)``.
        """
        self.df = self.df.dropna(subset=column_name).reset_index(drop=True)

    def convert_datetime(self, column):
        """Parse ``column`` into datetimes with ``pd.to_datetime``."""
        self.df[column] = pd.to_datetime(self.df[column])

    def to_categorical(self, column_list):
        """Cast every column named in ``column_list`` to the ``category`` dtype."""
        for col_name in column_list:
            self.df[col_name] = self.df[col_name].astype('category')

    def handle_missing(self, drop_col_threshold=0.35, drop_row_threshold=0.05):
        """Resolve missing values column by column, based on the missing ratio.

        For every ``int64``/``float64`` and ``category`` column:

        * ratio >= ``drop_col_threshold``: the whole column is dropped;
        * ratio <= ``drop_row_threshold``: rows with a null in it are dropped;
        * otherwise: nulls are filled with the column mean (numeric) or the
          most frequent category (categorical).

        Fill values are computed before any rows are dropped, so they are
        based on the frame as it is when this method is called.

        Args:
            drop_col_threshold: Missing ratio at or above which a column is
                removed. Defaults to 0.35.
            drop_row_threshold: Missing ratio at or below which the affected
                rows are removed. Defaults to 0.05.
        """
        total_count = self.df.shape[0]
        if total_count == 0:
            # No rows: nothing to impute, and the ratio would divide by zero.
            return

        numerical_cols = self.df.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_cols = self.df.select_dtypes(include=['category']).columns.tolist()

        remove_col = []
        remove_rows = []
        # Numeric columns first, then categorical, matching the original order.
        for col_name in numerical_cols + categorical_cols:
            column = self.df[col_name]
            missing_ratio = column.isna().sum() / total_count
            if missing_ratio >= drop_col_threshold:
                remove_col.append(col_name)
                continue
            if missing_ratio <= drop_row_threshold:
                remove_rows.append(col_name)
                continue

            if col_name in numerical_cols:
                fill_value = column.mean()
            else:
                # value_counts() is sorted by frequency, so the first label is the mode.
                fill_value = column.value_counts().index[0]
            # Assign back instead of a chained ``fillna(inplace=True)``: the chained
            # form operates on a temporary and does not update the frame under
            # pandas Copy-on-Write.
            self.df[col_name] = column.fillna(fill_value)

        self.remove_nulls(remove_rows)
        self.df = self.df.drop(columns=remove_col)

    def preprocess(self):
        """Run the full cleaning pipeline and return the cleaned frame.

        Steps: drop rows with a null target, parse 'Policy Start Date', cast
        object columns plus three count-like columns to ``category``, then
        apply ``handle_missing`` with its default thresholds.

        Returns:
            The cleaned DataFrame held on ``self.df``.
        """
        self.remove_nulls([self.target])
        self.convert_datetime('Policy Start Date')
        object_cols = self.df.select_dtypes(include=['object']).columns.tolist()
        self.to_categorical(
            object_cols + ['Number of Dependents', 'Previous Claims', 'Insurance Duration']
        )
        self.handle_missing()
        return self.df
        

In [84]:
preprocessor = InsuranceDataCleaning(df)
cleaned_df = preprocessor.preprocess()

In [85]:
cleaned_df.shape

(257319, 20)

In [90]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257319 entries, 0 to 257318
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   Age                   257319 non-null  float64       
 1   Gender                257319 non-null  category      
 2   Annual Income         257319 non-null  float64       
 3   Marital Status        257319 non-null  category      
 4   Number of Dependents  257319 non-null  category      
 5   Education Level       257319 non-null  category      
 6   Occupation            257319 non-null  category      
 7   Health Score          257319 non-null  float64       
 8   Location              257319 non-null  category      
 9   Policy Type           257319 non-null  category      
 10  Previous Claims       257319 non-null  category      
 11  Vehicle Age           257319 non-null  int64         
 12  Credit Score          257319 non-null  float64       
 13 

In [93]:
import json

# Map each column of the cleaned frame to the name of its dtype.
dtype_dict = {column: dtype.name for column, dtype in cleaned_df.dtypes.items()}

# Take the datetime column out of the mapping; show its dtype and the rest.
policy_date_dtype = dtype_dict.pop('Policy Start Date')
print(policy_date_dtype)
print(dtype_dict)

# Persist the remaining column -> dtype mapping as JSON.
with open('dtypes.json', 'w') as f:
    json.dump(dtype_dict, f)

datetime64[ns]
{'Age': 'float64', 'Gender': 'category', 'Annual Income': 'float64', 'Marital Status': 'category', 'Number of Dependents': 'category', 'Education Level': 'category', 'Occupation': 'category', 'Health Score': 'float64', 'Location': 'category', 'Policy Type': 'category', 'Previous Claims': 'category', 'Vehicle Age': 'int64', 'Credit Score': 'float64', 'Insurance Duration': 'category', 'Premium Amount': 'float64', 'Customer Feedback': 'category', 'Smoking Status': 'category', 'Exercise Frequency': 'category', 'Property Type': 'category'}


In [89]:
cleaned_df.to_csv('cleaned_df_insurance.csv', index = False)

In [88]:
((278860 - 257319)/278860)*100

7.724664706304239

In [86]:
cleaned_df.isna().sum()

Age                     0
Gender                  0
Annual Income           0
Marital Status          0
Number of Dependents    0
Education Level         0
Occupation              0
Health Score            0
Location                0
Policy Type             0
Previous Claims         0
Vehicle Age             0
Credit Score            0
Insurance Duration      0
Premium Amount          0
Policy Start Date       0
Customer Feedback       0
Smoking Status          0
Exercise Frequency      0
Property Type           0
dtype: int64

In [None]:
class InsuranceFeatureEngineering:
    """Holds a private copy of a frame; no feature-engineering methods are defined here yet."""

    def __init__(self, df):
        """Keep a deep copy of ``df`` on ``self.df`` so the caller's frame is untouched."""
        frame_copy = df.copy(deep=True)
        self.df = frame_copy
        

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

class InsuranceDataPreprocessor:
    """Impute, label-encode and date-expand an insurance frame.

    Works on a copy of the frame passed in. Column lists are computed once,
    at construction time, from the dtypes of that copy.
    """

    # Raw date column; handled by convert_dates(), never imputed or label-encoded.
    DATE_COLUMN = 'Policy Start Date'

    def __init__(self, df):
        """Copy ``df`` and split its columns into categorical and numerical lists.

        The date column is kept out of ``categorical_cols`` even though it is
        read as ``object``: label-encoding it would turn the date strings into
        integer codes, which ``pd.to_datetime`` would then parse as epoch
        offsets instead of real dates.
        """
        self.df = df.copy()
        self.categorical_cols = [
            col
            for col in self.df.select_dtypes(include=['object']).columns.tolist()
            if col != self.DATE_COLUMN
        ]
        self.numerical_cols = self.df.select_dtypes(include=['number']).columns.tolist()
        self.label_encoders = {}

    def handle_missing_values(self):
        """Fill numeric nulls with the median and categorical nulls with the mode."""
        # SimpleImputer cannot be fitted on zero columns, so skip empty lists.
        if self.numerical_cols:
            num_imputer = SimpleImputer(strategy='median')
            self.df[self.numerical_cols] = num_imputer.fit_transform(self.df[self.numerical_cols])

        if self.categorical_cols:
            cat_imputer = SimpleImputer(strategy='most_frequent')
            self.df[self.categorical_cols] = cat_imputer.fit_transform(self.df[self.categorical_cols])

    def encode_categorical_features(self):
        """Label-encode each categorical column, keeping the fitted encoder.

        Encoders are stored in ``self.label_encoders`` keyed by column name.
        """
        for col in self.categorical_cols:
            le = LabelEncoder()
            self.df[col] = le.fit_transform(self.df[col])
            self.label_encoders[col] = le

    def convert_dates(self):
        """Replace the date column with year, month and day columns.

        Does nothing when the date column is absent. Values that cannot be
        parsed are coerced to ``NaT``, so their derived parts are missing.
        """
        if self.DATE_COLUMN in self.df.columns:
            parsed = pd.to_datetime(self.df[self.DATE_COLUMN], errors='coerce')
            self.df['Policy Start Year'] = parsed.dt.year
            self.df['Policy Start Month'] = parsed.dt.month
            self.df['Policy Start Day'] = parsed.dt.day
            self.df = self.df.drop(columns=[self.DATE_COLUMN])

    def preprocess(self):
        """Run imputation, label encoding and date expansion; return the frame."""
        self.handle_missing_values()
        self.encode_categorical_features()
        self.convert_dates()
        return self.df

# Usage Example
# df = pd.read_csv('insurance_data.csv')
# preprocessor = InsuranceDataPreprocessor(df)
# cleaned_df = preprocessor.preprocess()
# print(cleaned_df.head())
