### Add scripts path to the notebook

In [1]:
import sys
import os

# Locate the project directories relative to the notebook's working directory.
# NOTE: this assumes the kernel is started from a sub-directory of the project
# (e.g. notebooks/), so that the parent directory is the project root.
current_dir = os.getcwd()
print(current_dir)

# Parent of the working directory and its sibling 'scripts' directory
parent_dir = os.path.dirname(current_dir)
scripts_path = os.path.join(parent_dir, 'scripts')

# Make both directories importable. Each path is inserted only when it is not
# already present, so re-running this cell does not pile up duplicate entries.
# scripts_path is handled last so that it ends up ahead of parent_dir.
# (The former extra append of abspath('..') pointed at parent_dir again and
# has been dropped.)
for extra_path in (parent_dir, scripts_path):
    if extra_path not in sys.path:
        sys.path.insert(0, extra_path)

d:\KifiyaAIM-Course\Week - 3\ACIS-Car-Insurance-Claim-Analysis\notebooks


### Import Statements

In [2]:
import math
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

### Load the data

In [3]:
# Location of the raw dataset, given relative to the notebook's working directory
PATH_TO_DATA = "../data/MachineLearningRating_v3.txt"

In [4]:
# Load the pipe-separated file; low_memory=False stops pandas from parsing the
# file in chunks, which avoids mixed-type inference within a column
data = pd.read_csv(PATH_TO_DATA, sep='|', low_memory=False)

### Data Cleaning

1) Find missing values

In [6]:
# Fraction of missing entries in every column (0.0 - 1.0)
column_na_rations = data.isnull().mean()

# Display only the columns that have gaps, as percentages, worst first
column_na_rations.loc[column_na_rations.gt(0)].sort_values(ascending=False).mul(100)

NumberOfVehiclesInFleet    100.000000
CrossBorder                 99.930207
CustomValueEstimate         77.956560
WrittenOff                  64.183810
Converted                   64.183810
Rebuilt                     64.183810
NewVehicle                  15.327998
Bank                        14.594670
AccountType                  4.022806
Gender                       0.953507
MaritalStatus                0.825819
mmcode                       0.055195
VehicleType                  0.055195
make                         0.055195
VehicleIntroDate             0.055195
NumberOfDoors                0.055195
bodytype                     0.055195
kilowatts                    0.055195
cubiccapacity                0.055195
Cylinders                    0.055195
Model                        0.055195
CapitalOutstanding           0.000200
dtype: float64

2) Handle missing values

- Drop columns with a large share of missing values, i.e. more than 50%

In [12]:
# Names of the columns in which more than half of the values are missing
to_be_droped = column_na_rations.loc[column_na_rations.gt(0.5)].index

# Remove those columns from the dataset
data = data.drop(columns=to_be_droped)

- Remove the rows that have missing values in columns where less than 5% of the values are missing

In [13]:
# Names of the columns in which fewer than 5% of the values are missing
# (fully populated columns match as well, which is harmless for dropna)
to_be_droped = column_na_rations.loc[column_na_rations.lt(0.05)].index

# Discard every row that has a missing value in any of those columns
data = data.dropna(subset=to_be_droped)

- For the columns in between, replace the missing values with the mode or the mean (depending on the column's data type)

In [21]:
# Names of the columns whose missing share lies between 5% and 50%
# (both bounds included)
to_be_replaced = column_na_rations.loc[column_na_rations.between(0.05, 0.5)].index

Find the data type of those columns

In [22]:
# Data types of the columns that still need imputation
data.loc[:, list(to_be_replaced)].dtypes

Bank          object
NewVehicle    object
dtype: object

Both columns are categorical, so I will fill the missing values with their respective modes.

In [24]:
# Most frequent value of each column that needs imputation
fill_values = {column: data[column].mode().iloc[0] for column in to_be_replaced}

# Replace the missing entries of each column with that value
for column, mode_value in fill_values.items():
    data[column] = data[column].fillna(mode_value)

Finally, check that no missing values remain.

In [25]:
# Recompute the fraction of missing entries per column on the cleaned data
column_na_rations = data.isnull().mean()

# Any column still listed here has gaps (shown as percentages, worst first);
# an empty result means the dataset is complete
column_na_rations.loc[column_na_rations.gt(0)].sort_values(ascending=False).mul(100)

Series([], dtype: float64)