In [1]:
# Import the libraries used by the notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Ignore warnings
# NOTE(review): called without a category or module filter, this suppresses every
# warning for the rest of the session -- not only harmless ones. Deprecation
# notices raised by pandas are hidden as well.

import warnings
warnings.filterwarnings("ignore")

# Show every column when a DataFrame is displayed (no column truncation)

pd.set_option("display.max_columns", None)

# pandasql: run SQL queries against DataFrames
# NOTE(review): `psql`, `np`, `plt` and `sns` are not referenced in the cells
# visible in this section -- confirm they are needed further down.

import pandasql as psql

In [6]:
# Read the claims CSV into a DataFrame; row 0 of the file supplies the column names.
# NOTE(review): the path is absolute and machine-specific, so the notebook only
# runs where this exact file exists.

claims_csv_path = r"C:\Users\badda\Downloads\Claims_Data_Missing_Values.csv"
Claims = pd.read_csv(claims_csv_path, header=0)

# Show the loaded frame

Claims

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26.0,Truck,,1052.25,8956.15
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35.0,Car,0.0,980.85,
4,CL-1712003,140846711.0,,Car,,1567.45,2934.25
5,CL-1518709,126885444.0,,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61.0,,0.0,,1492.35
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53.0,Truck,0.0,1252.25,


In [7]:
# Summarise the frame: number of rows, non-null count and dtype per column, memory usage

Claims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Claim_ID     21 non-null     object 
 1   Policy_Num   20 non-null     float64
 2   Driver_Age   15 non-null     float64
 3   Policy_Type  20 non-null     object 
 4   Pre_Claims   17 non-null     float64
 5   Premium_AMT  17 non-null     float64
 6   Claim_AMT    16 non-null     float64
dtypes: float64(5), object(2)
memory usage: 1.3+ KB


In [8]:
# Show the dtype of each column (no dtypes were specified when the CSV was read)

Claims.dtypes

Claim_ID        object
Policy_Num     float64
Driver_Age     float64
Policy_Type     object
Pre_Claims     float64
Premium_AMT    float64
Claim_AMT      float64
dtype: object

In [9]:
# Count the missing (null) entries in each column

Claims.isna().sum()

Claim_ID       0
Policy_Num     1
Driver_Age     6
Policy_Type    1
Pre_Claims     4
Premium_AMT    4
Claim_AMT      5
dtype: int64

In [10]:
# Split the column names into a numeric group and a non-numeric (categorical) group.
# select_dtypes is used instead of comparing dtypes with 'object' so the split
# stays correct when text columns are stored with a dedicated string dtype
# rather than the generic object dtype.

num_vars = Claims.select_dtypes(include='number').columns
cat_vars = Claims.select_dtypes(exclude='number').columns
print(num_vars)
print(cat_vars)

Index(['Policy_Num', 'Driver_Age', 'Pre_Claims', 'Premium_AMT', 'Claim_AMT'], dtype='object')
Index(['Claim_ID', 'Policy_Type'], dtype='object')


# Create New DataFrame - Claims_01

In [12]:
# Build Claims_01: a new DataFrame in which every null value of Claims is replaced by 0.
# The same replacement is applied to all columns, so a missing Policy_Type also becomes 0.

Claims_01 = Claims.fillna(value=0)

# Show the filled frame

Claims_01

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26.0,Truck,0.0,1052.25,8956.15
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35.0,Car,0.0,980.85,0.0
4,CL-1712003,140846711.0,0.0,Car,0.0,1567.45,2934.25
5,CL-1518709,126885444.0,0.0,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61.0,0,0.0,0.0,1492.35
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,0.0,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53.0,Truck,0.0,1252.25,0.0


In [13]:
# Cast 'Driver_Age', 'Policy_Num' and 'Pre_Claims' from float to integer
# (possible here because the preceding fill left no nulls in these columns)

Claims_01 = Claims_01.astype({'Driver_Age': int,
                              'Policy_Num': int,
                              'Pre_Claims': int})

# Show the converted frame

Claims_01

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871,25,Car,0,1272.45,12634.25
1,CL-1742889,135517422,26,Truck,0,1052.25,8956.15
2,CL-1384474,176552663,19,Car,1,1389.75,3691.65
3,CL-1283351,157711099,35,Car,0,980.85,0.0
4,CL-1712003,140846711,0,Car,0,1567.45,2934.25
5,CL-1518709,126885444,0,Van,0,1173.95,590.65
6,CL-1661021,191874052,61,0,0,0.0,1492.35
7,CL-1622931,184558739,38,Van,0,1146.95,962.15
8,CL-1470986,179027710,0,Van,1,1453.65,2691.95
9,CL-1886922,179500989,53,Truck,0,1252.25,0.0


In [14]:
# Summarise Claims_01 after the zero fill and the integer casts:
# every column should now report the full non-null count


Claims_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Claim_ID     21 non-null     object 
 1   Policy_Num   21 non-null     int32  
 2   Driver_Age   21 non-null     int32  
 3   Policy_Type  21 non-null     object 
 4   Pre_Claims   21 non-null     int32  
 5   Premium_AMT  21 non-null     float64
 6   Claim_AMT    21 non-null     float64
dtypes: float64(2), int32(3), object(2)
memory usage: 1.0+ KB


# Create New DataFrame - Claims_02

In [15]:
# Build Claims_02 by filling nulls column by column with a fixed replacement value each.
# NOTE(review): the replacement constants are hard-coded; how they were derived
# is not shown in this notebook -- confirm before reusing them.

Claims_02 = Claims.fillna(value={'Policy_Num': 123456789,
                                 'Driver_Age': 40,
                                 'Policy_Type': 'Car',
                                 'Pre_Claims': 0,
                                 'Premium_AMT': 0,
                                 'Claim_AMT': 0})

# Show the filled frame

Claims_02

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26.0,Truck,0.0,1052.25,8956.15
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35.0,Car,0.0,980.85,0.0
4,CL-1712003,140846711.0,40.0,Car,0.0,1567.45,2934.25
5,CL-1518709,126885444.0,40.0,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61.0,Car,0.0,0.0,1492.35
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,40.0,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53.0,Truck,0.0,1252.25,0.0


In [16]:
# Cast 'Driver_Age', 'Policy_Num' and 'Pre_Claims' from float to integer
# (possible here because the preceding fill left no nulls in these columns)

Claims_02 = Claims_02.astype({'Driver_Age': int,
                              'Policy_Num': int,
                              'Pre_Claims': int})

# Show the converted frame

Claims_02

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871,25,Car,0,1272.45,12634.25
1,CL-1742889,135517422,26,Truck,0,1052.25,8956.15
2,CL-1384474,176552663,19,Car,1,1389.75,3691.65
3,CL-1283351,157711099,35,Car,0,980.85,0.0
4,CL-1712003,140846711,40,Car,0,1567.45,2934.25
5,CL-1518709,126885444,40,Van,0,1173.95,590.65
6,CL-1661021,191874052,61,Car,0,0.0,1492.35
7,CL-1622931,184558739,38,Van,0,1146.95,962.15
8,CL-1470986,179027710,40,Van,1,1453.65,2691.95
9,CL-1886922,179500989,53,Truck,0,1252.25,0.0


# Missing Values with Forward Fill

In [17]:
# Forward fill: each null takes the last valid value above it in the same column.
# DataFrame.ffill() is used because the `method` argument of fillna() is
# deprecated in pandas 2.x; the filled values are the same.

Claims_03 = Claims.ffill()

# Show the forward-filled frame

Claims_03

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26.0,Truck,0.0,1052.25,8956.15
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35.0,Car,0.0,980.85,3691.65
4,CL-1712003,140846711.0,35.0,Car,0.0,1567.45,2934.25
5,CL-1518709,126885444.0,35.0,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61.0,Van,0.0,1173.95,1492.35
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,38.0,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53.0,Truck,0.0,1252.25,2691.95


In [19]:
# Forward fill with limit=1: at most one consecutive null per gap is filled,
# so the second and later nulls of a longer run stay null.
# DataFrame.ffill() is used because the `method` argument of fillna() is
# deprecated in pandas 2.x; the filled values are the same.

Claims_04 = Claims.ffill(limit=1)

# Show the forward-filled frame

Claims_04


Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26.0,Truck,0.0,1052.25,8956.15
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35.0,Car,0.0,980.85,3691.65
4,CL-1712003,140846711.0,35.0,Car,0.0,1567.45,2934.25
5,CL-1518709,126885444.0,,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61.0,Van,0.0,1173.95,1492.35
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,38.0,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53.0,Truck,0.0,1252.25,2691.95


# Missing Values with Backward Fill

In [20]:
# Backward fill: each null takes the next valid value below it in the same column.
# DataFrame.bfill() is used because the `method` argument of fillna() is
# deprecated in pandas 2.x; the filled values are the same.

Claims_05 = Claims.bfill()

# Show the backward-filled frame

Claims_05

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26.0,Truck,1.0,1052.25,8956.15
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35.0,Car,0.0,980.85,2934.25
4,CL-1712003,140846711.0,61.0,Car,0.0,1567.45,2934.25
5,CL-1518709,126885444.0,61.0,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61.0,Van,0.0,1146.95,1492.35
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,53.0,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53.0,Truck,0.0,1252.25,6924.15


In [21]:
# Backward fill with limit=1: at most one consecutive null per gap is filled,
# so the earlier nulls of a longer run stay null.
# DataFrame.bfill() is used because the `method` argument of fillna() is
# deprecated in pandas 2.x; the filled values are the same.

Claims_06 = Claims.bfill(limit=1)

# Show the backward-filled frame

Claims_06

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26.0,Truck,1.0,1052.25,8956.15
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35.0,Car,0.0,980.85,2934.25
4,CL-1712003,140846711.0,,Car,0.0,1567.45,2934.25
5,CL-1518709,126885444.0,61.0,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61.0,Van,0.0,1146.95,1492.35
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,53.0,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53.0,Truck,0.0,1252.25,6924.15


In [22]:
# Filling Missing Values with "interpolate" option

In [23]:
# Linear interpolation of the numeric columns, filling in the forward direction.
# Only numeric columns are passed to interpolate(): calling it on a frame that
# still contains object-dtype (text) columns is deprecated in pandas 2.x.
# Text columns are carried over unchanged, so their nulls remain.
# Interpolated values can be fractional even for count-like columns.

Claims_07 = Claims.copy()
numeric_cols = Claims_07.select_dtypes(include='number').columns
Claims_07[numeric_cols] = Claims_07[numeric_cols].interpolate(method='linear',
                                                              limit_direction='forward')

# Show the interpolated frame

Claims_07

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26.0,Truck,0.5,1052.25,8956.15
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35.0,Car,0.0,980.85,3312.95
4,CL-1712003,140846711.0,43.666667,Car,0.0,1567.45,2934.25
5,CL-1518709,126885444.0,52.333333,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61.0,,0.0,1160.45,1492.35
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,45.5,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53.0,Truck,0.0,1252.25,4808.05


In [24]:
# NOTE(review): this cell repeats the interpolation of the preceding cell and
# reassigns the same name (Claims_07); it looks like a leftover duplicate.
#
# Linear interpolation of the numeric columns, filling in the forward direction.
# Only numeric columns are passed to interpolate(): calling it on a frame that
# still contains object-dtype (text) columns is deprecated in pandas 2.x.
# Text columns are carried over unchanged, so their nulls remain.

Claims_07 = Claims.copy()
numeric_cols = Claims_07.select_dtypes(include='number').columns
Claims_07[numeric_cols] = Claims_07[numeric_cols].interpolate(method='linear',
                                                              limit_direction='forward')

# Show the interpolated frame

Claims_07

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26.0,Truck,0.5,1052.25,8956.15
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35.0,Car,0.0,980.85,3312.95
4,CL-1712003,140846711.0,43.666667,Car,0.0,1567.45,2934.25
5,CL-1518709,126885444.0,52.333333,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61.0,,0.0,1160.45,1492.35
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,45.5,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53.0,Truck,0.0,1252.25,4808.05


In [25]:
# NOTE(review): this cell repeats the interpolation of the two preceding cells
# and reassigns the same name (Claims_07); it looks like a leftover duplicate.
#
# Linear interpolation of the numeric columns, filling in the forward direction.
# Only numeric columns are passed to interpolate(): calling it on a frame that
# still contains object-dtype (text) columns is deprecated in pandas 2.x.
# Text columns are carried over unchanged, so their nulls remain.

Claims_07 = Claims.copy()
numeric_cols = Claims_07.select_dtypes(include='number').columns
Claims_07[numeric_cols] = Claims_07[numeric_cols].interpolate(method='linear',
                                                              limit_direction='forward')

# Show the interpolated frame

Claims_07

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26.0,Truck,0.5,1052.25,8956.15
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35.0,Car,0.0,980.85,3312.95
4,CL-1712003,140846711.0,43.666667,Car,0.0,1567.45,2934.25
5,CL-1518709,126885444.0,52.333333,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61.0,,0.0,1160.45,1492.35
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,45.5,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53.0,Truck,0.0,1252.25,4808.05


In [26]:
# Drop every row that contains at least one null value.
# dropna() is called with its defaults here, so rows are removed and all
# columns are kept.

Claims_08 = Claims.dropna()

# Display claims data

Claims_08

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
10,CL-1727554,176634765.0,60.0,Car,1.0,1584.75,6924.15
12,CL-1927305,132014482.0,34.0,Car,0.0,1135.45,350.95
14,CL-1525750,130465403.0,36.0,Car,0.0,1123.25,264.75
20,CL-1893203,161682966.0,65.0,Car,0.0,1425.65,736.95


In [27]:
# Drop every column that contains at least one null value.
# Only columns with no nulls at all survive (Claim_ID in this dataset).

Claims_09 = Claims.dropna(axis='columns', how='any')

# Show the remaining columns

Claims_09

Unnamed: 0,Claim_ID
0,CL-1292034
1,CL-1742889
2,CL-1384474
3,CL-1283351
4,CL-1712003
5,CL-1518709
6,CL-1661021
7,CL-1622931
8,CL-1470986
9,CL-1886922


In [28]:
# Drop every row that contains at least one null value, with the axis and
# the 'any' rule spelled out explicitly.

Claims_10 = Claims.dropna(axis='index', how='any')

# Show the remaining rows

Claims_10

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
10,CL-1727554,176634765.0,60.0,Car,1.0,1584.75,6924.15
12,CL-1927305,132014482.0,34.0,Car,0.0,1135.45,350.95
14,CL-1525750,130465403.0,36.0,Car,0.0,1123.25,264.75
20,CL-1893203,161682966.0,65.0,Car,0.0,1425.65,736.95


In [29]:
# Build Claims_11 by filling nulls column by column with a fixed replacement value each.
# NOTE(review): the replacement constants are hard-coded; how they were derived
# is not shown in this notebook -- confirm before reusing them.

Claims_11 = Claims.fillna(value={'Policy_Num': 123456789,
                                 'Driver_Age': 29,
                                 'Policy_Type': 'Car',
                                 'Pre_Claims': 0,
                                 'Premium_AMT': 1272.45,
                                 'Claim_AMT': 1227.25})

Claims_11

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26.0,Truck,0.0,1052.25,8956.15
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35.0,Car,0.0,980.85,1227.25
4,CL-1712003,140846711.0,29.0,Car,0.0,1567.45,2934.25
5,CL-1518709,126885444.0,29.0,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61.0,Car,0.0,1272.45,1492.35
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,29.0,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53.0,Truck,0.0,1252.25,1227.25


In [30]:

# Cast 'Driver_Age', 'Policy_Num' and 'Pre_Claims' from float to integer
# (possible here because the preceding fill left no nulls in these columns)

Claims_11 = Claims_11.astype({'Driver_Age': int,
                              'Policy_Num': int,
                              'Pre_Claims': int})

# Show the converted frame

Claims_11


Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871,25,Car,0,1272.45,12634.25
1,CL-1742889,135517422,26,Truck,0,1052.25,8956.15
2,CL-1384474,176552663,19,Car,1,1389.75,3691.65
3,CL-1283351,157711099,35,Car,0,980.85,1227.25
4,CL-1712003,140846711,29,Car,0,1567.45,2934.25
5,CL-1518709,126885444,29,Van,0,1173.95,590.65
6,CL-1661021,191874052,61,Car,0,1272.45,1492.35
7,CL-1622931,184558739,38,Van,0,1146.95,962.15
8,CL-1470986,179027710,29,Van,1,1453.65,2691.95
9,CL-1886922,179500989,53,Truck,0,1252.25,1227.25


# Addressing Missing Values - Imputation techniques