In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# To run sql queries on DataFrame

import pandasql as psql

In [2]:
# Load the claims data

Claims = pd.read_csv(r"C:\Users\badda\Downloads\Claims_Data_Missing_Values.csv", header = 0)

# Copy to back-up files

Claims_BK1 = Claims.copy()
Claims_BK2 = Claims.copy()

# Display claims data

Claims

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25.0,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26.0,Truck,,1052.25,8956.15
2,CL-1384474,176552663.0,19.0,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35.0,Car,0.0,980.85,
4,CL-1712003,140846711.0,,Car,,1567.45,2934.25
5,CL-1518709,126885444.0,,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61.0,,0.0,,1492.35
7,CL-1622931,184558739.0,38.0,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53.0,Truck,0.0,1252.25,


In [3]:
# Identify the missing values in Claims data

Claims.isnull().sum()

Claim_ID       0
Policy_Num     1
Driver_Age     6
Policy_Type    1
Pre_Claims     4
Premium_AMT    4
Claim_AMT      5
dtype: int64

In [4]:
# %age of each missing variables

for i in Claims.columns:
    print((Claims[i].isnull().sum())/len(Claims)*100)

0.0
4.761904761904762
28.57142857142857
4.761904761904762
19.047619047619047
19.047619047619047
23.809523809523807


In [5]:
# Using KNN Imputer to address the Driver_Age missing values

# KNNImputer(missing_values=np.nan, n_neighbors=5, weights='uniform', metric='nan_euclidean', 
# copy=True, add_indicator=False)

from sklearn.impute import KNNImputer

imputer_knn = KNNImputer(missing_values=np.nan)

# Fill the missing values for 'Driver_Age'

Claims_BK1['Driver_Age'] = imputer_knn.fit_transform(Claims_BK1[['Driver_Age']])

Claims_BK1['Driver_Age'] = Claims_BK1['Driver_Age'].astype(int)

Claims_BK1

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26,Truck,,1052.25,8956.15
2,CL-1384474,176552663.0,19,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35,Car,0.0,980.85,
4,CL-1712003,140846711.0,40,Car,,1567.45,2934.25
5,CL-1518709,126885444.0,40,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61,,0.0,,1492.35
7,CL-1622931,184558739.0,38,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,40,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53,Truck,0.0,1252.25,


In [6]:
# Fill the missing values for 'Premium_AMT'

Claims_BK1['Premium_AMT'] = imputer_knn.fit_transform(Claims_BK1[['Premium_AMT']])

Claims_BK1

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26,Truck,,1052.25,8956.15
2,CL-1384474,176552663.0,19,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35,Car,0.0,980.85,
4,CL-1712003,140846711.0,40,Car,,1567.45,2934.25
5,CL-1518709,126885444.0,40,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61,,0.0,1317.179412,1492.35
7,CL-1622931,184558739.0,38,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,40,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53,Truck,0.0,1252.25,


In [7]:
# Fill the missing values for 'Claim_AMT'

Claims_BK1['Claim_AMT'] = imputer_knn.fit_transform(Claims_BK1[['Claim_AMT']])

Claims_BK1

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871.0,25,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422.0,26,Truck,,1052.25,8956.15
2,CL-1384474,176552663.0,19,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099.0,35,Car,0.0,980.85,3145.04375
4,CL-1712003,140846711.0,40,Car,,1567.45,2934.25
5,CL-1518709,126885444.0,40,Van,0.0,1173.95,590.65
6,CL-1661021,191874052.0,61,,0.0,1317.179412,1492.35
7,CL-1622931,184558739.0,38,Van,0.0,1146.95,962.15
8,CL-1470986,179027710.0,40,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989.0,53,Truck,0.0,1252.25,3145.04375


In [8]:
# Fill null values of Policy_Num with actual value '123456789'  

Claims_BK1 = Claims_BK1.fillna({'Policy_Num': 123456789})

Claims_BK1['Policy_Num'] = Claims_BK1['Policy_Num'].astype(int)

Claims_BK1

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871,25,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422,26,Truck,,1052.25,8956.15
2,CL-1384474,176552663,19,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099,35,Car,0.0,980.85,3145.04375
4,CL-1712003,140846711,40,Car,,1567.45,2934.25
5,CL-1518709,126885444,40,Van,0.0,1173.95,590.65
6,CL-1661021,191874052,61,,0.0,1317.179412,1492.35
7,CL-1622931,184558739,38,Van,0.0,1146.95,962.15
8,CL-1470986,179027710,40,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989,53,Truck,0.0,1252.25,3145.04375


In [9]:
# Display the Claims information

Claims_BK1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Claim_ID     21 non-null     object 
 1   Policy_Num   21 non-null     int32  
 2   Driver_Age   21 non-null     int32  
 3   Policy_Type  20 non-null     object 
 4   Pre_Claims   17 non-null     float64
 5   Premium_AMT  21 non-null     float64
 6   Claim_AMT    21 non-null     float64
dtypes: float64(3), int32(2), object(2)
memory usage: 1.1+ KB


In [10]:
# Find the Policy_type by count

Claims_BK1['Policy_Type'].value_counts()

Policy_Type
Car      9
Van      6
Truck    5
Name: count, dtype: int64

In [11]:
# Find the Pre_Claims by count

Claims_BK1['Pre_Claims'].value_counts()

Pre_Claims
0.0    12
1.0     5
Name: count, dtype: int64

In [12]:
# SimpleImputer(missing_values=np.nan, strategy='mean', fill_value=None, verbose=0, copy=True, add_indicator=False)

from sklearn.impute import SimpleImputer

# SimpleImputer(missing_values=nan, strategy='mean', fill_value=None, verbose='deprecated', copy=True, 
# add_indicator=False, keep_empty_features=False)

imputer_si = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

Claims_BK1['Policy_Type'] = imputer_si.fit_transform(Claims_BK1['Policy_Type'].values.reshape(-1,1))[:,0]
Claims_BK1['Pre_Claims'] = imputer_si.fit_transform(Claims_BK1['Pre_Claims'].values.reshape(-1,1))[:,0]

Claims_BK1

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871,25,Car,0.0,1272.45,12634.25
1,CL-1742889,135517422,26,Truck,0.0,1052.25,8956.15
2,CL-1384474,176552663,19,Car,1.0,1389.75,3691.65
3,CL-1283351,157711099,35,Car,0.0,980.85,3145.04375
4,CL-1712003,140846711,40,Car,0.0,1567.45,2934.25
5,CL-1518709,126885444,40,Van,0.0,1173.95,590.65
6,CL-1661021,191874052,61,Car,0.0,1317.179412,1492.35
7,CL-1622931,184558739,38,Van,0.0,1146.95,962.15
8,CL-1470986,179027710,40,Van,1.0,1453.65,2691.95
9,CL-1886922,179500989,53,Truck,0.0,1252.25,3145.04375


In [13]:
# Convert float values to integer values

#Claims_BK1['Policy_Num'] = Claims_BK1['Policy_Num'].astype(int)
Claims_BK1['Pre_Claims'] = Claims_BK1['Pre_Claims'].astype(int)
Claims_BK1

Unnamed: 0,Claim_ID,Policy_Num,Driver_Age,Policy_Type,Pre_Claims,Premium_AMT,Claim_AMT
0,CL-1292034,176751871,25,Car,0,1272.45,12634.25
1,CL-1742889,135517422,26,Truck,0,1052.25,8956.15
2,CL-1384474,176552663,19,Car,1,1389.75,3691.65
3,CL-1283351,157711099,35,Car,0,980.85,3145.04375
4,CL-1712003,140846711,40,Car,0,1567.45,2934.25
5,CL-1518709,126885444,40,Van,0,1173.95,590.65
6,CL-1661021,191874052,61,Car,0,1317.179412,1492.35
7,CL-1622931,184558739,38,Van,0,1146.95,962.15
8,CL-1470986,179027710,40,Van,1,1453.65,2691.95
9,CL-1886922,179500989,53,Truck,0,1252.25,3145.04375


# new problem

In [14]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [15]:
# Load the weather dataset

weather = pd.read_csv(r"C:\Users\badda\Downloads\weather.csv", header=0)

# Copy to back-up file

weather_bk = weather.copy()

# Display first 5 records

weather

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,8.0,24.3,0.0,3.4,6.3,NW,30.0,SW,NW,6.0,20,68,29,1019.7,1015.0,7,7,14.4,23.6,No,3.6,Yes
1,14.0,26.9,3.6,4.4,9.7,ENE,39.0,E,W,4.0,17,80,36,1012.4,1008.4,5,3,17.5,25.7,Yes,3.6,Yes
2,13.7,23.4,3.6,5.8,3.3,NW,85.0,N,NNE,6.0,6,82,69,1009.5,1007.2,8,7,15.4,20.2,Yes,39.8,Yes
3,13.3,15.5,39.8,7.2,9.1,NW,54.0,WNW,W,30.0,24,62,56,1005.5,1007.0,2,7,13.5,14.1,Yes,2.8,Yes
4,7.6,16.1,2.8,5.6,10.6,SSE,50.0,SSE,ESE,20.0,28,68,49,1018.3,1018.5,7,7,11.1,15.4,Yes,0.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,9.0,30.7,0.0,7.6,12.1,NNW,76.0,SSE,NW,7.0,50,38,15,1016.1,1010.8,1,3,20.4,30.0,No,0.0,No
362,7.1,28.4,0.0,11.6,12.7,N,48.0,NNW,NNW,2.0,19,45,22,1020.0,1016.9,0,1,17.2,28.2,No,0.0,No
363,12.5,19.9,0.0,8.4,5.3,ESE,43.0,ENE,ENE,11.0,9,63,47,1024.0,1022.8,3,2,14.5,18.3,No,0.0,No
364,12.5,26.9,0.0,5.0,7.1,NW,46.0,SSW,WNW,6.0,28,69,39,1021.0,1016.2,6,7,15.8,25.9,No,0.0,No


In [16]:
# Display the dataset information

weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MinTemp        366 non-null    float64
 1   MaxTemp        366 non-null    float64
 2   Rainfall       366 non-null    float64
 3   Evaporation    366 non-null    float64
 4   Sunshine       363 non-null    float64
 5   WindGustDir    363 non-null    object 
 6   WindGustSpeed  364 non-null    float64
 7   WindDir9am     335 non-null    object 
 8   WindDir3pm     365 non-null    object 
 9   WindSpeed9am   359 non-null    float64
 10  WindSpeed3pm   366 non-null    int64  
 11  Humidity9am    366 non-null    int64  
 12  Humidity3pm    366 non-null    int64  
 13  Pressure9am    366 non-null    float64
 14  Pressure3pm    366 non-null    float64
 15  Cloud9am       366 non-null    int64  
 16  Cloud3pm       366 non-null    int64  
 17  Temp9am        366 non-null    float64
 18  Temp3pm   

In [17]:
# Identify the missing data in all variables

weather.isnull().sum()

MinTemp           0
MaxTemp           0
Rainfall          0
Evaporation       0
Sunshine          3
WindGustDir       3
WindGustSpeed     2
WindDir9am       31
WindDir3pm        1
WindSpeed9am      7
WindSpeed3pm      0
Humidity9am       0
Humidity3pm       0
Pressure9am       0
Pressure3pm       0
Cloud9am          0
Cloud3pm          0
Temp9am           0
Temp3pm           0
RainToday         0
RISK_MM           0
RainTomorrow      0
dtype: int64

In [18]:
# Use KNNImputer to address missing values

from sklearn.impute import KNNImputer

imputer_int = KNNImputer(missing_values=np.nan, n_neighbors=5, weights='uniform', metric='nan_euclidean',
                         copy=True, add_indicator=False)

weather['Sunshine'] = imputer_int.fit_transform(weather[['Sunshine']])

In [20]:
# Use SimpleImputer to address missing values

from sklearn.impute import SimpleImputer

imputer_str = SimpleImputer(missing_values=np.nan, strategy='most_frequent', fill_value=None,
                            copy=True, add_indicator=False)

weather['WindGustDir'] = imputer_str.fit_transform(weather['WindGustDir'].values.reshape(-1,1))[:,0]
weather['WindGustSpeed'] = imputer_int.fit_transform(weather['WindGustSpeed'].values.reshape(-1,1))[:,0]
weather['WindDir9am'] = imputer_str.fit_transform(weather['WindDir9am'].values.reshape(-1,1))[:,0]
weather['WindDir3pm'] = imputer_str.fit_transform(weather['WindDir3pm'].values.reshape(-1,1))[:,0]
weather['WindSpeed9am'] = imputer_int.fit_transform(weather['WindSpeed9am'].values.reshape(-1,1))[:,0]