## Importing Libraries

In [2]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
sns.set_style("whitegrid")

## Importing the dataset

In [3]:
# Hold-out claims file; this data is used to validate the model
val_data = pd.read_csv(filepath_or_buffer='ClaimHistory_validation.csv')
val_data.head(n=5)

Unnamed: 0,sys_sector,sys_process,sys_product,sys_dataspecification_version,sys_claimid,sys_currency_code,claim_amount_claimed_total,claim_causetype,claim_date_occurred,claim_date_reported,claim_location_urban_area,object_make,object_year_construction,ph_firstname,ph_gender,ph_name,policy_fleet_flag,policy_insured_amount,policy_profitability,sys_fraud
0,Private NonLife,Claims_initial_load,MOTOR,4.5,MTR-666202639-02,EUR,2627.0,Collision,20150521,20160916,0,OTHER,2014.0,Shishan,F,Lor,0,114718.0,Very low,0
1,Private NonLife,Claims_initial_load,MOTOR,4.5,MTR-258901077-02,EUR,269.0,Other,20130302,20130304,0,AUDI,1999.0,Dominic,M,Purnell,0,117522.0,Neutral,0
2,Private NonLife,Claims_initial_load,MOTOR,4.5,MTR-547929317-02,EUR,157.0,Collision,20121030,20121127,1,RENAULT,2009.0,Katalina,F,Shea,0,72287.0,Low,0
3,Private NonLife,Claims_initial_load,MOTOR,4.5,MTR-745856657-02,EUR,299.0,Collision,20150824,20150826,0,BMW,2008.0,Ebonee,F,Ryan,0,112630.0,Low,0
4,Private NonLife,Claims_initial_load,MOTOR,4.5,MTR-686506389-02,EUR,211.0,Theft,20141126,20141226,0,OTHER,2000.0,Emonie,F,Hester,0,104030.0,Neutral,0


In [4]:
val_data.shape

(20000, 20)

## Applying the same data checks that were used on the train dataset

In [5]:
# correcting the date of "claim_date_reported" column
# Let check if there are all correct values in date columns
print("start of claim_date_occurred :",val_data.claim_date_occurred.min())
print("end of claim_date_occurred :",val_data.claim_date_occurred.max())
print("start of claim_date_reported :",val_data.claim_date_reported.min())
print("end of claim_date_reported :",val_data.claim_date_reported.max())

start of claim_date_occurred : 20120701
end of claim_date_occurred : 20180208
start of claim_date_reported : 20120701
end of claim_date_reported : 20220829


In [6]:


# making maximum possible date as '20220829' for this analysis as claim can't be raise in future dates
val_data['claim_date_reported'] = pd.to_datetime(val_data.claim_date_reported.astype(str), format='%Y%m%d')
val_data['claim_date_reported_new'] =  pd.to_datetime('20220829')
val_data['claim_date_reported_new']= np.where( val_data.claim_date_reported >pd.to_datetime('20220829'), val_data['claim_date_reported_new'],val_data['claim_date_reported'])

#calculating difference between reported and occurance date
val_data['diff_days_between_claim_occ_report']= (val_data.claim_date_reported_new  - pd.to_datetime(val_data.claim_date_occurred.astype(str), format='%Y%m%d'))/np.timedelta64(1, 'D')



In [7]:
val_data['diff_days_between_claim_occ_report'].describe()

count    20000.000000
mean        19.518950
std         66.455858
min          0.000000
25%          2.000000
50%          4.000000
75%         14.000000
max       2828.000000
Name: diff_days_between_claim_occ_report, dtype: float64

In [8]:
#calculating new year of construction
val_data['object_year_construction_new']= np.where( val_data.object_year_construction >pd.to_datetime(val_data.claim_date_occurred.astype(str), format='%Y%m%d').dt.year,pd.to_datetime(val_data.claim_date_occurred.astype(str), format='%Y%m%d').dt.year,val_data['object_year_construction'])
val_data[val_data.object_year_construction >pd.to_datetime(val_data.claim_date_occurred.astype(str), format='%Y%m%d').dt.year][['claim_date_occurred','object_year_construction',"object_year_construction_new"]]



Unnamed: 0,claim_date_occurred,object_year_construction,object_year_construction_new
835,20150221,2016.0,2015.0
1387,20151012,2016.0,2015.0
1664,20150214,2016.0,2015.0
1929,20150618,2016.0,2015.0
2052,20150528,2019.0,2015.0
...,...,...,...
19871,20130323,2015.0,2013.0
19874,20130107,2015.0,2013.0
19879,20130512,2014.0,2013.0
19937,20120904,2015.0,2012.0


In [9]:
#checking if policy_insured_amount and claim_amount_claimed_total <=0.
val_data[val_data.policy_insured_amount<=0]

Unnamed: 0,sys_sector,sys_process,sys_product,sys_dataspecification_version,sys_claimid,sys_currency_code,claim_amount_claimed_total,claim_causetype,claim_date_occurred,claim_date_reported,...,ph_firstname,ph_gender,ph_name,policy_fleet_flag,policy_insured_amount,policy_profitability,sys_fraud,claim_date_reported_new,diff_days_between_claim_occ_report,object_year_construction_new
493,Private NonLife,Claims_initial_load,MOTOR,4.5,MTR-865850951-02,EUR,642.0,Collision,20130312,2013-03-22,...,Miah,F,Kang,0,-7490.0,Neutral,0,2013-03-22,10.0,2001.0


In [10]:
# here it seems like a data entry problem as it is negative. As it is a single case lets make it +ve
val_data.policy_insured_amount = abs(val_data.policy_insured_amount)
val_data[val_data.policy_insured_amount<=0]

Unnamed: 0,sys_sector,sys_process,sys_product,sys_dataspecification_version,sys_claimid,sys_currency_code,claim_amount_claimed_total,claim_causetype,claim_date_occurred,claim_date_reported,...,ph_firstname,ph_gender,ph_name,policy_fleet_flag,policy_insured_amount,policy_profitability,sys_fraud,claim_date_reported_new,diff_days_between_claim_occ_report,object_year_construction_new


In [11]:
val_data[val_data.claim_amount_claimed_total<=0]

Unnamed: 0,sys_sector,sys_process,sys_product,sys_dataspecification_version,sys_claimid,sys_currency_code,claim_amount_claimed_total,claim_causetype,claim_date_occurred,claim_date_reported,...,ph_firstname,ph_gender,ph_name,policy_fleet_flag,policy_insured_amount,policy_profitability,sys_fraud,claim_date_reported_new,diff_days_between_claim_occ_report,object_year_construction_new


In [12]:
# checking if claim_amount_claimed_total<policy_insured_amount
val_data['claim_amount_claimed_total_new']= np.where( val_data.claim_amount_claimed_total >val_data.policy_insured_amount,val_data.policy_insured_amount,val_data['claim_amount_claimed_total'])
#checking if the correction worked
val_data[val_data.claim_amount_claimed_total >val_data.policy_insured_amount][["claim_amount_claimed_total","policy_insured_amount","claim_amount_claimed_total_new"]]



Unnamed: 0,claim_amount_claimed_total,policy_insured_amount,claim_amount_claimed_total_new
1589,60994.0,53967.0,53967.0
1599,64047.0,50042.0,50042.0
1990,112591.0,58383.0,58383.0
2230,245426.0,48196.0,48196.0
3664,165633.0,69847.0,69847.0
...,...,...,...
18571,68575.0,51006.0,51006.0
18628,84944.0,50157.0,50157.0
19365,77585.0,61487.0,61487.0
19671,45473.0,38759.0,38759.0


In [13]:
#Removing duplicate claim ids; keeping the claim id which claim occured first
val_data.sort_values("claim_date_occurred", inplace = True) 
print("number of rows in val_data before removing duplicate ids:", val_data.shape[0])
val_data.drop_duplicates(['sys_claimid'], keep = 'first', inplace = True) 
print("number of rows in val_data after removing duplicate ids:", val_data.shape[0])
#no duplicates

number of rows in val_data before removing duplicate ids: 20000
number of rows in val_data after removing duplicate ids: 20000


In [14]:
# Feature engineering
val_data['Age_of_vehicle']= (pd.to_datetime(val_data.claim_date_occurred.astype(str), format='%Y%m%d').dt.year)-(val_data.object_year_construction_new)
val_data.head()

Unnamed: 0,sys_sector,sys_process,sys_product,sys_dataspecification_version,sys_claimid,sys_currency_code,claim_amount_claimed_total,claim_causetype,claim_date_occurred,claim_date_reported,...,ph_name,policy_fleet_flag,policy_insured_amount,policy_profitability,sys_fraud,claim_date_reported_new,diff_days_between_claim_occ_report,object_year_construction_new,claim_amount_claimed_total_new,Age_of_vehicle
4493,Private NonLife,Claims_initial_load,MOTOR,4.5,MTR-845197269-02,EUR,9388.0,Animals,20120701,2012-07-07,...,Baker,1,43777.0,High,0,2012-07-07,6.0,2008.0,9388.0,4.0
708,Private NonLife,Claims_initial_load,MOTOR,4.5,MTR-277591109-02,EUR,360.0,Collision,20120701,2012-07-07,...,Newcomb,0,,High,0,2012-07-07,6.0,2007.0,360.0,5.0
18898,Private NonLife,Claims_initial_load,MOTOR,4.5,MTR-783277250-02,EUR,9349.0,Collision,20120701,2012-07-01,...,Thapa,0,59590.0,Low,0,2012-07-01,0.0,2012.0,9349.0,0.0
4646,Private NonLife,Claims_initial_load,MOTOR,4.5,MTR-128255363-02,EUR,4063.0,Collision,20120701,2012-07-03,...,Graham,1,50808.0,Very high,0,2012-07-03,2.0,2011.0,4063.0,1.0
14657,Private NonLife,Claims_initial_load,MOTOR,4.5,MTR-251228076-02,EUR,79.0,Collision,20120701,2012-07-02,...,Ortega,0,,Very high,0,2012-07-02,1.0,2012.0,79.0,0.0


In [15]:
### 3) Time since claim reported as of 20220829(in years)
val_data['time_since_claim']= (pd.to_datetime('20220829')  - pd.to_datetime(val_data.claim_date_occurred.astype(str), format='%Y%m%d'))/np.timedelta64(1, 'Y')

In [16]:
#creating target/label column
val_data.rename(columns={'sys_fraud': 'label'}, inplace=True)

In [17]:
#Checking fraud rate in OOT set
# we can see that fraud rate(0.68%) is lesser in oot dataset as in train(0.76%)
val_data.label.value_counts()/len(val_data)

0    0.9932
1    0.0068
Name: label, dtype: float64

In [18]:
#dropping unnecessary columns
val_data.columns

Index(['sys_sector', 'sys_process', 'sys_product',
       'sys_dataspecification_version', 'sys_claimid', 'sys_currency_code',
       'claim_amount_claimed_total', 'claim_causetype', 'claim_date_occurred',
       'claim_date_reported', 'claim_location_urban_area', 'object_make',
       'object_year_construction', 'ph_firstname', 'ph_gender', 'ph_name',
       'policy_fleet_flag', 'policy_insured_amount', 'policy_profitability',
       'label', 'claim_date_reported_new',
       'diff_days_between_claim_occ_report', 'object_year_construction_new',
       'claim_amount_claimed_total_new', 'Age_of_vehicle', 'time_since_claim'],
      dtype='object')

In [19]:
drop_col = ['sys_sector', 'sys_process', 'sys_product',
       'sys_dataspecification_version', 'sys_currency_code',
       'claim_amount_claimed_total', 'claim_date_occurred', 'claim_date_reported','object_year_construction',
       'claim_date_reported_new','object_year_construction_new']
val_data.drop(columns=drop_col, axis = 1,inplace=True)
val_data.shape

(20000, 15)

In [20]:
val_data.head()

Unnamed: 0,sys_claimid,claim_causetype,claim_location_urban_area,object_make,ph_firstname,ph_gender,ph_name,policy_fleet_flag,policy_insured_amount,policy_profitability,label,diff_days_between_claim_occ_report,claim_amount_claimed_total_new,Age_of_vehicle,time_since_claim
4493,MTR-845197269-02,Animals,0,VOLKSWAGEN,Simon,M,Baker,1,43777.0,High,0,6.0,9388.0,4.0,10.160373
708,MTR-277591109-02,Collision,0,AUDI,Melissa,L,Newcomb,0,,High,0,6.0,360.0,5.0,10.160373
18898,MTR-783277250-02,Collision,0,OTHER,Kathy,F,Thapa,0,59590.0,Low,0,0.0,9349.0,0.0,10.160373
4646,MTR-128255363-02,Collision,1,OPEL,Christian,L,Graham,1,50808.0,Very high,0,2.0,4063.0,1.0,10.160373
14657,MTR-251228076-02,Collision,0,VOLKSWAGEN,Jonathan,M,Ortega,0,,Very high,0,1.0,79.0,0.0,10.160373


In [21]:
#checking missing percentage in oot_data
percent_missing = val_data.isnull().sum() * 100 / len(val_data)
percent_missing

sys_claimid                            0.000
claim_causetype                        0.010
claim_location_urban_area              0.000
object_make                            0.000
ph_firstname                           0.000
ph_gender                              0.970
ph_name                                0.000
policy_fleet_flag                      0.000
policy_insured_amount                 38.515
policy_profitability                   0.000
label                                  0.000
diff_days_between_claim_occ_report     0.000
claim_amount_claimed_total_new         0.000
Age_of_vehicle                         0.000
time_since_claim                       0.000
dtype: float64

In [22]:
val_data.shape

(20000, 15)

In [24]:
val_data['ph_gender'] = val_data['ph_gender'].fillna('missing')

In [25]:
val_data['ph_gender'].value_counts()

M          7042
F          6988
L          5776
missing     194
Name: ph_gender, dtype: int64

In [27]:
# Converting Null and L into 'other' column
val_data['ph_gender'] = val_data['ph_gender'].apply(lambda x : 'other' if x == 'L' or x == 'missing' else x)

In [28]:
val_data['ph_gender'].value_counts()

M        7042
F        6988
other    5970
Name: ph_gender, dtype: int64

In [30]:
val_data['claim_causetype'].value_counts()

Collision    12324
Other         3860
Weather       1899
Animals       1112
Theft          803
Name: claim_causetype, dtype: int64

In [31]:
val_data['claim_causetype'] = val_data['claim_causetype'].fillna('missing')

In [32]:
val_data.columns

Index(['sys_claimid', 'claim_causetype', 'claim_location_urban_area',
       'object_make', 'ph_firstname', 'ph_gender', 'ph_name',
       'policy_fleet_flag', 'policy_insured_amount', 'policy_profitability',
       'label', 'diff_days_between_claim_occ_report',
       'claim_amount_claimed_total_new', 'Age_of_vehicle', 'time_since_claim'],
      dtype='object')

In [33]:
val_data.shape

(20000, 15)

In [37]:
val_data

Unnamed: 0,sys_claimid,claim_causetype,claim_location_urban_area,object_make,ph_firstname,ph_gender,ph_name,policy_fleet_flag,policy_insured_amount,policy_profitability,label,diff_days_between_claim_occ_report,claim_amount_claimed_total_new,Age_of_vehicle,time_since_claim
4493,MTR-845197269-02,Animals,0,VOLKSWAGEN,Simon,M,Baker,1,43777.0,High,0,6.0,9388.0,4.0,10.160373
708,MTR-277591109-02,Collision,0,AUDI,Melissa,other,Newcomb,0,,High,0,6.0,360.0,5.0,10.160373
18898,MTR-783277250-02,Collision,0,OTHER,Kathy,F,Thapa,0,59590.0,Low,0,0.0,9349.0,0.0,10.160373
4646,MTR-128255363-02,Collision,1,OPEL,Christian,other,Graham,1,50808.0,Very high,0,2.0,4063.0,1.0,10.160373
14657,MTR-251228076-02,Collision,0,VOLKSWAGEN,Jonathan,M,Ortega,0,,Very high,0,1.0,79.0,0.0,10.160373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16336,MTR-814990146-02,Collision,1,OTHER,Jaleela,F,el-Shahidi,0,30919.0,High,0,4.0,1693.0,6.0,4.555877
2636,MTR-890548432-02,Collision,1,OPEL,Jesse,other,De Leo,0,,Very high,0,24.0,106.0,8.0,4.553139
11649,MTR-273126648-02,Collision,1,OPEL,Keith,M,Jackson,0,,High,0,0.0,33.0,6.0,4.553139
17020,MTR-683897520-02,Other,0,OPEL,David,M,Arnold,1,,High,0,7.0,447.0,13.0,4.553139


In [36]:
val_data.to_csv('cleaned_validation_data.csv')

In [38]:
val_data.head()

Unnamed: 0,sys_claimid,claim_causetype,claim_location_urban_area,object_make,ph_firstname,ph_gender,ph_name,policy_fleet_flag,policy_insured_amount,policy_profitability,label,diff_days_between_claim_occ_report,claim_amount_claimed_total_new,Age_of_vehicle,time_since_claim
4493,MTR-845197269-02,Animals,0,VOLKSWAGEN,Simon,M,Baker,1,43777.0,High,0,6.0,9388.0,4.0,10.160373
708,MTR-277591109-02,Collision,0,AUDI,Melissa,other,Newcomb,0,,High,0,6.0,360.0,5.0,10.160373
18898,MTR-783277250-02,Collision,0,OTHER,Kathy,F,Thapa,0,59590.0,Low,0,0.0,9349.0,0.0,10.160373
4646,MTR-128255363-02,Collision,1,OPEL,Christian,other,Graham,1,50808.0,Very high,0,2.0,4063.0,1.0,10.160373
14657,MTR-251228076-02,Collision,0,VOLKSWAGEN,Jonathan,M,Ortega,0,,Very high,0,1.0,79.0,0.0,10.160373


In [39]:
val_data.sys_claimid.unique()

array(['MTR-845197269-02', 'MTR-277591109-02', 'MTR-783277250-02', ...,
       'MTR-273126648-02', 'MTR-683897520-02', 'MTR-367187208-02'],
      dtype=object)

In [40]:
# converting the claim id into a string to extract the desired numbers from the id
# checking the length of claim id to make sure all the claim id's have the same length
val_data['len_claimid']=  val_data['sys_claimid'].str.len() 
val_data['len_claimid'].unique()

array([16])

In [41]:
val_data['new_claimid'] = val_data['sys_claimid'].str[4:13]
val_data['new_claimid'].head()

4493     845197269
708      277591109
18898    783277250
4646     128255363
14657    251228076
Name: new_claimid, dtype: object

In [42]:
val_data.head()

Unnamed: 0,sys_claimid,claim_causetype,claim_location_urban_area,object_make,ph_firstname,ph_gender,ph_name,policy_fleet_flag,policy_insured_amount,policy_profitability,label,diff_days_between_claim_occ_report,claim_amount_claimed_total_new,Age_of_vehicle,time_since_claim,len_claimid,new_claimid
4493,MTR-845197269-02,Animals,0,VOLKSWAGEN,Simon,M,Baker,1,43777.0,High,0,6.0,9388.0,4.0,10.160373,16,845197269
708,MTR-277591109-02,Collision,0,AUDI,Melissa,other,Newcomb,0,,High,0,6.0,360.0,5.0,10.160373,16,277591109
18898,MTR-783277250-02,Collision,0,OTHER,Kathy,F,Thapa,0,59590.0,Low,0,0.0,9349.0,0.0,10.160373,16,783277250
4646,MTR-128255363-02,Collision,1,OPEL,Christian,other,Graham,1,50808.0,Very high,0,2.0,4063.0,1.0,10.160373,16,128255363
14657,MTR-251228076-02,Collision,0,VOLKSWAGEN,Jonathan,M,Ortega,0,,Very high,0,1.0,79.0,0.0,10.160373,16,251228076


In [43]:
categoricals = val_data.select_dtypes(np.object)
categoricals.head()

Unnamed: 0,sys_claimid,claim_causetype,object_make,ph_firstname,ph_gender,ph_name,policy_profitability,new_claimid
4493,MTR-845197269-02,Animals,VOLKSWAGEN,Simon,M,Baker,High,845197269
708,MTR-277591109-02,Collision,AUDI,Melissa,other,Newcomb,High,277591109
18898,MTR-783277250-02,Collision,OTHER,Kathy,F,Thapa,Low,783277250
4646,MTR-128255363-02,Collision,OPEL,Christian,other,Graham,Very high,128255363
14657,MTR-251228076-02,Collision,VOLKSWAGEN,Jonathan,M,Ortega,Very high,251228076


In [44]:
## Columns to drop before dummifying for the model
col_drop = ['sys_claimid','ph_firstname','ph_name']

In [45]:
categoricals.drop(columns=col_drop, axis = 1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [46]:
categoricals.head()

Unnamed: 0,claim_causetype,object_make,ph_gender,policy_profitability,new_claimid
4493,Animals,VOLKSWAGEN,M,High,845197269
708,Collision,AUDI,other,High,277591109
18898,Collision,OTHER,F,Low,783277250
4646,Collision,OPEL,other,Very high,128255363
14657,Collision,VOLKSWAGEN,M,Very high,251228076


In [51]:
categoricals = categoricals.drop('new_claimid',axis = 1)

In [50]:
### Dummifing the cleaned categorical data
categoricals = pd.get_dummies(categoricals, columns = ['claim_causetype', 'object_make', 'ph_gender', 'policy_profitability'],drop_first=True)
categoricals.head()

Unnamed: 0,new_claimid,claim_causetype_Collision,claim_causetype_Other,claim_causetype_Theft,claim_causetype_Weather,claim_causetype_missing,object_make_BMW,object_make_CITROEN,object_make_OPEL,object_make_OTHER,object_make_RENAULT,object_make_VOLKSWAGEN,ph_gender_M,ph_gender_other,policy_profitability_Low,policy_profitability_Neutral,policy_profitability_Very high,policy_profitability_Very low
4493,845197269,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
708,277591109,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
18898,783277250,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
4646,128255363,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
14657,251228076,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0


In [52]:
categoricals.head()

Unnamed: 0,claim_causetype_Collision,claim_causetype_Other,claim_causetype_Theft,claim_causetype_Weather,claim_causetype_missing,object_make_BMW,object_make_CITROEN,object_make_OPEL,object_make_OTHER,object_make_RENAULT,object_make_VOLKSWAGEN,ph_gender_M,ph_gender_other,policy_profitability_Low,policy_profitability_Neutral,policy_profitability_Very high,policy_profitability_Very low
4493,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
708,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
18898,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
4646,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0
14657,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0


In [53]:
numericals = val_data.select_dtypes(np.number)
numericals.head()

Unnamed: 0,claim_location_urban_area,policy_fleet_flag,policy_insured_amount,label,diff_days_between_claim_occ_report,claim_amount_claimed_total_new,Age_of_vehicle,time_since_claim,len_claimid
4493,0,1,43777.0,0,6.0,9388.0,4.0,10.160373,16
708,0,0,,0,6.0,360.0,5.0,10.160373,16
18898,0,0,59590.0,0,0.0,9349.0,0.0,10.160373,16
4646,1,1,50808.0,0,2.0,4063.0,1.0,10.160373,16
14657,0,0,,0,1.0,79.0,0.0,10.160373,16


In [54]:
numericals = numericals.drop('len_claimid',axis=1)
numericals.head()

Unnamed: 0,claim_location_urban_area,policy_fleet_flag,policy_insured_amount,label,diff_days_between_claim_occ_report,claim_amount_claimed_total_new,Age_of_vehicle,time_since_claim
4493,0,1,43777.0,0,6.0,9388.0,4.0,10.160373
708,0,0,,0,6.0,360.0,5.0,10.160373
18898,0,0,59590.0,0,0.0,9349.0,0.0,10.160373
4646,1,1,50808.0,0,2.0,4063.0,1.0,10.160373
14657,0,0,,0,1.0,79.0,0.0,10.160373


In [55]:
numericals.shape

(20000, 8)

In [56]:
## Checking for nulls
numericals.isna().sum()

claim_location_urban_area                0
policy_fleet_flag                        0
policy_insured_amount                 7703
label                                    0
diff_days_between_claim_occ_report       0
claim_amount_claimed_total_new           0
Age_of_vehicle                           0
time_since_claim                         0
dtype: int64

In [57]:
## Using the average of the column to fill the na values
numericals['policy_insured_amount'] = numericals['policy_insured_amount'].fillna(np.mean(numericals['policy_insured_amount']))

In [58]:
numericals.isna().sum()

claim_location_urban_area             0
policy_fleet_flag                     0
policy_insured_amount                 0
label                                 0
diff_days_between_claim_occ_report    0
claim_amount_claimed_total_new        0
Age_of_vehicle                        0
time_since_claim                      0
dtype: int64

In [59]:
### Final dataset for using further
validation_dataset_dummified = pd.concat((categoricals,numericals),axis = 1)

In [60]:
validation_dataset_dummified.head()

Unnamed: 0,claim_causetype_Collision,claim_causetype_Other,claim_causetype_Theft,claim_causetype_Weather,claim_causetype_missing,object_make_BMW,object_make_CITROEN,object_make_OPEL,object_make_OTHER,object_make_RENAULT,...,policy_profitability_Very high,policy_profitability_Very low,claim_location_urban_area,policy_fleet_flag,policy_insured_amount,label,diff_days_between_claim_occ_report,claim_amount_claimed_total_new,Age_of_vehicle,time_since_claim
4493,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,43777.0,0,6.0,9388.0,4.0,10.160373
708,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,60182.346589,0,6.0,360.0,5.0,10.160373
18898,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,59590.0,0,0.0,9349.0,0.0,10.160373
4646,1,0,0,0,0,0,0,1,0,0,...,1,0,1,1,50808.0,0,2.0,4063.0,1.0,10.160373
14657,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,60182.346589,0,1.0,79.0,0.0,10.160373


In [61]:
validation_dataset_dummified.to_csv('validation_dataset_dummified.csv')