In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Read the training and test CSV files from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [8]:
# Show up to 100 rows and 100 columns before pandas truncates a display.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [9]:
# (rows, columns) of the training frame.
train.shape

(58592, 44)

In [10]:
# (rows, columns) of the test frame.
test.shape

(39063, 43)

In [11]:
# Preview the first five training rows.
train.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,max_torque,max_power,engine_type,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,is_parking_camera,rear_brakes_type,displacement,cylinder,transmission_type,gear_box,steering_type,turning_radius,length,width,height,gross_weight,is_front_fog_lights,is_rear_window_wiper,is_rear_window_washer,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,No,No,No,Yes,No,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,No,No,No,No,No,No,No,Yes,No,No,No,Yes,0,0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,No,No,No,Yes,No,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,No,No,No,No,No,No,No,Yes,No,No,No,Yes,0,0
2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,No,No,No,Yes,No,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,No,No,No,No,No,No,No,Yes,No,No,No,Yes,0,0
3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,113Nm@4400rpm,88.50bhp@6000rpm,1.2 L K12N Dualjet,2,Yes,Yes,No,Yes,Yes,Drum,1197,4,Automatic,5,Electric,4.8,3995,1735,1515,1335,Yes,No,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0
4,ID00005,0.596403,0.11,0.634615,C5,34738,2,A,M3,Petrol,91Nm@4250rpm,67.06bhp@5500rpm,1.0 SCe,2,No,No,No,No,Yes,Drum,999,3,Automatic,5,Electric,5.0,3731,1579,1490,1155,No,No,No,No,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2,0


In [13]:
# Share of each target class in percent; the positive class is rare, so the
# data set is heavily imbalanced.
train['is_claim'].value_counts().div(train.shape[0]).mul(100)

0    93.603222
1     6.396778
Name: is_claim, dtype: float64

In [14]:
# Number of missing values in each column.
train.isna().sum()

policy_id                           0
policy_tenure                       0
age_of_car                          0
age_of_policyholder                 0
area_cluster                        0
population_density                  0
make                                0
segment                             0
model                               0
fuel_type                           0
max_torque                          0
max_power                           0
engine_type                         0
airbags                             0
is_esc                              0
is_adjustable_steering              0
is_tpms                             0
is_parking_sensors                  0
is_parking_camera                   0
rear_brakes_type                    0
displacement                        0
cylinder                            0
transmission_type                   0
gear_box                            0
steering_type                       0
turning_radius                      0
length      

In [15]:
# Number of rows that are exact duplicates of an earlier row.
train.duplicated().sum()

0

In [16]:
# Separate the target from the predictors. The identifier, the target itself
# and a handful of columns that are not used in this analysis are left out of X.
y = train['is_claim']
X = train.drop(
    ['policy_id', 'is_claim', "area_cluster", "make",
     "model", "fuel_type", "airbags", "cylinder"],
    axis=1,
)

In [17]:
# Keep the policy identifiers aside for the submission file, then remove the
# same columns that were dropped from the training features.
policy_id = test['policy_id']
test = test.drop(
    columns=['policy_id', "area_cluster", "make",
             "model", "fuel_type", "airbags", "cylinder"]
)

In [18]:
# One-hot encode the training features, omitting the first level of each
# encoded column.
X = pd.get_dummies(data=X, drop_first=True)


In [19]:
# One-hot encode the test features, then force the exact column set and order
# of the already-encoded training matrix X. Encoding the two frames
# independently can produce different dummy columns whenever a category level
# is missing from (or only present in) one of the files, which would break or
# misalign the scaler and model that are fitted on X's columns.
test=pd.get_dummies(test,drop_first=True)
# Columns absent from the test data are added as all-zero indicators; columns
# unknown to X are discarded. When both frames already agree this is a no-op.
test=test.reindex(columns=X.columns,fill_value=0)

In [20]:
# (rows, columns) of the feature matrix after one-hot encoding.
X.shape

(58592, 64)

In [21]:
## RandomOverSampler to handle imbalanced data

from imblearn.over_sampling import RandomOverSampler

In [22]:
# Random over-sampler for the minority class. sampling_strategy=0.8 requests a
# minority/majority ratio of 0.8 after resampling; random_state pins the draw.
# NOTE(review): the name `os` would shadow the standard-library module if that
# module is ever imported in this notebook.
os =  RandomOverSampler(random_state=42, sampling_strategy=0.8)

In [23]:
# Over-sample the full feature matrix and target.
# NOTE(review): this runs before the train/validation split further down, so
# copies of the same minority row can land on both sides of that split and
# inflate the validation score. Consider splitting first and resampling only
# the training part.
X_res, y_res = os.fit_resample(X, y)

In [24]:
# Shapes of the resampled features and target.
X_res.shape,y_res.shape

((98719, 64), (98719,))

In [25]:
# Standardise the features.
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the resampled training features, then apply
# the same transform to them and to the competition test features.
scaler = StandardScaler().fit(X_res)
X_res = scaler.transform(X_res)
test = scaler.transform(test)

In [26]:
from collections import Counter

# Class counts before and after over-sampling.
print(f'Original dataset shape {Counter(y)}')
print(f'Resampled dataset shape {Counter(y_res)}')

Original dataset shape Counter({0: 54844, 1: 3748})
Resampled dataset shape Counter({0: 54844, 1: 43875})


In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Hold out 20% of the resampled, scaled rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_res,
    y_res,
    test_size=0.2,
    random_state=1,
)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBRFClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
import lightgbm

from sklearn.metrics import f1_score

In [30]:
# Random forest with fixed, hand-picked hyper-parameters (no search is run in
# this cell); random_state makes the fit repeatable.
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=12,
    max_features='log2',
    criterion='gini',
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=1,
)

In [31]:
# Train Model
clf.fit(X_train,y_train)

# F1 score on the rows the model was fitted on.
train_pred=clf.predict(X_train)
train_f1=f1_score(y_train,train_pred)

# F1 score on the held-out split. The metric is F1, not accuracy, so the names
# and printed labels say so. Note that this split was carved out of the
# over-sampled data, so it is not an estimate on the original class balance.
test_pred=clf.predict(X_test)
test_f1=f1_score(y_test,test_pred)

# Previous (misleading) names kept as aliases in case a later cell reads them.
train_accuracy,test_accuracy=train_f1,test_f1

print('F1 score for Training set is')
print( 100*train_f1)
print('----------------------------------')
print('F1 score for Testing set is')
print( 100*test_f1)

Accuracy for Training set is
77.4588734320278
----------------------------------
Accuracy for Testing set is
74.41494149414942


In [60]:
# Predict the claim label for every row of the scaled competition test set.
# NOTE(review): this cell's execution count (60) jumps from 31, so cells that
# ran in between are not shown here. Confirm `clf` is still the model fitted
# above by running the notebook top to bottom on a fresh kernel.
y_pred=clf.predict(test)

In [61]:
#Creating the submission file
# Build both columns in one constructor call: policy ids saved before they
# were dropped from the test frame, paired with the predicted labels.
submission = pd.DataFrame({'policy_id': policy_id, 'is_claim': y_pred})
# `index` is a boolean parameter; pass False explicitly to leave the row index
# out of the file instead of relying on None being falsy.
submission.to_csv('submission3.csv', index=False)

In [62]:
# (rows, columns) of the submission frame.
submission.shape

(39063, 2)

In [63]:
# Display the submission frame (pandas truncates the middle rows).
submission

Unnamed: 0,policy_id,is_claim
0,ID58593,1
1,ID58594,0
2,ID58595,0
3,ID58596,0
4,ID58597,0
...,...,...
39058,ID97651,1
39059,ID97652,1
39060,ID97653,0
39061,ID97654,0
