In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below -- consider narrowing the filter.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as sns

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/jobathon-analytics-vidhya/train.csv",
        "../input/jobathon-analytics-vidhya/test.csv",
    )
)

**Checking for new values in the test data:
Here we check whether every class in the test data also appears in the train data, or whether any new class has been added.**

In [None]:
# Count the (column, value) pairs that occur in the test data but never in the
# train data. The ID, Reco_Policy_Premium and Response columns are skipped.
#
# Missing values are dropped before comparing: NaN never compares equal to
# itself, so the previous `val not in train_data[col].unique()` test reported
# NaN as a "new" class for every test column that contains gaps.
SKIPPED_COLS = ("ID", "Reco_Policy_Premium", "Response")

count = 0
for col in train_data.columns:
    if col in SKIPPED_COLS:
        continue
    # Build the train vocabulary once per column instead of once per test value.
    train_values = set(train_data[col].dropna().unique())
    for val in test_data[col].dropna().unique():
        if val not in train_values:
            print(col, val)
            count += 1

**Here we learn that Region_Code has many new classes.**

In [None]:
# Distinct Region_Code values in train and in test, followed by the running
# `count` of test values that were not found in the train data.
(
    train_data["Region_Code"].nunique(),
    test_data["Region_Code"].nunique(),
    count,
)

**There are 225 new classes in total.**

In [None]:
import seaborn as sns
# Heatmap of the boolean null mask of the train data: one column per feature,
# row labels hidden.
null_mask = train_data.isnull()
plt.figure(figsize=(15, 10))
sns.heatmap(data=null_mask, yticklabels=False, cmap="viridis")

**Holding_Policy_Duration, Holding_Policy_Type and Health Indicator contain null values, so we need to check how many values are missing.**

In [None]:
# Number and percentage of null values in each column of the train data;
# only columns that actually have missing values are displayed.
null_counts = train_data.isnull().sum()
count = null_counts.round(2)
percent = (null_counts / train_data.shape[0] * 100).round(2)
data = (
    pd.concat([count, percent], axis=1)
    .reset_index()
    .rename(columns={0: 'Missing Values Count', 1: 'Missing Values %'})
)
data[data['Missing Values Count'] != 0]

**Null value imputation**

In [None]:
# Number of missing Health Indicator values in train and in test.
(
    train_data["Health Indicator"].isna().sum(),
    test_data["Health Indicator"].isna().sum(),
)

**The Health Indicator values are supplied by users; a null value means the user did not enter any data, so we impute it with the mode of the observed values.**

In [None]:
# Impute missing Health Indicator values with the most frequent category.
#
# The mode is computed on the training data only and reused for the test data,
# so both frames receive the same fill value and no statistic is derived from
# the test set.
#
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on a column selection: that pattern is chained
# assignment, which warns on recent pandas and does not update the DataFrame
# once Copy-on-Write is active.
health_indicator_mode = train_data['Health Indicator'].mode()[0]
train_data['Health Indicator'] = train_data['Health Indicator'].fillna(health_indicator_mode)
test_data['Health Indicator'] = test_data['Health Indicator'].fillna(health_indicator_mode)

**Null values in Holding_Policy_Duration and Holding_Policy_Type mean the person is not an existing customer, so we can fill the NAs with zero.**

In [None]:
# Fill missing holding-policy duration/type with 0 in both frames.
# The filled Series is assigned back to the column rather than using
# `fillna(..., inplace=True)` on a column selection, which is chained
# assignment and does not update the DataFrame under pandas Copy-on-Write.
for frame in (train_data, test_data):
    for policy_col in ("Holding_Policy_Duration", "Holding_Policy_Type"):
        frame[policy_col] = frame[policy_col].fillna(0)

# '14+' is not numeric; replace it with '15' in the train data.
train_data["Holding_Policy_Duration"] = train_data["Holding_Policy_Duration"].replace('14+', '15')

In [None]:
# Remaining missing values per column in the test data.
test_data.isna().sum()

In [None]:
# Count of each Response value per Accomodation_Type.
# The style is set BEFORE the axes are created: seaborn styles are applied
# through matplotlib rcParams when an Axes is built, so setting "whitegrid"
# afterwards did not affect this figure.
sns.set_style("whitegrid")
# plt.subplots returns (figure, axes); unpack both instead of binding the
# whole tuple to `ax`, and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x="Accomodation_Type", hue='Response', data=train_data, ax=ax)

**Customers in both rented and owned accommodation show the same response pattern.**

In [None]:
# Violin plot of Reco_Policy_Premium for each Response value.
sns.catplot(
    data=train_data,
    x="Response",
    y="Reco_Policy_Premium",
    kind="violin",
)

In [None]:
# Outline-only bar chart of Is_Spouse counts: fully transparent fill with
# thick edges coloured from the "dark" palette.
outline_style = dict(
    facecolor=(0, 0, 0, 0),
    linewidth=5,
    edgecolor=sns.color_palette("dark", 3),
)
ax = sns.countplot(x="Is_Spouse", data=train_data, **outline_style)

**Applying Label Encoder to both train & test data**

In [None]:
from sklearn.preprocessing import LabelEncoder
lef=LabelEncoder()
train_data["City_Code"]=LabelEncoder().fit_transform(train_data["City_Code"])
train_data["Accomodation_Type"]=LabelEncoder().fit_transform(train_data["Accomodation_Type"])
train_data["Reco_Insurance_Type"]=lef.fit_transform(train_data["Reco_Insurance_Type"])
train_data["Is_Spouse"]=train_data["Is_Spouse"].map({"Yes":1,"No":0})
train_data["Health Indicator"]=lef.fit_transform(train_data["Health Indicator"])
train_data["Holding_Policy_Duration"]=train_data["Holding_Policy_Duration"].astype(float).astype(int)

In [None]:
print(train_data["Is_Spouse"])

In [None]:
test_data["City_Code"]=LabelEncoder().fit_transform(test_data["City_Code"])
test_data["Accomodation_Type"]=LabelEncoder().fit_transform(test_data["Accomodation_Type"])
test_data["Reco_Insurance_Type"]=lef.fit_transform(test_data["Reco_Insurance_Type"])
test_data["Is_Spouse"]=test_data["Is_Spouse"].map({"Yes":1,"No":0})
test_data["Holding_Policy_Duration"]=test_data["Holding_Policy_Duration"].replace('14+','15')
test_data["Health Indicator"]=lef.fit_transform(test_data["Health Indicator"])
test_data["Holding_Policy_Duration"]=test_data["Holding_Policy_Duration"].astype(float).astype(int)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target (Response) from the features, then hold out 30% of the
# rows for evaluation with a fixed seed.
# NOTE(review): every remaining column, including "ID", is kept as a feature --
# confirm this is intended; an identifier normally carries no signal.
Y = train_data["Response"]
X = train_data.drop(columns="Response")
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=4
)

**Model Building**

In [None]:
## Hyperparameter optimization using RandomizedSearchCV,GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost

**USE HYPERPARAMETER OPTIMIZATION WITH RANDOMIZED SEARCH CV TO SELECT THE BEST PARAMETERS FOR XGBOOST**

In [None]:
# XGBoost classifier built from an explicit, fully spelled-out parameter set.
xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.7,
    gamma=0.4,
    gpu_id=-1,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.05,
    max_delta_step=0,
    max_depth=6,
    min_child_weight=1,
    monotone_constraints='()',
    n_estimators=100,
    n_jobs=0,
    num_parallel_tree=1,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1,
    tree_method='exact',
    validate_parameters=1,
    verbosity=None,
)
classifier = xgboost.XGBClassifier(**xgb_params)

In [None]:
# Train the XGBoost classifier on the training split.
classifier.fit(X=X_train, y=Y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated score of the XGBoost classifier on the training split.
XGB_accuracies = cross_val_score(classifier, X_train, Y_train, cv=10)
mean_xgb_accuracy = XGB_accuracies.mean()
print("Mean_XGB_Acc : ", mean_xgb_accuracy)

In [None]:
# Predict labels for the held-out split with the fitted XGBoost classifier.
xg_preds = classifier.predict(X_test)

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 of the XGBoost predictions on the held-out split.
xgb_report = metrics.classification_report(Y_test, xg_preds)
print(xgb_report)

In [None]:
# Missing values per feature in the training split.
X_train.isna().sum()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings, fitted on the training split
# (`fit` returns the estimator itself).
clf = DecisionTreeClassifier().fit(X_train, Y_train)


In [None]:
# 10-fold cross-validated score of the default decision tree on the training split.
DT_accuracies = cross_val_score(clf, X_train, Y_train, cv=10)
mean_dt_accuracy = DT_accuracies.mean()
print("Mean_DT_Acc : ", mean_dt_accuracy)

In [None]:
# Predict labels for the held-out split with the default decision tree.
DT_pred = clf.predict(X=X_test)

In [None]:
# Accuracy of the default decision tree on the held-out split.
dt_accuracy = metrics.accuracy_score(Y_test, DT_pred)
print("Accuracy:", dt_accuracy)

In [None]:
# Shallow decision tree (max depth 3) using the entropy split criterion.
clf1 = DecisionTreeClassifier(criterion="entropy", max_depth=3)
clf1 = clf1.fit(X_train, Y_train)

# 10-fold cross-validated score on the training split.
DTE_accuracies = cross_val_score(estimator=clf1, X=X_train, y=Y_train, cv=10)
print("Mean_DTE_Acc : ", DTE_accuracies.mean())
# Output recorded from an earlier run (was pasted into the cell as code):
#   Mean_DTE_Acc :  0.7579245872963088

# Evaluate the entropy tree itself. This line previously called `clf.predict`,
# i.e. it scored the default tree from the earlier cell instead of `clf1`.
DTE_pred = clf1.predict(X_test)
print("Accuracy:", metrics.accuracy_score(Y_test, DTE_pred))

**KNN MODEL**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 7, fitted on the training split.
# NOTE(review): the features are passed in unscaled; KNN is distance based, so
# consider standardising them -- confirm against the intended modelling setup.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, Y_train)
# Estimator repr recorded from an earlier run. It was pasted into the cell as
# code, where it built and immediately discarded a second classifier:
#   KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
#                        metric_params=None, n_jobs=None, n_neighbors=7, p=2,
#                        weights='uniform')

# 10-fold cross-validated score on the training split. The label used to read
# "knn_DTE_Acc", a leftover from the decision-tree cell.
knn_accuracies = cross_val_score(estimator=knn, X=X_train, y=Y_train, cv=10)
print("Mean_KNN_Acc : ", knn_accuracies.mean())
# Output recorded from an earlier run: knn_DTE_Acc :  0.7254399736618911

# Accuracy and per-class report on the held-out split.
knn_pred = knn.predict(X_test)
print("KNN_Accuracy:", metrics.accuracy_score(Y_test, knn_pred))
# Output recorded from an earlier run: KNN_Accuracy: 0.7281362594169669
print(metrics.classification_report(Y_test, knn_pred))

In [None]:
# Predict a label for every row of the test data with the fitted KNN model
# and display the resulting array.
test_data_pred = knn.predict(X=test_data)
test_data_pred


**Final Submission**

In [None]:
# Build the submission frame: the first column of the test data plus the
# predicted label for each row.
sub_fn = test_data.iloc[:, 0:1].copy()
# Assign the prediction array directly so values are placed by position.
# Wrapping it in a DataFrame made pandas align on index labels, which silently
# produces NaN as soon as test_data's index is not the default 0..n-1 range.
# NOTE(review): the column is named "Prediction" while the target column is
# "Response" -- confirm which name the submission format expects.
sub_fn["Prediction"] = test_data_pred
sub_fn