In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list the full path of every file found under the Kaggle input directory.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# This notebook is an intro to the ML models mentioned below: 

# Basic model building for models mentioned below:
# 1. Logistic Regression
# 2. Random Forest
# 3. KNN
# 4. Naive Bayes


# **Reading the data and doing basic EDA**

In [None]:
# Load the training CSV from the Kaggle input directory into a DataFrame.
data=pd.read_csv("/kaggle/input/health-insurance-cross-sell-prediction/train.csv")

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Preview the first rows of the raw data.
data.head()

Checking for null values is very important, as is checking for duplicate rows in the dataset.

In [None]:
# Boolean mask flagging rows that fully repeat an earlier row, then show those rows.
dup = data.duplicated()
data.loc[dup]

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

import plotly as pt

**Checking the target variable on which the model will train: the dataset is not balanced with respect to the target, so a model trained on it can give biased results.**

In [None]:
# Class balance of the target: normalize=True reports proportions instead of raw counts.
data.Response.value_counts(normalize=True)

Checking the categorical variables in the dataset.

In [None]:
# Print the frequency table of every object-typed column.
object_columns = [name for name in data.columns if data[name].dtype == "object"]
for name in object_columns:
    print(data[name].value_counts())
    

In [None]:
# Annual premium against age. x and y are passed by keyword because seaborn >= 0.12
# no longer accepts them as positional arguments.
plt.figure(figsize=(15,15))
sns.lineplot(x="Age", y="Annual_Premium", data=data)

**Age and premium appear to be related: the premium tends to increase with age.
The lowest premiums are in the 20-40 age range; the premium rises after 50 and keeps rising steadily up to 80, where it peaks. Premium and age are very important factors when analysing insurance data.**

In [None]:
# Count of previously-insured vs. not, split by vehicle age. The column is passed as
# x= by keyword: in seaborn >= 0.12 the first positional argument is `data`, not `x`.
sns.countplot(x="Previously_Insured", hue="Vehicle_Age", data=data)

In [None]:
# Age against annual premium, coloured by region code. x and y are passed by keyword
# because seaborn >= 0.12 no longer accepts them as positional arguments.
sns.scatterplot(x="Annual_Premium", y="Age", hue="Region_Code", data=data)

**Region doesn't show a very strong connection to the Premium**

In [None]:
# Pairwise relationships between the columns of the frame.
# NOTE(review): this plots the full frame; on a large dataset it can be very slow —
# consider plotting a random sample instead.
sns.pairplot(data)

In [None]:
# Partition the column names by dtype: object-typed columns go to `cat`,
# every other column goes to `num`.
cat = [name for name in data.columns if data[name].dtype == "object"]
num = [name for name in data.columns if not (data[name].dtype == "object")]


In [None]:
# Show both column-name lists, one per line.
print(cat, num, sep="\n")

Checking the Data in more depth for the Numerical and Categorical Data.

In [None]:
# Summary statistics of the columns listed in `num`, transposed so each column is a row.
data[num].describe().T

In [None]:
# Summary statistics of the columns listed in `cat`, transposed so each column is a row.
data[cat].describe().T

In [None]:
# Horizontal box plots of the frame's columns on a single 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
data.boxplot(vert=0, ax=ax)
plt.show()

In [None]:
# Correlation heatmap of the raw data. The frame still contains object-typed
# columns at this point, and DataFrame.corr() raises on them in pandas >= 2.0,
# so the correlation is computed on the numeric columns only.
plt.figure(figsize=(10,8))
sns.heatmap(data.select_dtypes(include="number").corr(),annot=True)
plt.show()

In [None]:
# Show the categorical column names collected earlier.
print(cat)

To make the categorical data usable by the models, the categorical variables are converted to dummy (indicator) columns with `pd.get_dummies`, since there are only a few categorical columns. If the data had more categorical columns, scikit-learn's `OneHotEncoder` could be used as well.

In [None]:
# Replace the columns listed in `cat` with indicator columns; drop_first=True drops
# the first level of each so the indicators are not redundant.
# NOTE(review): this rebinds `data`, so the cell cannot be re-run without reloading the CSV.
data =pd.get_dummies(data, columns=cat,drop_first=True)

In [None]:
# Preview the first rows after encoding.
data.head()

In [None]:
# Correlation heatmap of the encoded frame, with the coefficients annotated.
corr_after_encoding = data.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_after_encoding, annot=True, ax=ax)

**The correlation matrix now shows that Previously_Insured and Vehicle_Damage are highly correlated, and that the age of the vehicle and the age of the person are also highly correlated.**

In [None]:
# Features and target. Besides the target, the row-identifier column `id` is removed
# when present: it carries no predictive signal and its large magnitude would
# dominate distance- and gradient-based models.
X = data.drop(columns=['Response']).drop(columns=['id'], errors='ignore')
y = data['Response']


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% for testing. The target is imbalanced, so the split is stratified on y
# to keep the class proportions the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)

# Naive Bayes Model

**Naive Bayes is a classification technique based on the Bayes theorem. It is a simple but powerful algorithm for predictive modeling under supervised learning algorithms. 
The technique behind Naive Bayes is easy to understand. 
Naive Bayes has higher accuracy and speed when we have large data points.**

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [None]:
# Fit a Gaussian Naive Bayes classifier with default settings on the training split.
NB_model = GaussianNB()
NB_model.fit(X_train, y_train)

In [None]:
# Naive Bayes on the training split: accuracy, confusion matrix, per-class report.
y_train_predict = NB_model.predict(X_train)
model_score = metrics.accuracy_score(y_train, y_train_predict)
print(model_score)
for summarise in (metrics.confusion_matrix, metrics.classification_report):
    print(summarise(y_train, y_train_predict))

In [None]:
# Class counts in the held-out test target.
y_test.value_counts()

In [None]:
# Naive Bayes on the held-out test split: accuracy, confusion matrix, per-class report.
y_test_predict = NB_model.predict(X_test)
model_scoreNB = metrics.accuracy_score(y_test, y_test_predict)
print(model_scoreNB)
for summarise in (metrics.confusion_matrix, metrics.classification_report):
    print(summarise(y_test, y_test_predict))

# Random forest

**Random forest, like its name implies, consists of a large number of individual decision trees that operate as an ensemble. Each individual tree in the random forest spits out a class prediction and the class with the most votes becomes our model’s prediction**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the training split; random_state fixes the seed.
RF_model=RandomForestClassifier(n_estimators=100,random_state=1)
RF_model.fit(X_train, y_train)

In [None]:
# Random forest on the training split: accuracy, confusion matrix, per-class report.
y_train_predict = RF_model.predict(X_train)
model_score = metrics.accuracy_score(y_train, y_train_predict)
print(model_score)
for summarise in (metrics.confusion_matrix, metrics.classification_report):
    print(summarise(y_train, y_train_predict))

In [None]:
# Random forest on the held-out test split: accuracy, confusion matrix, per-class report.
y_test_predict = RF_model.predict(X_test)
model_scoreRF = metrics.accuracy_score(y_test, y_test_predict)
print(model_scoreRF)
for summarise in (metrics.confusion_matrix, metrics.classification_report):
    print(summarise(y_test, y_test_predict))

# Logistic Regression

**Logistic regression is a statistical model that in its basic form uses a logistic function to model a binary dependent variable, although many more complex extensions exist. In regression analysis, logistic regression (or logit regression) is estimating the parameters of a logistic model (a form of binary regression)**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression fitted on standardised features. The raw columns differ by
# orders of magnitude (e.g. Annual_Premium next to 0/1 indicator columns), which can
# keep the default solver from converging within its default iteration budget;
# scaling inside a pipeline also ensures the scaler is fitted on training data only.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
# Accuracy is taken from the predictions already computed instead of predicting twice.
model_score = metrics.accuracy_score(y_test, y_predict)
print(model_score)

In [None]:
# Logistic regression on the training split: accuracy, confusion matrix, per-class report.
y_train_predict = model.predict(X_train)
model_score = metrics.accuracy_score(y_train, y_train_predict)
print(model_score)
for summarise in (metrics.confusion_matrix, metrics.classification_report):
    print(summarise(y_train, y_train_predict))

In [None]:
# Logistic regression on the held-out test split: accuracy, confusion matrix, per-class report.
y_test_predict1 = model.predict(X_test)
model_scoreLR = metrics.accuracy_score(y_test, y_test_predict1)
print(model_scoreLR)
for summarise in (metrics.confusion_matrix, metrics.classification_report):
    print(summarise(y_test, y_test_predict1))

# Cross validation on Naive Bayes

**Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample. The procedure has a single parameter called k that refers to the number of groups that a given data sample is to be split into. As such, the procedure is often called k-fold cross-validation**

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the Naive Bayes estimator on the training split;
# `scores` holds one score per fold.
scores = cross_val_score(NB_model, X_train, y_train, cv=10)
scores

In [None]:
# Same 10-fold cross-validation, run on the test split; overwrites `scores`.
# NOTE(review): this cross-validates within the test data rather than scoring the
# model fitted on the training split — confirm this is what is intended.
scores = cross_val_score(NB_model, X_test, y_test, cv=10)
scores

# KNN

**k-NN is a type of classification where the function is only approximated locally and all computation is deferred until function evaluation. Since this algorithm relies on distance for classification, if the features represent different physical units or come in vastly different scales then normalizing the training data can improve its accuracy dramatically**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# k-NN classifies by distance, so the features are standardised first; otherwise the
# columns with the largest numeric range would dominate the neighbour search. The
# pipeline keeps the same fit/predict/score interface as the bare classifier.
KNN_model = make_pipeline(StandardScaler(), KNeighborsClassifier())
KNN_model.fit(X_train,y_train)

In [None]:
# k-NN on the training split: accuracy, confusion matrix, per-class report.
y_train_predict = KNN_model.predict(X_train)
model_score = metrics.accuracy_score(y_train, y_train_predict)
print(model_score)
for summarise in (metrics.confusion_matrix, metrics.classification_report):
    print(summarise(y_train, y_train_predict))

In [None]:
# Side-by-side comparison of the four models on held-out test accuracy.
# `model_score` holds k-NN's *training* accuracy at this point, so k-NN's test
# accuracy is computed here to put all four models on the same footing.
model_scoreKNN = KNN_model.score(X_test, y_test)
df = pd.DataFrame({'NaiveBayes': [model_scoreNB], 'RandomForest': [model_scoreRF],'KNN':[model_scoreKNN],"LogisticRegression":[model_scoreLR]})

In [None]:
# Display the model comparison table.
df

# Conclusion:
1. All the models performed almost similarly
2. The accuracy of a model doesn't guarantee its success, so the next step is to tweak the approach, e.g. use SMOTE, since the class to be found is rare and the data is highly imbalanced.
3. This gives a very basic idea of which models will work.
