In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df  = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [3]:
df.shape  

(381109, 12)

In [4]:
df.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [5]:
df=df.drop("id",axis=1)

In [6]:
df['Driving_License']=df['Driving_License'].astype('object')
df['Previously_Insured']=df['Previously_Insured'].astype('object')
df['Response']=df['Response'].astype('object')

# Data Cleaning

In [7]:
def clean_data(df):
    """to clean the data pass the dataframe itself """
    print("Columns with all null values are")
    print(df.columns[df.isnull().all()])
    
    # Drop columns which have all NaN values
    c=df.columns[df.isnull().all()]
    df.drop(c, inplace=True, axis=1)
    
    # Drop Columns which have more than 90% NAs
    df.dropna(axis=1, thresh=int(0.1 * df.shape[0]),inplace=True)
    
    # Find rows with missing values greater than 50%
    print(df.isnull().sum(axis=1))
    # Drop rows with missing values greater than 50%
    df = df[df.isnull().sum(axis=1) <=(df.shape[1] * 0.5) ]
    
    return df
    
    

In [8]:
clean_data(df)

Columns with all null values are
Index([], dtype='object')
0         0
1         0
2         0
3         0
4         0
         ..
381104    0
381105    0
381106    0
381107    0
381108    0
Length: 381109, dtype: int64


Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0
...,...,...,...,...,...,...,...,...,...,...,...
381104,Male,74,1,26.0,1,1-2 Year,No,30170.0,26.0,88,0
381105,Male,30,1,37.0,1,< 1 Year,No,40016.0,152.0,131,0
381106,Male,21,1,30.0,1,< 1 Year,No,35118.0,160.0,161,0
381107,Female,68,1,14.0,0,> 2 Years,Yes,44617.0,124.0,74,0


In [9]:
df_num=df.select_dtypes(exclude='object')
df_cat=df.select_dtypes(include='object')

df_num.columns , df_cat.columns

(Index(['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel',
        'Vintage'],
       dtype='object'),
 Index(['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age',
        'Vehicle_Damage', 'Response'],
       dtype='object'))

In [10]:
from sklearn.impute import KNNImputer
def fill_numeric_data(df,neighbors = 2):
    """ provide dataframe and neighbors , by default it is 2 """
    imputer = KNNImputer(n_neighbors=neighbors, weights="uniform")
    cols = df.columns
    filled_array = imputer.fit_transform(df)
    df_filled = pd.DataFrame(filled_array, columns = cols)
    return df_filled

In [11]:
fill_numeric_data(df_num)

Unnamed: 0,Age,Region_Code,Annual_Premium,Policy_Sales_Channel,Vintage
0,44.0,28.0,40454.0,26.0,217.0
1,76.0,3.0,33536.0,26.0,183.0
2,47.0,28.0,38294.0,26.0,27.0
3,21.0,11.0,28619.0,152.0,203.0
4,29.0,41.0,27496.0,152.0,39.0
...,...,...,...,...,...
381104,74.0,26.0,30170.0,26.0,88.0
381105,30.0,37.0,40016.0,152.0,131.0
381106,21.0,30.0,35118.0,160.0,161.0
381107,68.0,14.0,44617.0,124.0,74.0


In [12]:
df_num=df_num.drop(['Vintage','Policy_Sales_Channel','Region_Code'],axis=1)

In [13]:
df_cat.columns , df_num.columns

(Index(['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age',
        'Vehicle_Damage', 'Response'],
       dtype='object'),
 Index(['Age', 'Annual_Premium'], dtype='object'))

# Normalization

In [14]:
from sklearn.preprocessing import PowerTransformer
def normalize_num_data(df_num):
    pt=PowerTransformer()
    df_num_pt=pt.fit_transform(df_num)
    df_num_pt=pd.DataFrame(df_num_pt)
    df_num_pt.columns=df_num.columns
    return df_num_pt
def encode_cat_data(df_cat):    
    df_cat=df_cat.drop('Response',axis=1)
    df_cat_dum=pd.get_dummies(df_cat,columns=['Gender','Driving_License','Previously_Insured','Vehicle_Damage'],drop_first=True)
    v_age = {'> 2 Years':0, '< 1 Year':1, '1-2 Year':2}
    df_cat_dum['Vehicle_Age'] = df['Vehicle_Age'].map(lambda x : v_age[x])
    df_cat_dum.rename(columns={'Gender_Male':'Gender', 'Driving_License_1':'Driving_License','Previously_Insured_1':'Previously_Insured', 'Vehicle_Damage_Yes':'Vehicle_Damage'},inplace=True)
    return df_cat_dum

In [15]:
normalize_num_data(df_num).head()

Unnamed: 0,Age,Annual_Premium
0,0.571865,0.612207
1,1.765186,0.243028
2,0.728579,0.498644
3,-1.452684,-0.030214
4,-0.506661,-0.094107


In [16]:
encode_cat_data(df_cat).head()

Unnamed: 0,Vehicle_Age,Gender,Driving_License,Previously_Insured,Vehicle_Damage
0,0,1,1,0,1
1,2,1,1,0,0
2,0,1,1,0,1
3,1,1,1,1,0
4,1,0,1,1,0


In [17]:
df1 = pd.concat([encode_cat_data(df_cat),normalize_num_data(df_num)],axis=1)
df1

Unnamed: 0,Vehicle_Age,Gender,Driving_License,Previously_Insured,Vehicle_Damage,Age,Annual_Premium
0,0,1,1,0,1,0.571865,0.612207
1,2,1,1,0,0,1.765186,0.243028
2,0,1,1,0,1,0.728579,0.498644
3,1,1,1,1,0,-1.452684,-0.030214
4,1,0,1,1,0,-0.506661,-0.094107
...,...,...,...,...,...,...,...
381104,2,1,1,1,0,1.712271,0.057079
381105,1,1,1,1,0,-0.413070,0.589296
381106,1,1,1,1,0,-1.452684,0.328903
381107,0,0,1,0,1,1.541047,0.827191


In [18]:
df1['Response'] = df['Response']

In [19]:
X=pd.concat([encode_cat_data(df_cat),normalize_num_data(df_num)],axis=1)
y=df['Response']

# Data Imbalance Treatment using smote NC

In [20]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC

In [21]:
def balance_data(X,y):
    y=y.astype('int64')
    xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.3,stratify=y)
    smotenc = SMOTENC([0,1,2,3,4,5])
    X_oversample,y_oversample = smotenc.fit_resample(xtrain,ytrain)
    print(y_oversample.value_counts())
    
    return X_oversample,y_oversample,xtest,ytest
    

In [22]:
X_oversample,y_oversample,xtest,ytest = balance_data(X,y)

1    234079
0    234079
Name: Response, dtype: int64


In [24]:
xtest[0:3]

Unnamed: 0,Vehicle_Age,Gender,Driving_License,Previously_Insured,Vehicle_Damage,Age,Annual_Premium
60086,2,1,1,1,1,1.712271,0.52886
26458,2,0,1,0,1,-0.000884,0.630109
197738,1,0,1,0,1,-0.155656,0.368439


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  accuracy_score , classification_report , confusion_matrix , plot_roc_curve , f1_score
from sklearn.metrics import precision_recall_fscore_support as score
import pickle

In [26]:
from sklearn.linear_model import LogisticRegression
#from sklearn.model_selection import KFold,cross_val_score
#log=LogisticRegression()
#log.fit(X_oversample,y_oversample)
#pickle.dump(log, open('model.pkl','wb'))

#model = pickle.load(open('model.pkl','rb'))
#print(model.predict([[4, 300, 500]]))
#ypred=model.predict(xtest[1:2])
#print("ypred",ypred)
        
from sklearn.ensemble import GradientBoostingClassifier
GBoost=GradientBoostingClassifier(n_estimators=100)
GBoost.fit(X_oversample,y_oversample)

pickle.dump(GBoost, open('gboost_model.pkl','wb'))

model1 = pickle.load(open('gboost_model.pkl','rb'))
ypred=model1.predict(xtest)
print("ypred",ypred)

ypred [0 1 1 ... 0 0 0]


In [28]:
ypred[1]

1

# Flask Implementation 

In [29]:
from flask import Flask, request,render_template
import pickle
import pandas as pd
#from Normalization import normalize_num_data

app = Flask(__name__)
model = pickle.load(open('gboost_model.pkl', 'rb'))

@app.route('/')
def home():
    return render_template("home1.html")

@app.route('/prediction',methods=['POST'])
def prediction():
    if(request.method == "POST"):

        data = [request.form.values()]
        final_data = pd.DataFrame(data,columns=['Vehicle_Age','Gender','Driving_Licence','Previously_Insured','Vehicle_Damage','Age','Annual_Premium'])
        predi = model.predict(final_data)
        print(predi)
 
            
           
            
    return render_template("home1.html",prediction_text ="Prediction : {}".format("Yes\n Customer might buy Insurance" if predi==1 else "No\n Customer might not buy Insurance"))
        

if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [03/Feb/2021 15:00:01] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Feb/2021 15:00:01] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [03/Feb/2021 15:00:18] "[37mPOST /prediction HTTP/1.1[0m" 200 -


[1]


127.0.0.1 - - [03/Feb/2021 15:00:55] "[37mPOST /prediction HTTP/1.1[0m" 200 -


[1]


127.0.0.1 - - [03/Feb/2021 15:01:22] "[37mPOST /prediction HTTP/1.1[0m" 200 -


[0]
