In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import seaborn as sns
sns.set_theme(style="ticks", color_codes=True)

In [None]:
# Read the raw in-vehicle coupon recommendation CSV from the Kaggle input folder.
csv_path = '/kaggle/input/invehicle-coupon-recommendation/in-vehicle-coupon-recommendation.csv'
df = pd.read_csv(csv_path)

# Cell output: (rows, columns) of the raw data.
df.shape

# Data Exploration
Let's take a peek into the data and explore its variables. The dataset is a supervised learning dataset with over 12,000 instances and 26 attributes; this means there are input variables X and an output variable y.

In [None]:
# Preview the first rows to get a feel for the attributes and their values.
df.head()

In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes 

It seems that the data has a few numerical datatypes and the rest are string objects; however, all of the data can be treated as categorical, with a mix of binary and ordinal variables.

In [None]:
# Treat temperature as a categorical feature instead of an integer one
# (note: astype('category') creates an unordered categorical).
df = df.astype({'temperature': 'category'})

# Cleaning The Data

In [None]:
# Check for missing values: info() lists the non-null count and dtype of every column.
df.info()

There are some missing values in several columns, and the 'car' variable has only 108 non-null values; more than 99% of its values are NaN. This variable carries too little information, so it is best to remove it completely from the data to avoid inaccuracies in the modeling.

In [None]:
# Frequency of each non-missing value in the 'car' column.
df["car"].value_counts()

In [None]:
# Remove the almost entirely empty 'car' column.
df = df.drop(columns=['car'])

Empty values in categorical data can be removed or replaced with the most frequent value in each column.

Let's iterate over the columns of the pandas table that contain empty (NaN) values; for each of these columns the code finds the most frequent value and fills the empty entries with it.

In [None]:
# For every column that contains NaN, look up its most frequent value and
# fill all of the gaps with a single fillna call.
cols_with_nan = df.columns[df.isna().any()]
most_frequent = {name: df[name].value_counts().idxmax() for name in cols_with_nan}
df = df.fillna(most_frequent)

In [None]:
# Change object (string) columns to the categorical dtype.
# Only the column names are needed, so no copy of the object columns is made,
# and all conversions happen in one astype call that returns a new frame.
object_cols = df.select_dtypes(include=['object']).columns
df = df.astype({name: 'category' for name in object_cols})

# Cell output: dtypes after the conversion.
df.dtypes

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric (categorical) columns.
df.describe(include='all')

In [None]:
# Count the distinct values in each int64 column.
df.select_dtypes('int64').nunique()

From the description above we can tell that 'toCoupon_GEQ5min' has only one unique value, which won't help much in the encoding of the categorical variables. Therefore, it's better to drop that column.

In [None]:
# Remove the constant 'toCoupon_GEQ5min' column.
df = df.drop('toCoupon_GEQ5min', axis=1)

Let's plot the distribution charts of all the categorical datatypes.

In [None]:
# Draw one horizontal count plot per categorical column on a two-column grid.
# The grid is sized from the number of categorical columns so that no column
# is silently skipped (zip stops at the shorter input) if the column set changes.
cat_cols = df.select_dtypes('category').columns
n_plot_cols = 2
n_plot_rows = max(1, -(-len(cat_cols) // n_plot_cols))  # ceiling division, at least one row

# 50/9 inches per row reproduces the original 20x50 figure for a 9-row grid.
fig, axes = plt.subplots(n_plot_rows, n_plot_cols,
                         figsize=(20, n_plot_rows * 50 / 9), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, cat_cols):
    # Order the bars from the most to the least frequent category.
    sns.countplot(y=col, data=df, ax=ax,
                  palette="ch:.25", order=df[col].value_counts().index);

# Hide grid cells that have no column to show (odd number of columns).
for ax in axes[len(cat_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

We are going to create feature vectors for our modeling by using the LabelEncoder and OneHotEncoder.

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

enc = OneHotEncoder(dtype='int64')

df_cat = df.select_dtypes(include=['category']).copy()
df_int = df.select_dtypes(include=['int64']).copy()

# One-hot encode each categorical feature on its own and collect the pieces,
# then concatenate once instead of growing a frame inside the loop.
encoded_parts = []
for col in df_cat.columns:
    enc_results = enc.fit_transform(df_cat[[col]])
    # categories_ holds one array per fitted feature; a single feature is
    # fitted here, so take element 0. Prefixing with the feature name keeps
    # labels unique: several features share values such as 'never' or '1~3'.
    col_names = [f"{col}_{category}" for category in enc.categories_[0]]
    encoded_parts.append(
        # Reuse the source index so the later concat with df_int aligns row by row.
        pd.DataFrame(enc_results.toarray(), columns=col_names, index=df_cat.index)
    )

if encoded_parts:
    df_enc = pd.concat(encoded_parts, axis=1)
else:
    # No categorical columns: keep an empty frame with the matching index.
    df_enc = pd.DataFrame(index=df_cat.index)

# Encoded features first, integer columns (ending with the target 'Y') last.
df_final = pd.concat([df_enc, df_int], axis=1)

#source: https://pbpython.com/categorical-encoding.html

In [None]:
# Display the final feature table: one-hot encoded columns followed by the integer columns.
df_final

# Data Modeling

In [None]:
import sklearn as sk
from sklearn import metrics
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
# Hold out 30% of the rows as a test set. Stratifying on the target 'Y'
# keeps the class proportions the same in both splits.
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(
    df_final,
    test_size=.3,
    random_state=42,
    shuffle=True,
    stratify=df_final['Y'],
)

In [None]:
# Training split: feature matrix X (independent variables) is every column
# except the last; target vector y (dependent variable) is the last column.
X_train = train_set.iloc[:, :-1].to_numpy()
y_train = train_set.iloc[:, -1].to_numpy()

# Test split, sliced the same way.
X_test = test_set.iloc[:, :-1].to_numpy()
y_test = test_set.iloc[:, -1].to_numpy()

## Logistic Regression

In [None]:
# Fit a logistic regression on the training split.
# The deprecated multi_class='ovr' argument is dropped: with a binary 0/1
# target, one-vs-rest and the default both fit the same single binary model.
LR = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, y_train)

#Predict the response for test dataset
y_pred_LR = LR.predict(X_test)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred_LR))

## Decision Tree

In [None]:
# Fit a decision tree on the training split.
# random_state is fixed so the tree, and therefore the report below,
# is the same on every run.
DTC = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

#Predict the response for test dataset
y_pred_DTC = DTC.predict(X_test)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred_DTC))

## K-Nearest Neighbors

In [None]:
# Build a k-nearest-neighbours classifier with default settings and fit it.
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred_KNN = KNN.predict(X_test)

# Per-class precision, recall and F1 on the test split.
report_KNN = classification_report(y_test, y_pred_KNN)
print(report_KNN)

## Linear Discriminant Analysis

In [None]:
# Build a linear discriminant analysis classifier with default settings and fit it.
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred_LDA = LDA.predict(X_test)

# Per-class precision, recall and F1 on the test split.
report_LDA = classification_report(y_test, y_pred_LDA)
print(report_LDA)

## Gaussian Naive Bayes

In [None]:
# Build a Gaussian naive Bayes classifier with default settings and fit it.
GNB = GaussianNB()
GNB.fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred_GNB = GNB.predict(X_test)

# Per-class precision, recall and F1 on the test split.
report_GNB = classification_report(y_test, y_pred_GNB)
print(report_GNB)

## Support Vector Machine

In [None]:
# Fit an RBF-kernel support vector classifier on the training split.
# probability=True enables predict_proba, which the ROC and precision-recall
# cells rely on. The probability calibration uses random shuffling, so
# random_state is fixed to make those probabilities reproducible.
SVM = SVC(kernel="rbf", random_state=0, probability=True, cache_size=500, gamma=0.1).fit(X_train, y_train)

#Predict the response for test dataset
y_pred_SVM = SVM.predict(X_test)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred_SVM))

SVM has the highest accuracy among the models. I chose to use the RBF kernel as we do not know whether the data is linearly separable or not.

For machine learning models, achieving a good prediction model is extremely crucial. It involves achieving a balance between underfitting and overfitting, i.e. a trade-off between bias and variance.
<p>
When it comes to classification, the precision-recall trade-off is a fundamentally important metric to investigate.
</p>
<p>
Precision is the ratio between the True Positives and all predicted Positives. For this problem it is the share of drivers who actually used a coupon out of all the drivers the model predicted would use one.
</p>
<p>
Recall is the measure of our model correctly identifying True Positives. Thus, of all the drivers who actually accepted a coupon, recall tells us how many we correctly identified as accepting a coupon.
</p>

In [None]:
from sklearn.metrics import confusion_matrix

# Raw count matrix for the SVM (rows = true class, columns = predicted class);
# reused by the heatmap cell.
cm_SVM = confusion_matrix(y_test, y_pred_SVM)

# Cell output: the same counts as a labelled table, with row and column totals.
pd.crosstab(
    y_test,
    y_pred_SVM,
    rownames=['Truth'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    confusion_matrix : array-like
        Square matrix of integer counts; it is wrapped in a DataFrame whose
        rows and columns are both labelled with ``class_names``.
    class_names : list
        Class labels, in the same order as the rows/columns of the matrix.
    figsize : tuple, default (10, 7)
        Size of the new matplotlib figure.
    fontsize : int, default 14
        Font size of the tick labels on both axes.

    Raises
    ------
    ValueError
        If seaborn raises ``ValueError`` while drawing the heatmap with the
        integer format ``"d"``. The original error is chained as the cause.
    """
    df_cm = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    fig = plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError as err:
        # Do not leave the empty figure open, and keep the real cause attached.
        plt.close(fig)
        raise ValueError("Confusion matrix values must be integers.") from err
    # Row labels stay horizontal; column labels are tilted so long names fit.
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    heatmap.set_ylabel('Truth')
    heatmap.set_xlabel('Prediction')

# Plot the SVM confusion matrix. The labels must follow the sorted class order
# used by confusion_matrix (0 = not accepted first, then 1 = accepted).
print_confusion_matrix(cm_SVM,["Coupon Not Accepted", "Coupon Accepted"])

A confusion matrix helps us gain an insight into how correct our predictions were and how they hold up against the actual values.

From our train and test data, we already know that our test data consisted of 3806 data points. That is the value in the 3rd row and 3rd column of the table. We also notice that there are actual and predicted values. The actual values are the number of data points that were originally categorized as 0 or 1. The predicted values are the number of data points the SVM model predicted as 0 or 1.<br>

The actual values are:
<ul>
    <li>The drivers who actually did not use a coupon = 1643</li>
    <li>The drivers who actually did use a coupon = 2163</li>
</ul><br>
The predicted values are:
<ul>
    <li>Number of drivers who were predicted as not using a coupon = 1410</li>
    <li>Number of drivers who were predicted as using a coupon = 2396</li>
</ul>

All the values we obtain above have a term. Let’s go over them one by one:
<ol>
    <li>The cases in which the drivers actually did not use a coupon and our model also predicted them as not using one are called the <b>True Negatives</b>. For our matrix, True Negatives = 1056.</li>
    <li>The cases in which the drivers actually used a coupon and our model also predicted them as using one are called the <b>True Positives</b>. For our matrix, True Positives = 1809.</li>
    <li>However, there are some cases where the drivers actually did not use a coupon, but our model has predicted that they did use one. This kind of error is the <b>Type I Error</b> and we call these values <b>False Positives</b>. For our matrix, False Positives = 587.</li>
    <li>Similarly, there are some cases where the drivers actually used a coupon, but our model has predicted that they did not use one. This kind of error is the <b>Type II Error</b> and we call these values <b>False Negatives</b>. For our matrix, False Negatives = 354.</li>
</ol>

## ROC Curves(Receiver Operating Characteristic Curve)

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

# Scores for the ROC analysis: the second column of predict_proba,
# i.e. the probability of the class stored at SVM.classes_[1].
y_pred_proba = SVM.predict_proba(X_test)[:,1]

# Area under the ROC curve and the points of the curve itself.
roc_auc = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, thresholds_ = roc_curve(y_test, y_pred_proba)

roc_fig, roc_ax = plt.subplots(figsize=(10, 8))
# Diagonal reference line of a classifier with no skill.
roc_ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
# ROC curve of the SVM, with the AUC shown in the legend.
roc_ax.plot(fpr, tpr, marker='.', label='AUC = '+str(roc_auc))
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic Curve')
# loc=4 places the legend in the lower right corner.
roc_ax.legend(loc=4)
plt.show()

## Precision-Recall Curve (PRC)

In [None]:
# Precision and recall of the SVM at every probability threshold.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
prc_auc = auc(recall, precision)

# A classifier with no skill has a precision equal to the share of positive
# (Y == 1) rows in the test set, so the baseline is computed from y_test
# instead of being fixed at 0.5.
no_skill = np.mean(y_test == 1)

plt.figure(figsize = (10,8))
plt.plot([0, 1], [no_skill, no_skill],'k--', label = 'No Skill')
plt.plot(recall, precision, label = 'Precision-Recall AUC = '+str(prc_auc))
plt.legend(loc=1)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

<ul>
<li>At a threshold of 1 the model predicts that no driver uses a coupon, so recall is 0; this is the left end of the curve, where precision is set to 1 by convention.</li>
<li>At a threshold of 0 the model predicts that every driver uses a coupon, so recall is 1 and precision falls to the share of drivers who actually used one; this is the right end of the curve, where the model makes no distinction between drivers who use coupons and drivers who do not.</li>
<li>The rest of the curve shows the values of Precision and Recall for the threshold values between 0 and 1. Our aim is to make the curve as close to (1, 1) as possible, meaning good precision and recall.</li>
<li>Similar to ROC, the area with the curve and the axes as the boundaries is the Area Under Curve (AUC). Consider this area as a metric of a good model. The AUC ranges from 0 to 1. Therefore, we should aim for a high value of AUC.</li>
</ul>

I just finished my Hon degree in IT at York University, and this dataset is what I chose for my project. I decided to share the experience I had with you, and I am certainly open to new ideas and learning opportunities.

Information Wo Wiase!