# Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [None]:
# Read the in-vehicle coupon recommendation dataset from a relative CSV path.
csv_path = 'Dữ liệu 1/in-vehicle-coupon-recommendation.csv'
data = pd.read_csv(csv_path)

# Explore data

In [None]:
# Explore the attributes by previewing the first five rows.
data.head(n=5)

In [None]:
# Check the data type of every column.
data.dtypes

# Clean data

In [None]:
# Store temperature as a categorical column instead of its original dtype.
temperature_as_category = data['temperature'].astype('category')
data['temperature'] = temperature_as_category

In [None]:
# Confirm the dtype now recorded for the temperature column.
data.dtypes['temperature']

In [None]:
# Check for null values: info() lists the non-null count of each column.
data.info()

In [None]:
# There are many null values in the 'car' column, so it is removed.
# Rebinding `data` (rather than inplace=True) with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op instead of a KeyError.
data = data.drop(columns=['car'], errors='ignore')

In [None]:
# Change object-dtype columns to the categorical dtype.

df_obj = data.select_dtypes(include=['object']).copy()

object_columns = list(df_obj.columns)
for column_name in object_columns:
    categorical_column = data[column_name].astype('category')
    data[column_name] = categorical_column

# Display the resulting dtypes to verify the conversion.
data.dtypes

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too).
data.describe(include='all')

In [None]:
# Count the unique values of each int64 column; a column with exactly one
# unique value carries no information and is a candidate for removal.
int_columns = data.select_dtypes(include='int64')
int_columns.nunique()

In [None]:
# Remove 'toCoupon_GEQ5min' (flagged by the unique-value check in the previous cell).
# Rebinding `data` with errors='ignore' makes the cell idempotent, so running it
# twice no longer raises a KeyError for the already-removed column.
data = data.drop(columns=['toCoupon_GEQ5min'], errors='ignore')

# Data visualization

In [None]:
# Draw one horizontal count plot per categorical column, two plots per row.
# The grid is sized from the actual number of categorical columns, so zip()
# can never silently skip a column and no blank axes remain visible.
cat_columns = data.select_dtypes('category').columns
n_plot_cols = 2
n_plot_rows = max(1, (len(cat_columns) + n_plot_cols - 1) // n_plot_cols)  # ceiling division

fig, axes = plt.subplots(n_plot_rows,
                         n_plot_cols,
                         figsize=(20, 50 * n_plot_rows / 9),  # keeps the per-row height of the former 9-row, 50-inch figure
                         squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, cat_columns):
    sns.countplot(y=col,
                  data=data,
                  ax=ax,
                  palette="ch:.25",
                  order=data[col].value_counts().index)  # bars sorted by frequency

# Hide the axes left over when the columns do not fill the grid exactly.
for unused_ax in axes[len(cat_columns):]:
    unused_ax.set_visible(False)

plt.show()

# Preprocess data

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode each categorical column, then append the integer columns
# (which include the target 'Y') to form the final modelling table.
enc = OneHotEncoder(dtype='int64')

data_cat = data.select_dtypes(include=['category']).copy()
df_int = data.select_dtypes(include=['int64']).copy()

encoded_frames = []
for col in data_cat.columns:
    enc_results = enc.fit_transform(data_cat[[col]])
    # Prefix every category with its source column. Passing `enc.categories_`
    # (a list holding one array) directly as `columns` lost the feature name and
    # produced duplicate labels whenever two features share a category value.
    feature_names = ['{}_{}'.format(col, category) for category in enc.categories_[0]]
    # Reuse the source index so the later concat aligns row-for-row with df_int.
    data_0 = pd.DataFrame(enc_results.toarray(), columns=feature_names, index=data_cat.index)
    encoded_frames.append(data_0)

# Concatenate once instead of growing a DataFrame inside the loop.
if encoded_frames:
    data_enc = pd.concat(encoded_frames, axis=1)
else:
    data_enc = pd.DataFrame(index=data.index)

data_final = pd.concat([data_enc, df_int], axis=1)

In [None]:
# Display the encoded table (one-hot columns followed by the integer columns).
data_final

# Split data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column of the encoded table except the target 'Y'.
X = data_final.drop(['Y'], axis=1)

In [None]:
# Target vector: the 'Y' column of the encoded table.
target_column = 'Y'
y = data_final[target_column]

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Replace the pandas objects with their underlying NumPy arrays.
X_train, X_test = X_train.values, X_test.values
y_train, y_test = y_train.values, y_test.values

# Train model

In [None]:
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from time import time
from sklearn.metrics import accuracy_score

## Logistic Regression

In [None]:
# Logistic regression with the L-BFGS solver and a fixed seed.
# `multi_class='ovr'` was removed: the target Y has only the classes 0 and 1,
# where the option has no effect, and the parameter is deprecated in recent
# scikit-learn releases.
LR = LogisticRegression(random_state=0, solver='lbfgs')

In [None]:
# Fit the logistic regression model on the training split.
LR.fit(X_train, y_train)

In [72]:
# Predict on the test set and report how long the predict() call took, in milliseconds.
t0 = time()
y_pred_LR = LR.predict(X_test)
t1 = time()
elapsed_ms = 1000 * (t1 - t0)
print("Prediction time: {:.2f} ms".format(elapsed_ms))

Prediction time: 3.34 ms


In [None]:
# Per-class precision, recall and F1 of logistic regression on the test set.
report_LR = classification_report(y_test, y_pred_LR)
print(report_LR)

In [81]:
# Overall test-set accuracy of logistic regression.
accuracy_LR = accuracy_score(y_test, y_pred_LR)
print('Accuracy of Logistic Regression: {:.2f}'.format(accuracy_LR))

Accuracy of Logistic Regression: 0.69


## Decision Tree

In [None]:
# Decision tree with default hyperparameters. The seed is fixed because
# scikit-learn permutes features at every split, so an unseeded tree can differ
# between runs; 0 matches the seed used for logistic regression.
DTC = DecisionTreeClassifier(random_state=0)

In [None]:
# Fit the decision tree on the training split.
DTC.fit(X_train, y_train)

In [73]:
# Predict on the test set and report how long the predict() call took, in milliseconds.
t0 = time()
y_pred_DTC = DTC.predict(X_test)
t1 = time()
elapsed_ms = 1000 * (t1 - t0)
print("Prediction time: {:.2f} ms".format(elapsed_ms))

Prediction time: 2.50 ms


In [82]:
# Per-class precision, recall and F1 of the decision tree on the test set.
report_DTC = classification_report(y_test, y_pred_DTC)
print(report_DTC)

              precision    recall  f1-score   support

           0       0.66      0.65      0.66      1088
           1       0.74      0.75      0.75      1449

    accuracy                           0.71      2537
   macro avg       0.70      0.70      0.70      2537
weighted avg       0.71      0.71      0.71      2537


In [83]:
# Overall test-set accuracy of the decision tree.
accuracy_DTC = accuracy_score(y_test, y_pred_DTC)
print('Accuracy of Decision Tree: {:.2f}'.format(accuracy_DTC))

Accuracy of Decision Tree: 0.71


## K-Nearest Neighbors

In [None]:
# K-nearest-neighbours classifier; no arguments are passed, so the library defaults apply.
KNN = KNeighborsClassifier()

In [None]:
# Fit the k-nearest-neighbours model on the training split.
KNN.fit(X_train, y_train)

In [74]:
# Predict on the test set and report how long the predict() call took, in milliseconds.
t0 = time()
y_pred_KNN = KNN.predict(X_test)
t1 = time()
elapsed_ms = 1000 * (t1 - t0)
print("Prediction time: {:.2f} ms".format(elapsed_ms))

Prediction time: 589.50 ms


In [80]:
# Per-class precision, recall and F1 of k-nearest neighbours on the test set.
report_KNN = classification_report(y_test, y_pred_KNN)
print(report_KNN)

              precision    recall  f1-score   support

           0       0.64      0.59      0.61      1088
           1       0.71      0.75      0.73      1449

    accuracy                           0.68      2537
   macro avg       0.67      0.67      0.67      2537
weighted avg       0.68      0.68      0.68      2537


## Gaussian Naive Bayes

In [None]:
# Gaussian naive Bayes classifier; no arguments are passed, so the library defaults apply.
# NOTE(review): the features fed to this model come from one-hot encoding plus
# integer columns; confirm that a Gaussian likelihood is the intended choice.
GNB = GaussianNB()

In [None]:
# Fit the Gaussian naive Bayes model on the training split.
GNB.fit(X_train, y_train)

In [75]:
# Predict on the test set and report how long the predict() call took, in milliseconds.
t0 = time()
y_pred_GNB = GNB.predict(X_test)
t1 = time()
elapsed_ms = 1000 * (t1 - t0)
print("Prediction time: {:.2f} ms".format(elapsed_ms))

Prediction time: 8.66 ms


In [None]:
# Per-class precision, recall and F1 of Gaussian naive Bayes on the test set.
report_GNB = classification_report(y_test, y_pred_GNB)
print(report_GNB)

## Support Vector Machine

In [None]:
# RBF-kernel support vector classifier with probability estimates enabled.
# With probability=True scikit-learn runs an internal cross-validated calibration
# whose data shuffling is driven by random_state; leaving it as None made the
# probability estimates differ between runs, so the seed is fixed (0, matching
# the seed used for logistic regression).
# NOTE(review): probability=True slows down fit; drop it if predict_proba is
# never used elsewhere in the notebook.
SVM = SVC(kernel="rbf", random_state=0, probability=True, cache_size=500, gamma=0.1)

In [None]:
# Fit the support vector classifier on the training split.
SVM.fit(X_train, y_train)

In [76]:
# Predict on the test set and report how long the predict() call took, in milliseconds.
t0 = time()
y_pred_SVM = SVM.predict(X_test)
t1 = time()
elapsed_ms = 1000 * (t1 - t0)
print("Prediction time: {:.2f} ms".format(elapsed_ms))

Prediction time: 2914.60 ms


In [77]:
# Per-class precision, recall and F1 of the support vector classifier on the test set.
report_SVM = classification_report(y_test, y_pred_SVM)
print(report_SVM)

              precision    recall  f1-score   support

           0       0.76      0.67      0.71      1088
           1       0.77      0.84      0.81      1449

    accuracy                           0.77      2537
   macro avg       0.77      0.76      0.76      2537
weighted avg       0.77      0.77      0.77      2537


Accuracy of Support Vector Machine: 0.77
