# AI Training with all models 


In [1]:
#Imports:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import autokeras as ak
import xgboost as xgb
import catboost as cat

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE 
from tensorflow.keras.models import load_model
from sklearn.utils import resample,shuffle
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neural_network


In [40]:
# Methods to test the predictions

def count_points(pred, gold):
    df = pd.merge(pred, gold, on=['userID', 'itemID'], suffixes=('_pred', '_gold'))
    df['points'] = df.apply(_compute_points_for_row, axis=1)
    return df['points'].sum()

def _compute_points_for_row(row):
    y_pred, y_gold = row.prediction_pred, row.prediction_gold
    if y_pred == y_gold:
        # one point if "no order" (0) is predicted correctly; three points if order week is predicted correctly
        return 1 if y_pred == 0 else 3
    # one point if order is predicted correctly (but not the correct week), otherwise zero points
    return 1 if (y_pred > 0 and y_gold > 0) else 0

def preprocessData(df):
    labels = df["label"] # generating a label dataset
    data = df.drop(['label'], axis = 1) #dropping label
    data = df.drop(['order'], axis = 1) #dropping order
    data = df.drop(['date'], axis = 1) #dropping date
    
    enc = OneHotEncoder(handle_unknown='ignore') #onehotencoder
    cols = ["feature_1", "feature_2", "feature_4"] #columns to onehot encode
    enc.fit(data[cols])
    encoder_df = pd.DataFrame(enc.transform(data[cols]).toarray()) #encoding
    data = data[["userID", "itemID", "brand", "feature_3", "feature_5", 
                 "date_of_month", "total_purchase_times","purchased_frequency","category", "weekday",]].join(encoder_df) #merging
    
    lb = preprocessing.LabelBinarizer()
    #lenc = enc.fit(np_labels)
    lb = lb.fit(labels.to_numpy())
    labels_enc = lb.transform(labels.to_numpy())
    
    return labels_enc, data, lb

In [45]:
train_dec = pd.read_csv("train_bef_dec.csv")
print(len(train_dec))

y_train, x_train, binarizer = preprocessData(train_dec)

x_train.head()


631986


Unnamed: 0,userID,itemID,brand,feature_3,feature_5,date_of_month,total_purchase_times,purchased_frequency,category,weekday,...,8,9,10,11,12,13,14,15,16,17
0,0,20664,408,284,66,5,9,0.6,2346,4,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,11250,20664,408,284,66,19,26,0.6,2346,4,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1421,20664,408,284,66,1,40,0.6,2346,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,17725,20664,408,284,66,1,11,0.6,2346,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,24216,20664,408,284,66,30,3,0.6,2346,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [46]:
test_dec = pd.read_csv("test_dec.csv")
test_dec = test_dec.sort_values(by=["userID"])
print(len(test_dec))
y_test, x_test, binarizer = preprocessData(train_dec)

x_train.head()

113478


Unnamed: 0,userID,itemID,brand,feature_3,feature_5,date_of_month,total_purchase_times,purchased_frequency,category,weekday,...,8,9,10,11,12,13,14,15,16,17
0,0,20664,408,284,66,5,9,0.6,2346,4,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,11250,20664,408,284,66,19,26,0.6,2346,4,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1421,20664,408,284,66,1,40,0.6,2346,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,17725,20664,408,284,66,1,11,0.6,2346,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,24216,20664,408,284,66,30,3,0.6,2346,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [47]:
sub_dec = pd.read_csv("submission_dec.csv")
print(len(sub_dec))
sub_dec.head()

9745


Unnamed: 0,userID|itemID|prediction
0,0|20664|
1,0|28231|
2,13|2690|
3,15|1299|
4,15|20968|


In [48]:
sm = SMOTE(random_state=42)

x_train, y_train=  sm.fit_resample(x_train, y_train)

X_train, X_val, y_train, y_val= train_test_split(x_train, y_train, test_size=0.3, random_state=42)
print(len(X_train.shape), len(x_test.shape), len(X_val.shape))




2 2 2


In [49]:
X_train_arr = X_train.to_numpy()
X_val_arr = X_val.to_numpy()
X_test_arr = x_test.to_numpy()

y_val_n = binarizer.inverse_transform(y_val)
y_train_n = binarizer.inverse_transform(y_train)
y_test_n = binarizer.inverse_transform(y_test)

## Models

In [None]:
# Autokeras
model = ak.AutoModel(
    inputs=[ak.StructuredDataInput()],
    outputs=[ak.ClassificationHead()],
    max_trials=5,
    overwrite = True
)

model.fit(
    x=X_train_arr, y=y_train_arr, epochs=100, batch_size=64, validation_data= [X_val_arr, y_val_arr]
)


res_train_ak = model.predict(X_train_arr)
res_test_ak = model.predict(X_test_arr)
res_val_ak = model.predict(X_val_arr)


model = model.export_model()


try:
    model.save("model_autokeras_ded_class", save_format="tf")
except Exception:
    model.save("model_autokeras_dec_class.h5")

In [54]:
def baseline(classifier,x_train_dt,x_test_dt,y_train_dt,y_test_dt):
    baseline=classifier
    baseline.fit(x_train_dt,y_train_dt)
    stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    pred=baseline.predict(x_test_dt)
    print('CV score with default parameters:{}'.format(cross_val_score(classifier,x_train_dt, y_train_dt, cv=stratified_10_fold_cv, scoring='accuracy').mean()))
    print('classification_report on test set with default parameters:\n')
    print(classification_report(y_test_dt,pred))
    # cnf_matrix = confusion_matrix(y_test_dt,pred)    
    #np.set_printoptions(precision=2)
   # plot_confusion_matrix(cnf_matrix,classes=labels,title='confusion matrix:default')
    return baseline


def naivebayes (x_train,x_test,y_train,y_test):
    gnb=GaussianNB()
    gnb.fit(x_train, y_train)
    stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    accuracy=cross_val_score(gnb,x_train,y_train,cv=cv)
    print('Mean CV score is: {}'.format(accuracy.mean()))
    params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
    grid_search = GridSearchCV( estimator=gnb,param_grid=params_NB, cv=cv,verbose=1, scoring='accuracy') 
    grid_search.fit(x_train, y_train)
    print("The best parameter is:{}".format(grid_search.best_params_))
    print("The best validation score is:{}".format(grid_search.best_score_))
    gnb.set_params(var_smoothing=grid_search.best_params_)
    prediction = gnb.predict(x_test)
    print(classification_report(y_test, prediction))
    cm=confusion_matrix(y_test,prediction)    
    
    return gnb

def decisionTree(x_train,x_test,y_train,y_test):
    
    parameters={
            'criterion':['gini','entropy'],
            'max_depth':[2,3,4,5,6,7,8]
            }
    dtree=DecisionTreeClassifier()
    stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    grid_search=GridSearchCV(dtree,parameters,scoring='accuracy',cv=stratified_10_fold_cv)
    grid_search.fit(x_train,y_train) 
    print('CV score with best parameters:{}'.format(cross_val_score(grid_search.best_estimator_,x_train, y_train, cv=stratified_10_fold_cv, scoring='accuracy').mean()))
    print('best parameters:{}'.format(grid_search.best_params_))
    pred=grid_search.predict(x_test)
    print('classification_report with best parameters:\n')
    print(classification_report(y_test,pred))
    #confusion matrix
    dt_cnf_matrix = confusion_matrix(y_test,pred)    
    np.set_printoptions(precision=2)    
    
    return grid_search.best_estimator_

def KNN(x_train,x_test,y_train,y_test):
    
    parameters={
           'n_neighbors' : list(range(1,30)),
        'p': [1,2],
        'weights' : ["uniform", "distance"]
            }
    knn_2 = KNeighborsClassifier()
    stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    Grid_search_knn = GridSearchCV(knn_2, parameters, cv=stratified_10_fold_cv)
    best_model = Grid_search_knn.fit(x_test,y_test)
    #Print The value of best Hyperparameters
    cv_score=cross_val_score(Grid_search_knn.best_estimator_,x_train, y_train, cv=stratified_10_fold_cv, scoring='accuracy').mean()
    print("Mean CV score is {} with params {}".format(cv_score, Grid_search_knn.best_params_))
    pred=Grid_search_knn.predict(x_test)
    #print('classification_report on test set with best parameters:\n')
   # print(classification_report(y_test,pred))
  
    np.set_printoptions(precision=2)
   
    return Grid_search_knn.best_estimator_

In [None]:
baseline(GaussianNB(),X_train_arr,X_val_arr,y_train_n,y_val_n)
print('\n----------after hyperparameter tuning----------\n')
model_gnb = naivebayes(X_train_arr,X_val_arr,y_train_n,y_val_n)

CV score with default parameters:0.2380318792058515
classification_report on test set with default parameters:

              precision    recall  f1-score   support

           0       0.40      0.21      0.28    184791
           1       0.23      0.34      0.27    184964
           2       0.21      0.59      0.31    184613
           3       0.23      0.02      0.04    184917
           4       0.20      0.03      0.05    185162

    accuracy                           0.24    924447
   macro avg       0.26      0.24      0.19    924447
weighted avg       0.26      0.24      0.19    924447


----------after hyperparameter tuning----------

Mean CV score is: 0.2380318792058515
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [None]:
baseline(DecisionTreeClassifier(),X_train_arr,X_val_arr,y_train_n,y_val_n)
print('\n----------after hyperparameter tuning----------\n')
model_dt = decisionTree(X_train_arr,X_val_arr,y_train_n,y_val_n)

In [None]:
baseline(KNeighborsClassifier(),X_train_arr,X_val_arr,y_train_n,y_val_n)
print('\n----------after hyperparameter tuning----------\n')
model_knn = KNN(X_train_arr,X_val_arr,y_train_n,y_val_n)

In [62]:
pickle.dump(model_gnb, open("model_gnb", 'wb'))
pickle.dump(model_dt, open("model_dt", 'wb'))
pickle.dump(model_knn, open("model_knn", 'wb'))



model_gnb.predict(X_test_arr)

array([0, 4, 0, ..., 1, 1, 1])

In [None]:
loaded_model = pickle.load(open(filename, 'rb'))