# AI Training with all models 


In [1]:
#Imports:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import autokeras as ak
import xgboost as xgb
import catboost as cat
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE 
from tensorflow.keras.models import load_model
from sklearn.utils import resample,shuffle
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neural_network


In [2]:
# Methods to test the predictions

def count_points(pred, gold):
    df = pd.merge(pred, gold, on=['userID', 'itemID'], suffixes=('_pred', '_gold'))
    df['points'] = df.apply(_compute_points_for_row, axis=1)
    return df['points'].sum()

def _compute_points_for_row(row):
    y_pred, y_gold = row.prediction_pred, row.prediction_gold
    if y_pred == y_gold:
        # one point if "no order" (0) is predicted correctly; three points if order week is predicted correctly
        return 1 if y_pred == 0 else 3
    # one point if order is predicted correctly (but not the correct week), otherwise zero points
    return 1 if (y_pred > 0 and y_gold > 0) else 0

def preprocessData(df):
    labels = df["label"] # generating a label dataset
    data = df.drop(['label'], axis = 1) #dropping label
    data = df.drop(['order'], axis = 1) #dropping order
    data = df.drop(['date'], axis = 1) #dropping date
    
    enc = OneHotEncoder(handle_unknown='ignore') #onehotencoder
    cols = ["feature_1", "feature_2", "feature_4"] #columns to onehot encode
    enc.fit(data[cols])
    encoder_df = pd.DataFrame(enc.transform(data[cols]).toarray()) #encoding
    data = data[["userID", "itemID", "brand", "feature_3", "feature_5" 
                 ,]].join(encoder_df) #merging
                 #"date_of_month", "total_purchase_times","purchased_frequency","category", "weekday",]].join(encoder_df) #merging
    
    lb = preprocessing.LabelBinarizer()
    #lenc = enc.fit(np_labels)
    lb = lb.fit(labels.to_numpy())
    labels_enc = lb.transform(labels.to_numpy())
    
    return labels_enc, data, lb, enc

In [8]:
#train_dec = pd.read_csv("train_bef_dec.csv")
train_dec = pd.read_csv("orders_before_dec_labeled.csv", sep = ",")

df_items = pd.read_csv("items.csv", sep = "|") #items

train_dec = pd.merge(train_dec, df_items, on="itemID") #merging the dataset on itemID 

print(len(train_dec))

y_train, x_train, binarizer, enc = preprocessData(train_dec)

x_train.head()


767131


Unnamed: 0,userID,itemID,brand,feature_3,feature_5,0,1,2,3,4,...,12,13,14,15,16,17,18,19,20,21
0,15215,19979,724,503,17,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,39567,19979,724,503,17,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,12385,19979,724,503,17,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,27356,19979,724,503,17,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,26707,19979,724,503,17,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:
test_dec = pd.read_csv("test_dec.csv")
test_dec = test_dec.sort_values(by=["userID"])
print(len(test_dec))
y_test, x_test, binarizer = preprocessData(train_dec)

x_train.head()

113478


ValueError: too many values to unpack (expected 3)

In [5]:
pred_dec = pd.read_csv("prediction_dec.csv", sep = ",")
print(len(pred_dec))
pred_dec.head()

21817


Unnamed: 0,date,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,category,weekday,date_of_month,total_purchase_times,purchased_frequency,prediction,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,2020/12/11,0,20664,1,408,4,0,284,0,66,2346,4,11,14,0.71,,,,
1,2020/12/11,17516,3148,1,408,4,0,284,0,66,2346,4,11,14,0.71,,,,
2,2020/12/11,0,28231,1,193,4,3,468,3,108,3898,4,11,14,0.19,,,,
3,2020/12/30,0,28231,5,1496,10,0,348,0,95,3898,2,30,49,19.65,,,,
4,2020/12/28,0,28231,1,186,4,0,28,0,81,3898,0,28,33,3.9,,,,


In [6]:
name = 'submission_dec'

df_submission = pd.read_csv( name+".csv", sep = "|")

result_pred = pd.merge(df_submission, df_items, on="itemID")

result_pred = result_pred.drop(['categories'], axis = 1)
data = result_pred.drop(['prediction'], axis = 1)
cols = ["feature_1", "feature_2", "feature_4"]

encoder_df = pd.DataFrame(enc.transform(data[cols]).toarray())
submission = data[["userID","itemID", "brand", "feature_3", "feature_5"]].join(encoder_df)

submission

Unnamed: 0,userID,itemID,brand,feature_3,feature_5,0,1,2,3,4,...,12,13,14,15,16,17,18,19,20,21
0,0,20664,408,284,66,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,23363,20664,408,284,66,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0,28231,193,468,108,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,13,2690,406,491,66,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,15,1299,1056,474,108,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9740,46049,17984,449,207,45,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9741,46069,29992,280,484,44,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9742,46117,8847,143,46,69,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9743,46124,19677,1006,491,154,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
sm = SMOTE(random_state=42)

x_train, y_train=  sm.fit_resample(x_train, y_train)

X_train, X_test, y_train, y_test= train_test_split(x_train, y_train, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val= train_test_split(X_train, y_train, test_size=0.3, random_state=42)

print(len(X_train.shape), len(X_test.shape), len(X_val.shape))




2 2 2


In [11]:
X_train_arr = X_train.to_numpy()
X_val_arr = X_val.to_numpy()
X_test_arr = X_test.to_numpy()

y_val_n = binarizer.inverse_transform(y_val)
y_train_n = binarizer.inverse_transform(y_train)
y_test_n = binarizer.inverse_transform(y_test)

## Models

In [12]:
# Autokeras
model = ak.AutoModel(
    inputs=[ak.StructuredDataInput()],
    outputs=[ak.ClassificationHead()],
    max_trials=5,
    overwrite = True
)

model.fit(
    x=X_train_arr, y=y_train, epochs=100, batch_size=64, validation_data= [X_val_arr, y_val]
)


res_train_ak = model.predict(X_train_arr)
res_test_ak = model.predict(X_test_arr)
res_val_ak = model.predict(X_val_arr)


model = model.export_model()


try:
    model.save("model_autokeras_dec_class", save_format="tf")
except Exception:
    model.save("model_autokeras_dec_class.h5")


Search: Running Trial #1

Hyperparameter    |Value             |Best Value So Far 
structured_data...|False             |?                 
structured_data...|False             |?                 
structured_data...|2                 |?                 
structured_data...|32                |?                 
structured_data...|0                 |?                 
structured_data...|32                |?                 
classification_...|0                 |?                 
optimizer         |adam              |?                 
learning_rate     |0.001             |?                 

Epoch 1/100
Epoch 2/100
Epoch 3/100
 2817/28637 [=>............................] - ETA: 39s - loss: 1.3853 - accuracy: 0.3651

KeyboardInterrupt: 

In [65]:
def baseline(classifier,x_train_dt,x_test_dt,y_train_dt,y_test_dt):
    baseline=classifier
    baseline.fit(x_train_dt,y_train_dt)
    stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    pred=baseline.predict(x_test_dt)
    print('CV score with default parameters:{}'.format(cross_val_score(classifier,x_train_dt, y_train_dt, cv=stratified_10_fold_cv, scoring='accuracy').mean()))
    print('classification_report on test set with default parameters:\n')
    print(classification_report(y_test_dt,pred))
    # cnf_matrix = confusion_matrix(y_test_dt,pred)    
    #np.set_printoptions(precision=2)
   # plot_confusion_matrix(cnf_matrix,classes=labels,title='confusion matrix:default')
    return baseline


def naivebayes (x_train,x_test,y_train,y_test):
    gnb=GaussianNB()
    gnb.fit(x_train, y_train)
    stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    accuracy=cross_val_score(gnb,x_train,y_train,cv=cv)
    print('Mean CV score is: {}'.format(accuracy.mean()))
    params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
    grid_search = GridSearchCV( estimator=gnb,param_grid=params_NB, cv=cv,verbose=1, scoring='accuracy') 
    grid_search.fit(x_train, y_train)
    print("The best parameter is:{}".format(grid_search.best_params_))
    print("The best validation score is:{}".format(grid_search.best_score_))
    gnb.set_params(var_smoothing=grid_search.best_params_)
    prediction = gnb.predict(x_test)
    print(classification_report(y_test, prediction))
    cm=confusion_matrix(y_test,prediction)    
    
    return gnb

def decisionTree(x_train,x_test,y_train,y_test):
    
    parameters={
            'criterion':['gini','entropy'],
            'max_depth':[2,3,4,5,6,7,8]
            }
    dtree=DecisionTreeClassifier()
    stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    grid_search=GridSearchCV(dtree,parameters,scoring='accuracy',cv=stratified_10_fold_cv)
    grid_search.fit(x_train,y_train) 
    print('CV score with best parameters:{}'.format(cross_val_score(grid_search.best_estimator_,x_train, y_train, cv=stratified_10_fold_cv, scoring='accuracy').mean()))
    print('best parameters:{}'.format(grid_search.best_params_))
    pred=grid_search.predict(x_test)
    print('classification_report with best parameters:\n')
    print(classification_report(y_test,pred))
    #confusion matrix
    dt_cnf_matrix = confusion_matrix(y_test,pred)    
    np.set_printoptions(precision=2)    
    
    return grid_search.best_estimator_

def KNN(x_train,x_test,y_train,y_test):
    
    parameters={
           'n_neighbors' : list(range(1,30)),
        'p': [1,2],
        'weights' : ["uniform", "distance"]
            }
    knn_2 = KNeighborsClassifier()
    stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    Grid_search_knn = GridSearchCV(knn_2, parameters, cv=stratified_10_fold_cv)
    best_model = Grid_search_knn.fit(x_test,y_test)
    #Print The value of best Hyperparameters
    cv_score=cross_val_score(Grid_search_knn.best_estimator_,x_train, y_train, cv=stratified_10_fold_cv, scoring='accuracy').mean()
    print("Mean CV score is {} with params {}".format(cv_score, Grid_search_knn.best_params_))
    pred=Grid_search_knn.predict(x_test)
    #print('classification_report on test set with best parameters:\n')
   # print(classification_report(y_test,pred))
  
    np.set_printoptions(precision=2)
   
    return Grid_search_knn.best_estimator_

In [66]:
baseline(GaussianNB(),X_train_arr,X_val_arr,y_train_n,y_val_n)
print('\n----------after hyperparameter tuning----------\n')
model_gnb = naivebayes(X_train_arr,X_val_arr,y_train_n,y_val_n)

CV score with default parameters:0.24514695262821093
classification_report on test set with default parameters:

              precision    recall  f1-score   support

           0       0.50      0.16      0.25    156852
           1       0.25      0.38      0.30    157533
           2       0.23      0.12      0.16    157054
           3       0.23      0.27      0.25    156753
           4       0.21      0.29      0.24    157280

    accuracy                           0.25    785472
   macro avg       0.28      0.25      0.24    785472
weighted avg       0.28      0.25      0.24    785472


----------after hyperparameter tuning----------

Mean CV score is: 0.24514695262821093
Fitting 10 folds for each of 100 candidates, totalling 1000 fits
The best parameter is:{'var_smoothing': 1e-07}
The best validation score is:0.2612253821002099
              precision    recall  f1-score   support

           0       0.50      0.16      0.25    156852
           1       0.25      0.38      0.

In [67]:
model_dt = baseline(DecisionTreeClassifier(),X_train_arr,X_val_arr,y_train_n,y_val_n)
print('\n----------after hyperparameter tuning----------\n')
#model_dt = decisionTree(X_train_arr,X_val_arr,y_train_n,y_val_n)

CV score with default parameters:0.9262289888887334
classification_report on test set with default parameters:

              precision    recall  f1-score   support

           0       0.95      0.95      0.95    156852
           1       0.95      0.96      0.95    157533
           2       0.92      0.93      0.92    157054
           3       0.91      0.91      0.91    156753
           4       0.92      0.91      0.91    157280

    accuracy                           0.93    785472
   macro avg       0.93      0.93      0.93    785472
weighted avg       0.93      0.93      0.93    785472


----------after hyperparameter tuning----------



In [None]:
#model_knn = baseline(KNeighborsClassifier(),X_train_arr,X_val_arr,y_train_n,y_val_n)
#print('\n----------after hyperparameter tuning----------\n')
#model_knn = KNN(X_train_arr,X_val_arr,y_train_n,y_val_n)

In [85]:
pickle.dump(model_gnb, open("model_gnb", 'wb'))
pickle.dump(model_dt, open("model_dt", 'wb'))
#pickle.dump(model_knn, open("model_knn", 'wb'))



model_gnb.predict(X_test_arr)

array([0, 4, 0, ..., 1, 1, 1])

In [None]:
loaded_model = pickle.load(open(filename, 'rb'))

In [70]:
print("Calculate")

res_ak = model.predict(X_test_arr)
res_gnb = model_gnb.predict(X_test_arr)
res_dt = model_dt.predict(X_test_arr)

Stacking_set = pd.DataFrame()
Stacking_set["res_ak"] = res_ak
Stacking_set["res_gnb"] = res_gnb
Stacking_set["res_dt"] = res_dt
Stacking_set["label"] = y_test

y_test

Calculate


NotFoundError: ./auto_model/best_pipeline; No such file or directory

In [None]:
print("jojo")