In [11]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import scipy as sc
import gc
import joblib

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', 250)
pd.set_option('display.min_rows', 15)

gc.collect()

524

In [12]:
def show_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB\n'.format(start_mem))

In [13]:
# Access "X" and "y" via split_X_y(df)["X"] & split_X_y(df)["y"]
def split_X_y(df):
    X = df.iloc[:,0:-1] # extracts all rows [:] and columns from 0 to next-to-last [0:-1]
    y = df.iloc[:,-1] # extracts all rows [:] and only last column [-1]  
    return {"X":X, "y":y}

In [20]:
def preprocess_df(df):    
    # pop and append 'week' at end of dataframe
    col = df.pop("week")
    df.insert(len(df.columns), col.name, col)
    
    print(df.info())
    return df

In [14]:
def train_xgb(X, y):
    X_train = X
    y_train = y
    
    classifier = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    model = classifier.fit(X_train, y_train)
    
    return model

In [15]:
def train_lgb(X, y):
    return

In [16]:
def predict_values(model, X_train, y_train, X_test, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # get accuracies
    model_train = accuracy_score(y_train, y_train_pred)
    model_test = accuracy_score(y_test, y_test_pred)

    print(f'\n XGboost train/test accuracies: '
         f'{model_train:.3f}/{model_test:.3f}')
    
    return {"train_pred":y_train_pred, "test_pred":y_test_pred}

In [17]:
def evaluate_pred(X, y, y_pred):
    # create dataframe from test-prediction with index from X_test
    df_y_pred = pd.DataFrame(y_pred, columns=['week_pred'], index=X.index, dtype=np.int8)

    # concatenate X, y, y_pred (put columns next to each other)
    df_eval = pd.concat([X, y, df_y_pred], axis=1)
    
    return df_eval

In [18]:
def execute_pipeline(train, test, train_method):
    train_pre = preprocess_df(train)
    test_pre  = preprocess_df(test)
    
    train_Xy = split_X_y(train_pre)
    test_Xy = split_X_y(test_pre)
    
    X_train = train_Xy["X"]
    y_train = train_Xy["y"]
    X_test = test_Xy["X"]
    y_test = test_Xy["y"]
    
    del train_pre
    del test_pre
    del train_Xy
    del test_Xy
    gc.collect()
    
    model = train_method(X_train, y_train)
    prediction = predict_values(model, X_train, y_train, X_test, y_test)
    
    return evaluate_pred(X_test, y_test, prediction["test_pred"])

# Preparation

In [29]:
# Read csv
csv_jun = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220613_01_dataset_w0-to-nxt-month_labeled_jun.csv'
csv_jul = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220613_02_dataset_w0-to-nxt-month_labeled_jul.csv'
csv_aug = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220613_03_dataset_w0-to-nxt-month_labeled_aug.csv'
csv_sep = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220613_04_dataset_w0-to-nxt-month_labeled_sep.csv'
csv_oct = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220613_05_dataset_w0-to-nxt-month_labeled_oct.csv'
csv_nov = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220613_06_dataset_w0-to-nxt-month_labeled_nov.csv'
csv_dec = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220613_07_dataset_w0-to-nxt-month_labeled_dec.csv'
csv_jan = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220613_08_dataset_w0-to-nxt-month_labeled_jan.csv'

csv_list = [csv_jun, csv_jul, csv_aug, csv_sep, csv_oct, csv_nov, csv_dec, csv_jan]

columns = ['userID', 
           'itemID', 
           'order', 
           'brand', 
           'feature_1', 
           'feature_2', 
           'feature_3', 
           'feature_4', 
           'feature_5', 
           'week']

dtype = {'userID':np.uint32,
          'itemID':np.uint32,
          'order':np.uint8,
          'brand':np.int16,
          'feature_1':np.int8,
          'feature_2':np.uint8,
          'feature_3':np.int16,
          'feature_4':np.int8,
          'feature_5':np.int16,
          'week':np.uint8}

dataframes = []
for i in range(0,6):
    dataframes.append(pd.read_csv(csv_list[i], usecols=columns, sep='|', dtype=dtype, nrows=None))

train = pd.concat([df for df in dataframes], axis=0)
test = pd.read_csv(csv_jan, usecols=columns, sep='|', dtype=dtype, nrows=None)

# Shrink dataframes with stratified labels

#train = split_X_y(train)
#test = split_X_y(test)

#X_train = train["X"]
#y_train = train["y"]
#X_test = test["X"]
#y_test = test["y"]

#X_train1, X_train2, y_train1, y_train2 = train_test_split(X_train, y_train, test_size=0.3, stratify=y_train)
#train = pd.concat([X_train1,y_train1], axis=1)

#X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.3, stratify=y_test)
#test = pd.concat([X_test1,y_test1], axis=1)

# Prediction

In [30]:
test_prediction = execute_pipeline(train, test, train_xgb)
test_prediction

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2453038 entries, 0 to 671267
Data columns (total 10 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   userID     uint32
 1   itemID     uint32
 2   order      uint8 
 3   brand      int16 
 4   feature_1  int8  
 5   feature_2  uint8 
 6   feature_3  int16 
 7   feature_4  int8  
 8   feature_5  int16 
 9   week       uint8 
dtypes: int16(3), int8(2), uint32(2), uint8(3)
memory usage: 63.2 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 902008 entries, 0 to 902007
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   userID     902008 non-null  uint32
 1   itemID     902008 non-null  uint32
 2   order      902008 non-null  uint8 
 3   brand      902008 non-null  int16 
 4   feature_1  902008 non-null  int8  
 5   feature_2  902008 non-null  uint8 
 6   feature_3  902008 non-null  int16 
 7   feature_4  902008 non-null  int8  
 8   feature_5  902008 non-

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,week,week_pred
0,38769,3477,1,186,6,0,196,0,45,0,0
1,42535,30474,1,193,10,3,229,3,132,0,0
2,42535,15833,1,1318,4,1,455,0,108,0,0
3,42535,20131,1,347,4,0,291,3,44,0,0
4,42535,4325,1,539,6,0,303,0,45,0,0
5,42535,12919,1,1338,10,0,26,0,39,0,0
6,29737,9139,1,703,10,0,413,3,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...
902001,37716,14623,1,219,6,0,536,3,46,4,0
902002,42120,28205,8,854,10,0,502,0,124,4,0


---

# Evaluation

In [34]:
prediction = test_prediction
df_eval = evaluate_pred(X_test, y_test, prediction["week_pred"])

In [63]:
df_eval.loc[(df_eval['week'] != 0)].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165324 entries, 736684 to 902007
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   userID     165324 non-null  uint32
 1   itemID     165324 non-null  uint32
 2   order      165324 non-null  uint8 
 3   brand      165324 non-null  int16 
 4   feature_1  165324 non-null  int8  
 5   feature_2  165324 non-null  uint8 
 6   feature_3  165324 non-null  int16 
 7   feature_4  165324 non-null  int8  
 8   feature_5  165324 non-null  int16 
 9   week       165324 non-null  uint8 
 10  week_pred  165324 non-null  int8  
dtypes: int16(3), int8(3), uint32(2), uint8(3)
memory usage: 4.4 MB


In [65]:
df_eval.loc[(df_eval['week'] == df_eval['week_pred']) & (df_eval['week'] != 0)].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 181 entries, 810362 to 901995
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userID     181 non-null    uint32
 1   itemID     181 non-null    uint32
 2   order      181 non-null    uint8 
 3   brand      181 non-null    int16 
 4   feature_1  181 non-null    int8  
 5   feature_2  181 non-null    uint8 
 6   feature_3  181 non-null    int16 
 7   feature_4  181 non-null    int8  
 8   feature_5  181 non-null    int16 
 9   week       181 non-null    uint8 
 10  week_pred  181 non-null    int8  
dtypes: int16(3), int8(3), uint32(2), uint8(3)
memory usage: 4.9 KB


In [72]:
print(f'{181/165324*100:.3f} %')

0.109 %


---