In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import scipy as sc
import gc
import joblib

import xgboost as xgb
from xgboost import XGBClassifier

import lightgbm as lgb

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Notebook-wide pandas display settings:
# frames with up to 250 rows are rendered in full ...
pd.set_option('display.max_rows', 250)
# ... longer ones are truncated to 15 visible rows.
pd.set_option('display.min_rows', 15)

In [2]:
def show_mem_usage(df):
    """Print the memory footprint of ``df``.

    The value is the sum of ``df.memory_usage()`` divided by 1024**2 and is
    printed with two decimals (labelled "MB"). Nothing is returned.
    """
    total_bytes = df.memory_usage().sum()
    report = 'Memory usage of dataframe is {:.2f} MB\n'.format(total_bytes / 1024**2)
    print(report)

In [3]:
# Read csv
# All monthly exports live in one folder; change DATA_DIR here when the data moves.
DATA_DIR = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv'

csv_jun = DATA_DIR + r'\220613_01_dataset_w0-to-nxt-month_labeled_jun.csv'
csv_jul = DATA_DIR + r'\220613_02_dataset_w0-to-nxt-month_labeled_jul.csv'
csv_aug = DATA_DIR + r'\220613_03_dataset_w0-to-nxt-month_labeled_aug.csv'
csv_sep = DATA_DIR + r'\220613_04_dataset_w0-to-nxt-month_labeled_sep.csv'
csv_oct = DATA_DIR + r'\220613_05_dataset_w0-to-nxt-month_labeled_oct.csv'
csv_nov = DATA_DIR + r'\220613_06_dataset_w0-to-nxt-month_labeled_nov.csv'
csv_dec = DATA_DIR + r'\220613_07_dataset_w0-to-nxt-month_labeled_dec.csv'
csv_jan = DATA_DIR + r'\220613_08_dataset_w0-to-nxt-month_labeled_jan.csv'

csv_list = [csv_jun, csv_jul, csv_aug, csv_sep, csv_oct, csv_nov, csv_dec, csv_jan]

# Columns to load from each file.
columns = ['userID',
           'itemID',
           'order',
           'brand',
           'feature_1',
           'feature_2',
           'feature_3',
           'feature_4',
           'feature_5',
           'categories',
           'week']

# Narrow integer dtypes for the numeric columns. 'categories' has no entry on
# purpose: it is read as text and parsed into a list later by preprocess_df.
dtype = {'userID':np.uint32,
          'itemID':np.uint32,
          'order':np.uint8,
          'brand':np.int16,
          'feature_1':np.int8,
          'feature_2':np.uint8,
          'feature_3':np.int16,
          'feature_4':np.int8,
          'feature_5':np.int16,
          'week':np.uint8}

nov = pd.read_csv(csv_nov, usecols=columns, sep='|', dtype=dtype, nrows=None)
dec = pd.read_csv(csv_dec, usecols=columns, sep='|', dtype=dtype, nrows=None)

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# both months keep their own 0-based labels, and the duplicated labels break
# every later index-based join/alignment (rows get cross-multiplied).
train = pd.concat([nov, dec], axis=0, ignore_index=True)

# NOTE(review): the test split is read from the June file while training uses
# November + December, and csv_jan is defined but never loaded -- confirm that
# June is really the intended hold-out month.
test = pd.read_csv(csv_jun, usecols=columns, sep='|', dtype=dtype, nrows=None)

# Preprocessing

In [4]:
def preprocess_df(df, n_categories=4300):
    """Multi-hot-encode ``df['categories']`` and move ``week`` to the last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``categories`` column holding strings such as
        ``"[3, 17, 42]"`` and a ``week`` column. The frame is NOT modified,
        so the function can safely be called more than once on the same input.
    n_categories : int, default 4300
        Category ids ``0 .. n_categories-1`` always get a column, even when
        they never occur in ``df``. Ids outside that range that do occur
        still get their own column.

    Returns
    -------
    pandas.DataFrame
        All input columns except ``categories`` and ``week`` (original order),
        followed by one sparse ``uint8`` indicator column per category id,
        followed by ``week``. Row labels are the input labels shifted by +1,
        which is what the previous implementation produced.

    Raises
    ------
    RuntimeError
        If any of the non-category columns contains a missing value.
    """
    # Parse "[1, 2, 3]" -> [1, 2, 3] into a NEW Series; writing the result back
    # into df would change the caller's frame and make a second call fail.
    # Empty fragments are skipped so that "[]" yields an empty list.
    parsed = df["categories"].apply(
        lambda x: [int(i) for i in x[1:-1].split(',') if i.strip()]
    )

    # Append one sentinel label list containing every expected category so the
    # binarizer always emits the full set of columns. The sentinel lives only in
    # this list (never in a DataFrame) and its row is sliced off right after.
    labels = parsed.tolist()
    labels.append(list(range(n_categories)))

    # Sparse output avoids materialising a dense (rows x n_categories) matrix.
    mlb = MultiLabelBinarizer(sparse_output=True)
    encoded = mlb.fit_transform(labels)[:-1].astype(np.uint8)
    df_multi_hot = pd.DataFrame.sparse.from_spmatrix(encoded, columns=mlb.classes_)

    # Combine by position rather than by an index join: a join on labels
    # cross-multiplies rows whenever the input index has duplicates
    # (e.g. after pd.concat without ignore_index=True).
    feature_cols = df.drop(columns=["categories", "week"]).reset_index(drop=True)
    week = df["week"].reset_index(drop=True)

    # Exception type and message are kept verbatim for existing callers.
    if feature_cols.isnull().any().any() or week.isnull().any():
        raise RuntimeError('Join of multi-hot-encoded categories probably created missing values.')

    result = pd.concat([feature_cols, df_multi_hot, week], axis=1)
    result.index = df.index + 1  # same +1 label shift as before

    # DataFrame.info() prints by itself and returns None, so it is not wrapped in print().
    result.info()
    return result

### Datatypes for XGBoost

XGBoost natively supports continuous data but not categorical data. To use categorical data with XGBoost, we have to apply one-hot encoding, which converts a column of categorical values into multiple columns of binary values.

# Modeling

In [5]:
# Access "X" and "y" via split_X_y(df)["X"] & split_X_y(df)["y"]
def split_X_y(df):
    """Split ``df`` into features and target by column position.

    Returns a dict with two entries:
    ``"X"`` -- every column except the last one,
    ``"y"`` -- the last column only.
    """
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]
    return dict(X=features, y=target)

In [6]:
def train_xgb(X, y):
    """Fit an ``XGBClassifier`` on ``X`` / ``y`` and return the fitted model.

    The classifier is created with ``tree_method='gpu_hist'`` and ``gpu_id=0``;
    the return value is whatever ``XGBClassifier.fit`` returns.
    """
    booster = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    return booster.fit(X, y)

In [7]:
def train_lgb(X, y):
    """Stub: ignores ``X`` and ``y`` and returns ``None``.

    Presumably reserved for a LightGBM counterpart of ``train_xgb`` -- TODO implement.
    """
    return None

In [8]:
def predict_values(model, X_train, y_train, X_test, y_test):
    """Predict on the train and test features and print both accuracies.

    ``model.predict`` is called on ``X_train`` first, then on ``X_test``.
    Accuracy against ``y_train`` / ``y_test`` is printed with three decimals.

    Returns a dict with the raw predictions under ``"train_pred"`` and
    ``"test_pred"``.
    """
    predictions = {
        "train_pred": model.predict(X_train),
        "test_pred": model.predict(X_test),
    }

    # score each split against its ground truth
    train_acc = accuracy_score(y_train, predictions["train_pred"])
    test_acc = accuracy_score(y_test, predictions["test_pred"])

    print(f'\n XGboost train/test accuracies: '
          f'{train_acc:.3f}/{test_acc:.3f}')

    return predictions

In [9]:
def evaluate_pred(X, y, y_pred):
    """Return ``X``, ``y`` and the predictions side by side in one DataFrame.

    ``y_pred`` is wrapped in a single ``int8`` column named ``week_pred`` that
    reuses ``X``'s index, so the three pieces line up row by row.
    """
    predicted = pd.DataFrame(
        data=y_pred,
        index=X.index,
        columns=['week_pred'],
        dtype=np.int8,
    )
    return pd.concat((X, y, predicted), axis='columns')

In [10]:
def execute_pipeline(train, test, train_method):
    """Run preprocessing, training, prediction and evaluation in one call.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw frames; each is passed through ``preprocess_df``.
    train_method : callable
        Called as ``train_method(X_train, y_train)`` and expected to return a
        fitted model with a ``predict`` method (e.g. ``train_xgb``).

    Returns
    -------
    pandas.DataFrame
        The result of ``evaluate_pred`` for the test split: test features,
        true target and the model's test predictions side by side.
    """
    train_pre = preprocess_df(train)
    test_pre  = preprocess_df(test)

    train_Xy = split_X_y(train_pre)
    test_Xy = split_X_y(test_pre)

    X_train = train_Xy["X"]
    y_train = train_Xy["y"]
    X_test = test_Xy["X"]
    y_test = test_Xy["y"]

    model = train_method(X_train, y_train)
    prediction = predict_values(model, X_train, y_train, X_test, y_test)

    # predict_values returns a dict; the test predictions live under
    # "test_pred" (the previously used name `y_pred` was never defined here).
    return evaluate_pred(X_test, y_test, prediction["test_pred"])

---

In [11]:
# Encode both splits with the same preprocessing routine.
train_pre = preprocess_df(df=train)
test_pre = preprocess_df(df=test)

KeyboardInterrupt: 

In [24]:
# Separate features (all but the last column) from the target (last column).
train_Xy, test_Xy = split_X_y(train_pre), split_X_y(test_pre)

X_train, y_train = train_Xy["X"], train_Xy["y"]
X_test, y_test = test_Xy["X"], test_Xy["y"]

In [35]:
%%time
del train_Xy
del test_Xy
gc.collect()
model = train_model(X_train, y_train)

MemoryError: Unable to allocate 89.6 GiB for an array with shape (4309, 2791939) and data type int64

In [None]:
# Predict on both splits and print train/test accuracy; keeps the raw predictions.
prediction = predict_values(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# `y_pred` was never assigned in this notebook; the test predictions are stored
# in the dict returned by predict_values under the "test_pred" key.
evaluate_pred(X_test, y_test, prediction["test_pred"])

---