In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import scipy as sc
import gc
import joblib

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', 250)
pd.set_option('display.min_rows', 15)

In [2]:
def show_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB\n'.format(start_mem))

In [3]:
# Access "X" and "y" via split_X_y(df)["X"] & split_X_y(df)["y"]
def split_X_y(df):
    X = df.iloc[:,0:-1] # extracts all rows [:] and columns from 0 to next-to-last [0:-1]
    y = df.iloc[:,-1] # extracts all rows [:] and only last column [-1]  
    return {"X":X, "y":y}

In [4]:
# Read csv
csv_jun = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\08_datasets_monthly_split_w0_to_nxt_month_labeled\220613_01_dataset_w0-to-nxt-month_labeled_jun.csv'
csv_jul = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\08_datasets_monthly_split_w0_to_nxt_month_labeled\220613_02_dataset_w0-to-nxt-month_labeled_jul.csv'
csv_aug = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\08_datasets_monthly_split_w0_to_nxt_month_labeled\220613_03_dataset_w0-to-nxt-month_labeled_aug.csv'
csv_sep = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\08_datasets_monthly_split_w0_to_nxt_month_labeled\220613_04_dataset_w0-to-nxt-month_labeled_sep.csv'
csv_oct = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\08_datasets_monthly_split_w0_to_nxt_month_labeled\220613_05_dataset_w0-to-nxt-month_labeled_oct.csv'
csv_nov = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\08_datasets_monthly_split_w0_to_nxt_month_labeled\220613_06_dataset_w0-to-nxt-month_labeled_nov.csv'
csv_dec = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\08_datasets_monthly_split_w0_to_nxt_month_labeled\220613_07_dataset_w0-to-nxt-month_labeled_dec.csv'
csv_jan = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\08_datasets_monthly_split_w0_to_nxt_month_labeled\220613_08_dataset_w0-to-nxt-month_labeled_jan.csv'

csv_list = [csv_jun, csv_jul, csv_aug, csv_sep, csv_oct, csv_nov, csv_dec, csv_jan]

columns = ['userID', 
           'itemID', 
           'order', 
           'brand', 
           'feature_1', 
           'feature_2', 
           'feature_3', 
           'feature_4', 
           'feature_5',
           'categories', 
           'week']

dtype = {'userID':np.uint32,
          'itemID':np.uint32,
          'order':np.uint8,
          'brand':np.int16,
          'feature_1':np.int8,
          'feature_2':np.uint8,
          'feature_3':np.int16,
          'feature_4':np.int8,
          'feature_5':np.int16,
          'week':np.uint8}

nov = pd.read_csv(csv_nov, usecols=columns, sep='|', dtype=dtype, nrows=None)
dec = pd.read_csv(csv_dec, usecols=columns, sep='|', dtype=dtype, nrows=None)
jan = pd.read_csv(csv_jan, usecols=columns, sep='|', dtype=dtype, nrows=None)

train = pd.concat([nov, dec], axis=0)
test = jan

# Preprocessing

In [5]:
# Shrink dataframes with stratified labels

train = split_X_y(train)
test = split_X_y(test)

X_train = train["X"]
y_train = train["y"]
X_test = test["X"]
y_test = test["y"]

X_train1, X_train2, y_train1, y_train2 = train_test_split(X_train, y_train, test_size=0.5, stratify=y_train)
train = pd.concat([X_train1,y_train1], axis=1)

X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test)
test = pd.concat([X_test1,y_test1], axis=1)

In [6]:
def preprocess_df(df):
    # convert string to list of integers in 'categories'
    df["categories"] = df["categories"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    
    # add fake row with all categories from 0 to 4299 to later have all columns in multi-hot-encoding
    df.loc[len(df)] = [424242,424242,42, 42, 0, 0, 0, 0, 0, [cat for cat in range(0,4300)], 5]
    df.index = df.index + 1  # add index
    
    # multi-hot-encode categories
    cats = df["categories"]
    mlb = MultiLabelBinarizer(sparse_output=False) # Set to True if output binary array is desired in CSR sparse format
    df_multi_hot = pd.DataFrame(mlb.fit_transform(cats), columns=mlb.classes_, index=df.index, dtype=np.int8).astype(pd.SparseDtype(np.uint8,0))
    
    # join new binarized columns with rest of dataframe
    df = df.join(df_multi_hot, how='inner')
    if (len(df[df.isnull().any(axis=1)]) > 0):
        raise RuntimeError('Join of multi-hot-encoded categories probably created missing values.')
    df.drop(df.tail(1).index,inplace=True) # drop fake row
    
    # drop list of categories, since it's not needed anymore
    df.drop('categories', axis=1, inplace=True)
    
    # pop and append 'week' at end of dataframe
    col = df.pop("week")
    df.insert(len(df.columns), col.name, col)
    
    print(df.info())
    return df

# Modeling

In [7]:
def train_xgb(X, y):
    X_train = X
    y_train = y
    
    classifier = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    model = classifier.fit(X_train, y_train)
    
    return model

In [8]:
def train_lgb(X, y):
    return

In [9]:
def predict_values(model, X_train, y_train, X_test, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # get accuracies
    model_train = accuracy_score(y_train, y_train_pred)
    model_test = accuracy_score(y_test, y_test_pred)

    print(f'\n XGboost train/test accuracies: '
         f'{model_train:.3f}/{model_test:.3f}')
    
    return {"train_pred":y_train_pred, "test_pred":y_test_pred}

In [10]:
def evaluate_pred(X, y, y_pred):
    # create dataframe from test-prediction with index from X_test
    df_y_pred = pd.DataFrame(y_pred, columns=['week_pred'], index=X.index, dtype=np.int8)

    # concatenate X, y, y_pred (put columns next to each other)
    df_eval = pd.concat([X, y, df_y_pred], axis=1)
    
    return df_eval

In [11]:
def execute_pipeline(train, test, train_method):
    train_pre = preprocess_df(train)
    test_pre  = preprocess_df(test)
    
    train_Xy = split_X_y(train_pre)
    test_Xy = split_X_y(test_pre)
    
    X_train = train_Xy["X"]
    y_train = train_Xy["y"]
    X_test = test_Xy["X"]
    y_test = test_Xy["y"]
    
    model = train_method(X_train, y_train)
    prediction = predict_values(model, X_train, y_train, X_test, y_test)
    
    return evaluate_pred(X_test, y_test, prediction["test_pred"])

In [12]:
test_prediction = execute_pipeline(train, test, train_xgb)
test_prediction

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1060436 entries, 1 to 778133
Columns: 4310 entries, userID to week
dtypes: Sparse[int32, 0](4300), int16(3), int8(2), uint32(2), uint8(3)
memory usage: 78.5 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 451003 entries, 168028 to 475683
Columns: 4310 entries, userID to week
dtypes: Sparse[int32, 0](4300), int16(3), int8(2), uint32(2), uint8(3)
memory usage: 33.1 MB
None

 XGboost train/test accuracies: 0.835/0.817


Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,0,...,4292,4293,4294,4295,4296,4297,4298,4299,week,week_pred
168028,41021,1970,1,703,4,0,291,0,44,0,...,0,0,0,0,0,0,0,0,0,0
105185,7134,6169,1,1306,10,0,421,3,3,0,...,0,0,0,0,0,0,0,0,0,0
540295,43110,12500,2,389,4,0,474,0,-1,0,...,0,0,0,0,0,0,0,0,0,0
646731,35709,27433,1,772,10,0,503,0,17,0,...,0,0,0,0,0,0,0,0,0,0
307968,881,14486,1,1126,10,3,291,0,139,0,...,0,0,0,0,0,0,0,0,0,0
853669,36828,9895,1,484,10,2,503,0,17,0,...,0,0,0,0,0,0,0,0,4,0
456117,30895,19011,1,147,10,0,503,3,9,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487271,40225,26545,1,225,4,0,395,0,163,0,...,0,0,0,0,0,0,0,0,0,0
289585,13851,29820,1,539,10,0,513,0,45,0,...,0,0,0,0,0,0,0,0,0,0


---

In [13]:
X_test

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories
0,38769,3477,1,186,6,0,196,0,45,"[74, 4109, 3867, 803, 4053]"
1,42535,30474,1,193,10,3,229,3,132,"[3459, 3738, 679, 1628, 4072]"
2,42535,15833,1,1318,4,1,455,0,108,"[2973, 2907, 2749, 3357]"
3,42535,20131,1,347,4,0,291,3,44,"[30, 1515, 1760, 2932, 1287, 2615, 3727, 2450,..."
4,42535,4325,1,539,6,0,303,0,45,"[3104, 1772, 2029, 1274, 3915, 888, 1118, 3882..."
5,42535,12919,1,1338,10,0,26,0,39,"[813, 3949, 3961]"
6,29737,9139,1,703,10,0,413,3,3,"[626, 1995, 2896, 1605, 564, 3510, 1389, 2112,..."
...,...,...,...,...,...,...,...,...,...,...
902001,37716,14623,1,219,6,0,536,3,46,"[3078, 777, 2389, 3897]"
902002,42120,28205,8,854,10,0,502,0,124,"[3262, 3639]"


In [14]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(y_train)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [15]:
y_test

0         0
1         0
2         0
3         0
4         0
5         0
6         0
         ..
902001    4
902002    4
902003    4
902004    4
902005    4
902006    4
902007    4
Name: week, Length: 902008, dtype: uint8

In [16]:
eval = pd.DataFrame(y_test)

In [19]:
eval.loc[eval.week != 0]

Unnamed: 0,week
736684,1
736685,1
736686,1
736687,1
736688,1
736689,1
736690,1
...,...
902001,4
902002,4


# Manual execution w/o pipeline

In [18]:
train_pre = preprocess_df(train)
test_pre  = preprocess_df(test)

AttributeError: 'list' object has no attribute 'split'

In [None]:
train_Xy = split_X_y(train_pre)
test_Xy = split_X_y(test_pre)

X_train = train_Xy["X"]
y_train = train_Xy["y"]
X_test = test_Xy["X"]
y_test = test_Xy["y"]

In [None]:
%%time
del train_Xy
del test_Xy
gc.collect()
model = train_xgb(X_train, y_train)

In [None]:
prediction = predict_values(model, X_train, y_train, X_test, y_test)

In [None]:
evaluate_pred(X_test, y_test, prediction["test_pred"])

---