##### XGBoost on GPU

### Methods & Settings

In [22]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import gc
import joblib

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:75% !important; }</style>"))

pd.set_option('display.max_rows', 250)
pd.set_option('display.min_rows', 25)

####
# Plays sinus
def playSound():
    from IPython.lib.display import Audio
    framerate = 4410
    play_time_seconds = 3

    t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)
    audio_data = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*200*t)
    return Audio(audio_data, rate=framerate, autoplay=True) # plays 3sec sound, when done
    

####
# prints memory usage
def show_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB\n'.format(start_mem))
    return

####
# seperates features from label (y must be last column)
def sep_X_y(df_train, df_test):
    X_train = df_train.iloc[:,0:-1] # extracts all rows [:] and columns from 0 to next-to-last [0:-1]
    y_train = df_train.iloc[:,-1] # extracts all rows [:] and only last column [-1]
    X_test = df_test.iloc[:,0:-1]
    y_test = df_test.iloc[:,-1]
    
    return [X_train, y_train, X_test, y_test]

####
# split training and test set from given dataframe with dates as boundaries
def dt_train_test_split(df, dt_start_train, dt_end_train, dt_start_test, dt_end_test):
    print('Splitting dataframe...\n')
    # get indices from desired boundaries
    idx_start_train = df.date.searchsorted(pd.to_datetime(dt_start_train), side='left') # list needs to be sorted already for searchsorted
    idx_end_train = df.date.searchsorted(pd.to_datetime(dt_end_train) + pd.Timedelta(days=1), side='left')
    idx_start_test = df.date.searchsorted(pd.to_datetime(dt_start_test), side='left')
    idx_end_test = df.date.searchsorted(pd.to_datetime(dt_end_test) + pd.Timedelta(days=1), side='left')
    
    train = df.iloc[idx_start_train:idx_end_train]
    test = df.iloc[idx_start_test:idx_end_test]
    
    train.drop(columns=['date'], axis=0, inplace=True)
    test.drop(columns=['date'], axis=0, inplace=True)
    
    return sep_X_y(train, test)

####
# split training and test set from given dataframe with month as boundaries
def mth_train_test_split(df, mth_start_train, mth_end_train, mth_start_test, mth_end_test):
    print('Splitting dataframe...\n')
    
    # get indices from desired boundaries
    idx_start_train = df.month.searchsorted(mth_start_train, side='left') # list needs to be sorted already for searchsorted
    idx_end_train = df.month.searchsorted(mth_end_train + 1, side='left')
    idx_start_test = df.month.searchsorted(mth_start_test, side='left')
    idx_end_test = df.month.searchsorted(mth_end_test + 1, side='left')
    
    train = df.iloc[idx_start_train:idx_end_train]
    test = df.iloc[idx_start_test:idx_end_test]
    
    #train.drop(columns=['date'], axis=0, inplace=True)
    #test.drop(columns=['date'], axis=0, inplace=True)
    
    return sep_X_y(train, test)

####
# trains XGB model (classifier)
def train_xgb(X, y):
    X_train = X
    y_train = y
    
    print('Fitting model...\n')
    model = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    fitted_model = model.fit(X_train, y_train)
    
    print('Plotting feature importance for "gain".')
    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
    xgb.plot_importance(model, importance_type='gain')
    plt.show()
    
    # GRAPHVIZ (software + pip package) needed for tree plotting
    #fig, ax = plt.subplots(figsize=(30, 30))
    #xgb.plot_tree(model, num_trees=0, ax=ax, rankdir='LR')
    #plt.show()
    
    return fitted_model

####
# predicts labels of training and test with given model
def predict_values(model, X_train, y_train, X_test, y_test):
    print('Predicting values...\n')
    # predict y values
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # get accuracies
    model_train = accuracy_score(y_train, y_train_pred)
    model_test = accuracy_score(y_test, y_test_pred)

    # print info about accuracies
    print(f'\n XGboost train/test accuracies: '
         f'{model_train:.3f}/{model_test:.3f}')
    
    # return predicted values
    return [y_train_pred, y_test_pred]

####
# concatenates prediction with actual target for evaluation
def evaluate_pred(X, y, y_pred):
    # create dataframe from test-prediction with index from X_test
    df_y_pred = pd.DataFrame(y_pred, columns=['nextBuyIn_pred'], index=X.index, dtype=np.int8)

    # concatenate X, y, y_pred (put columns next to each other)
    df_eval = pd.concat([X, y, df_y_pred], axis=1)
    
    return df_eval

####
# executes all needed functions of the above with given training and test data and provided train method
def execute_pipeline(train_method, df, list_of_four_df_boundaries):
    b = list_of_four_df_boundaries
    # split dataframe in train/test and X/y
    X_train, y_train, X_test, y_test = dt_train_test_split(df, b[0], b[1], b[2], b[3])
    
    #train model
    model = train_method(X_train, y_train)    
    
    # make predictions
    pred_train, pred_test = predict_values(model, X_train, y_train, X_test, y_test)
    
    print('\nExecuted pipeline.\nEvaluate with "evaluate_pred(X, y, y_pred)"\n')
    return [pred_train, pred_test, X_train, y_train, X_test, y_test]

In [23]:
%%time
cols_item_feat = [
    'itemID',
    'brand',
    'feature_1',
    'feature_2',
    'feature_3',
    'feature_4',
    'feature_5',
    #'brandOrderRatio',
    #'feature1OrderRatio',
    #'feature2OrderRatio',
    #'feature3OrderRatio',
    #'feature4OrderRatio',
    #'feature5OrderRatio',
    #'TotalBFscore',
    #'RCP'
]

cols_user_feat = [
    'date',
    'userID',
    'itemID',
    'order',
    #'TotalItemOrders(user)',
    'MeanDiffToNxt(user)'
]

base = r'C:\Users\LEAND\Coding\Jupyter Notebooks\parquet\01_every_day_all-possible-submiss-purchases_labeled.parquet'
df_base = pd.read_parquet(base, engine='pyarrow')
df_base

CPU times: total: 562 ms
Wall time: 140 ms


Unnamed: 0,date,userID,itemID,week,year,month,dayOfMonth,weekOfYear,dayOfYear
0,2020-06-01,0,20664,0,2020,6,1,23,153
1,2020-06-01,0,28231,0,2020,6,1,23,153
2,2020-06-01,13,2690,0,2020,6,1,23,153
3,2020-06-01,15,1299,0,2020,6,1,23,153
4,2020-06-01,15,20968,0,2020,6,1,23,153
5,2020-06-01,20,8272,0,2020,6,1,23,153
6,2020-06-01,24,11340,0,2020,6,1,23,153
7,2020-06-01,34,21146,0,2020,6,1,23,153
8,2020-06-01,34,31244,0,2020,6,1,23,153
9,2020-06-01,46,31083,0,2020,6,1,23,153


In [24]:
# %%time

# df_base['year'] = df_base['date'].dt.year
# df_base['month'] = df_base['date'].dt.month
# df_base['dayOfMonth'] = df_base['date'].dt.day
# df_base['weekOfYear'] = df_base['date'].dt.weekofyear
# df_base['dayOfYear'] = df_base['date'].dt.dayofyear

In [25]:
# p_base = r'C:\Users\LEAND\Coding\Jupyter Notebooks\parquet\01_every_day_all-possible-submiss-purchases_labeled.parquet'
# df_base.to_parquet(p_base, engine='pyarrow', compression='snappy', index=None)

In [14]:
# feat_train = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220625_complete_feature-list_orderhistory_unlabeled_for-train-jun-dec.csv'
# feat_test = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220625_complete_feature-list_orderhistory_unlabeled_for-test-jan.csv'

# df_u_feat_train = pd.read_csv(feat_train, usecols=cols_user_feat, sep='|', converters={'date':pd.to_datetime}) # features that need to be joined on user
# df_i_feat_train = pd.read_csv(feat_train, usecols=cols_item_feat, sep='|', converters={'date':pd.to_datetime}) # features that need to be joined on item

# df_u_feat_test = pd.read_csv(feat_test, usecols=cols_user_feat, sep='|', converters={'date':pd.to_datetime}) # features that need to be joined on user
# df_i_feat_test = pd.read_csv(feat_test, usecols=cols_item_feat, sep='|', converters={'date':pd.to_datetime}) # features that need to be joined on item

# # --------------------------

u_feat_train = r'C:\Users\LEAND\Coding\Jupyter Notebooks\parquet\02_featureList_user_train.parquet'
i_feat_train = r'C:\Users\LEAND\Coding\Jupyter Notebooks\parquet\03_featureList_item_train.parquet'
u_feat_test = r'C:\Users\LEAND\Coding\Jupyter Notebooks\parquet\04_featureList_user_test.parquet'
i_feat_test = r'C:\Users\LEAND\Coding\Jupyter Notebooks\parquet\05_featureList_item_test.parquet'

df_u_feat_train = pd.read_parquet(u_feat_train, engine='pyarrow') # features that need to be joined on user
df_i_feat_train = pd.read_parquet(i_feat_train, engine='pyarrow') # features that need to be joined on item

df_u_feat_test = pd.read_parquet(u_feat_test, engine='pyarrow') # features that need to be joined on user
df_i_feat_test = pd.read_parquet(i_feat_test, engine='pyarrow') # features that need to be joined on item

In [15]:
# p_df_u_feat_train = r'C:\Users\LEAND\Coding\Jupyter Notebooks\parquet\02_featureList_user_train.parquet'
# p_df_i_feat_train = r'C:\Users\LEAND\Coding\Jupyter Notebooks\parquet\03_featureList_item_train.parquet'
# p_df_u_feat_test = r'C:\Users\LEAND\Coding\Jupyter Notebooks\parquet\04_featureList_user_test.parquet'
# p_df_i_feat_test = r'C:\Users\LEAND\Coding\Jupyter Notebooks\parquet\05_featureList_item_test.parquet'


# df_u_feat_train.to_parquet(p_df_u_feat_train, engine='pyarrow', compression='snappy', index=None)
# df_i_feat_train.to_parquet(p_df_i_feat_train, engine='pyarrow', compression='snappy', index=None)
# df_u_feat_test.to_parquet(p_df_u_feat_test, engine='pyarrow', compression='snappy', index=None)
# df_i_feat_test.to_parquet(p_df_i_feat_test, engine='pyarrow', compression='snappy', index=None)

In [26]:
idx_start_train = df_base.date.searchsorted(pd.to_datetime('2020-06-01'), side='left') # list needs to be sorted already for searchsorted
idx_end_train = df_base.date.searchsorted(pd.to_datetime('2020-12-31') + pd.Timedelta(days=1), side='left')
idx_start_test = df_base.date.searchsorted(pd.to_datetime('2021-01-01'), side='left')
idx_end_test = df_base.date.searchsorted(pd.to_datetime('2021-01-31') + pd.Timedelta(days=1), side='left')

train = df_base.iloc[idx_start_train:idx_end_train]
test = df_base.iloc[idx_start_test:idx_end_test]

In [28]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310004 entries, 2140029 to 2450032
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   date        310004 non-null  datetime64[ns]
 1   userID      310004 non-null  int64         
 2   itemID      310004 non-null  int64         
 3   week        310004 non-null  int64         
 4   year        310004 non-null  int64         
 5   month       310004 non-null  int64         
 6   dayOfMonth  310004 non-null  int64         
 7   weekOfYear  310004 non-null  int64         
 8   dayOfYear   310004 non-null  int64         
dtypes: datetime64[ns](1), int64(8)
memory usage: 21.3 MB


In [29]:
# train.to_csv(r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220626_train_no-feat.csv', sep='|', index=False)
# test.to_csv(r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220626_test_no-feat.csv', sep='|', index=False)

In [18]:
orders_train = df_u_feat_train.copy()
orders_train.drop([#'TotalItemOrders(user)',
    'MeanDiffToNxt(user)'], axis=1, inplace=True)

df_u_feat_train.drop(['date','order'], axis=1, inplace=True)

train = train.merge(df_u_feat_train, how='left', on=['userID', 'itemID'])
train = train.merge(orders_train, how='left', on=['date', 'userID', 'itemID'])
train.fillna(0)

del df_u_feat_train
del orders_train
gc.collect()

###
# df_i_feat_train.drop(['brandOrderRatio',
#     'feature1OrderRatio',
#     'feature2OrderRatio',
#     'feature3OrderRatio',
#     'feature4OrderRatio',
#     'feature5OrderRatio',
#     'TotalBFscore'], axis=1, inplace=True)
###

train = train.merge(df_i_feat_train, how='left', on='itemID')

train

MemoryError: Unable to allocate 34.1 GiB for an array with shape (4583279154,) and data type int64

Unnamed: 0,date,userID,itemID,week,year,month,dayOfMonth,weekOfYear,dayOfYear,TotalItemOrders(user),MeanDiffToNxt(user),order
1369,2020-06-01,2467,28140,1,2020,6,1,23,153,4.0,62.000000,2.0
1370,2020-06-01,2467,28140,1,2020,6,1,23,153,4.0,62.000000,2.0
1893,2020-06-01,3298,23810,1,2020,6,1,23,153,6.0,77.500000,2.0
1894,2020-06-01,3298,23810,1,2020,6,1,23,153,6.0,77.500000,2.0
1895,2020-06-01,3298,23810,1,2020,6,1,23,153,6.0,77.500000,2.0
2096,2020-06-01,3676,2329,1,2020,6,1,23,153,21.0,35.166667,2.0
2097,2020-06-01,3676,2329,1,2020,6,1,23,153,21.0,35.166667,2.0
2098,2020-06-01,3676,2329,1,2020,6,1,23,153,21.0,35.166667,2.0
2099,2020-06-01,3676,2329,1,2020,6,1,23,153,21.0,35.166667,2.0
2100,2020-06-01,3676,2329,1,2020,6,1,23,153,21.0,35.166667,2.0


KeyError: "['date', 'order'] not found in axis"

In [20]:
X_train, y_train, X_test, y_test = mth_train_test_split(df, 6, 10, 11, 11)

Splitting dataframe...



In [25]:
save_Xtrain = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220621_02_everyCombinationPerDay_basicFeatures_labeled_XTrain.parquet'
save_Xtest = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220621_02_everyCombinationPerDay_basicFeatures_labeled_XTest.parquet'
save_ytrain = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220621_02_everyCombinationPerDay_basicFeatures_labeled_yTrain.parquet'
save_ytest = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220621_02_everyCombinationPerDay_basicFeatures_labeled_yTest.parquet'

X_train.to_parquet(save_Xtrain, engine='pyarrow', compression='snappy', index=None)
pd.DataFrame(y_train).to_parquet(save_ytrain, engine='pyarrow', compression='snappy', index=None)
X_test.to_parquet(save_Xtest, engine='pyarrow', compression='snappy', index=None)
pd.DataFrame(y_test).to_parquet(save_ytest, engine='pyarrow', compression='snappy', index=None)

In [26]:
del df
gc.collect()

1763

---

---

# Predicting Weeks

## Training & Prediction

Pipeline needs training method, dataframe and dates to split dataframe in training and test set.

In [28]:
%%time
model = train_xgb(X_train, y_train)

Fitting model...



XGBoostError: bad allocation

In [None]:
pred_train, pred_test = predict_values(model, X_train, y_train, X_test, y_test)

## Evaluation

### train set

In [None]:
df_eval_train = evaluate_pred(X_train, y_train, pred_train)
df_eval_train.head(10)

### test set

In [None]:
df_eval_test = evaluate_pred(X_test, y_test, pred_test)
df_eval_test.head(10)

In [None]:
rowcount = len(df_eval_test)
should = len(df_eval_test.loc[(df_eval_test.nextBuyInWeeks != 0)])
is_ = len(df_eval_test.loc[(df_eval_test.nextBuyInWeeks != 0) & (df_eval_test.nextBuyInWeeks == df_eval_test.nextBuyIn_pred)]) 

print(f'row count of set:\t\t\t\t\t {rowcount}')
print(f'rows where label is not 0:\t\t\t\t {should} \t ({should/rowcount*100:.3f} % of all rows in set)')
print(f'rows where label was predicted correctly AND not 0:\t {is_} \t ({is_/should*100:.3f} % of rows where label is actually not 0)')

---

In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import gc
import joblib

pd.set_option('display.max_rows', 250)
pd.set_option('display.min_rows', 25)

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
%%time
cols_item_feat = [
    'itemID',
    'brand',
    'feature_1',
    'feature_2',
    'feature_3',
    'feature_4',
    'feature_5',
    #'brandOrderRatio',
    #'feature1OrderRatio',
    #'feature2OrderRatio',
    #'feature3OrderRatio',
    #'feature4OrderRatio',
    #'feature5OrderRatio',
    #'TotalBFscore',
    #'RCP'
]

cols_user_feat = [
    'date',
    'userID',
    'itemID',
    'order',
    #'TotalItemOrders(user)',
    'MeanDiffToNxt(user)'
]

base = r'C:\Users\LEAND\Coding\Jupyter Notebooks\parquet\01_every_day_all-possible-submiss-purchases_labeled.parquet'
df = pd.read_parquet(base, engine='pyarrow')
df

CPU times: total: 359 ms
Wall time: 174 ms


Unnamed: 0,date,userID,itemID,week,year,month,dayOfMonth,weekOfYear,dayOfYear
0,2020-06-01,0,20664,0,2020,6,1,23,153
1,2020-06-01,0,28231,0,2020,6,1,23,153
2,2020-06-01,13,2690,0,2020,6,1,23,153
3,2020-06-01,15,1299,0,2020,6,1,23,153
4,2020-06-01,15,20968,0,2020,6,1,23,153
5,2020-06-01,20,8272,0,2020,6,1,23,153
6,2020-06-01,24,11340,0,2020,6,1,23,153
7,2020-06-01,34,21146,0,2020,6,1,23,153
8,2020-06-01,34,31244,0,2020,6,1,23,153
9,2020-06-01,46,31083,0,2020,6,1,23,153
