# XGBoost on GPU

### Methods & Settings

In [18]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import gc
import joblib

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:75% !important; }</style>"))

# Notebook-wide pandas display settings:
# frames with up to 250 rows are rendered in full ...
pd.set_option('display.max_rows', 250)
# ... and longer frames are truncated to 25 visible rows.
pd.set_option('display.min_rows', 25)

####
# plays a short two-tone sine sound
def playSound():
    """Create an autoplaying IPython ``Audio`` object with a 3-second tone.

    The signal is the sum of a 300 Hz and a 200 Hz sine wave, sampled at
    4410 samples per second over 3 seconds.

    Returns:
        The ``IPython.lib.display.Audio`` object, created with ``autoplay=True``.
    """
    from IPython.lib.display import Audio

    sample_rate = 4410
    duration_s = 3

    # One time stamp per sample over the whole clip.
    timeline = np.linspace(0, duration_s, sample_rate * duration_s)
    tone_300 = np.sin(2*np.pi*300*timeline)
    tone_200 = np.sin(2*np.pi*200*timeline)

    return Audio(tone_300 + tone_200, rate=sample_rate, autoplay=True)
    

####
# prints memory usage
def show_mem_usage(df):
    """Print the total memory usage of ``df`` in MB (bytes / 1024**2).

    The total is the sum of ``df.memory_usage()``.

    Args:
        df: pandas DataFrame to measure.

    Returns:
        None. The figure is only printed.
    """
    total_bytes = df.memory_usage().sum()
    megabytes = total_bytes / 1024**2
    print('Memory usage of dataframe is {:.2f} MB\n'.format(megabytes))

####
# separates features from label (y must be the last column)
def sep_X_y(df_train, df_test):
    """Split a train and a test frame into feature columns and label column.

    The label is taken positionally: it must be the LAST column of each
    frame; every column before it is treated as a feature.

    Args:
        df_train: training DataFrame (features..., label).
        df_test: test DataFrame with the same column layout.

    Returns:
        list: ``[X_train, y_train, X_test, y_test]`` where the ``X`` entries
        are DataFrames and the ``y`` entries are Series.
    """
    def _features_and_label(frame):
        # all rows; every column but the last / only the last column
        return frame.iloc[:, :-1], frame.iloc[:, -1]

    features_train, label_train = _features_and_label(df_train)
    features_test, label_test = _features_and_label(df_test)

    return [features_train, label_train, features_test, label_test]

####
# split training and test set from given dataframe with dates as boundaries
def dt_train_test_split(df, dt_start_train, dt_end_train, dt_start_test, dt_end_test):
    """Split ``df`` into train/test features and labels using date boundaries.

    Row ranges are located with ``searchsorted`` on ``df.date``, so that column
    must be sorted in ascending order. Both end dates are inclusive: the end
    index is looked up at ``end date + 1 day``. The ``date`` column is removed
    from both partitions before they are handed to ``sep_X_y``, which treats
    the last remaining column as the label.

    Args:
        df: DataFrame with a sorted ``date`` column and the label as last column.
        dt_start_train: first day of the training range (anything
            ``pd.to_datetime`` accepts).
        dt_end_train: last day (inclusive) of the training range.
        dt_start_test: first day of the test range.
        dt_end_test: last day (inclusive) of the test range.

    Returns:
        list: ``[X_train, y_train, X_test, y_test]`` as produced by ``sep_X_y``.

    Raises:
        ValueError: if ``df.date`` is not sorted ascending; ``searchsorted``
            would otherwise silently return meaningless positions.
    """
    print('Splitting dataframe...\n')

    dates = df.date
    if not dates.is_monotonic_increasing:
        raise ValueError(
            "dt_train_test_split requires df to be sorted by 'date' in ascending order"
        )

    def _row_range(first_day, last_day):
        # [lo, hi) row positions covering first_day .. last_day inclusive
        lo = dates.searchsorted(pd.to_datetime(first_day), side='left')
        hi = dates.searchsorted(pd.to_datetime(last_day) + pd.Timedelta(days=1), side='left')
        return lo, hi

    train_lo, train_hi = _row_range(dt_start_train, dt_end_train)
    test_lo, test_hi = _row_range(dt_start_test, dt_end_test)

    # Non-inplace drop: the slices come from the caller's frame, and an inplace
    # drop on such a slice triggers pandas' SettingWithCopyWarning.
    train = df.iloc[train_lo:train_hi].drop(columns=['date'])
    test = df.iloc[test_lo:test_hi].drop(columns=['date'])

    return sep_X_y(train, test)

####
# split training and test set from given dataframe with month as boundaries
def mth_train_test_split(df, mth_start_train, mth_end_train, mth_start_test, mth_end_test):
    """Split ``df`` into train/test features and labels using month boundaries.

    Row ranges are located with ``searchsorted`` on ``df.month``, so that
    column must be sorted in ascending order. Both end months are inclusive:
    the end index is looked up at ``end month + 1``. No column is dropped; the
    partitions are handed to ``sep_X_y``, which treats the last column as the
    label.

    Args:
        df: DataFrame with a sorted ``month`` column and the label as last column.
        mth_start_train: first month of the training range.
        mth_end_train: last month (inclusive) of the training range.
        mth_start_test: first month of the test range.
        mth_end_test: last month (inclusive) of the test range.

    Returns:
        list: ``[X_train, y_train, X_test, y_test]`` as produced by ``sep_X_y``.

    Raises:
        ValueError: if ``df.month`` is not sorted ascending; ``searchsorted``
            would otherwise silently return meaningless positions.
    """
    print('Splitting dataframe...\n')

    months = df.month
    if not months.is_monotonic_increasing:
        raise ValueError(
            "mth_train_test_split requires df to be sorted by 'month' in ascending order"
        )

    def _row_range(first_month, last_month):
        # [lo, hi) row positions covering first_month .. last_month inclusive
        lo = months.searchsorted(first_month, side='left')
        hi = months.searchsorted(last_month + 1, side='left')
        return lo, hi

    train_lo, train_hi = _row_range(mth_start_train, mth_end_train)
    test_lo, test_hi = _row_range(mth_start_test, mth_end_test)

    train = df.iloc[train_lo:train_hi]
    test = df.iloc[test_lo:test_hi]

    return sep_X_y(train, test)

####
# trains XGB model (classifier)
def train_xgb(X, y):
    """Fit an ``XGBClassifier`` and plot its gain-based feature importance.

    The classifier is created with ``tree_method='gpu_hist'`` and ``gpu_id=0``;
    all other hyper-parameters are left at the library defaults.

    Args:
        X: training features.
        y: training labels.

    Returns:
        The object returned by ``XGBClassifier.fit``.
    """
    print('Fitting model...\n')
    classifier = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    trained = classifier.fit(X, y)

    print('Plotting feature importance for "gain". Do not rely on that.\n')
    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
    xgb.plot_importance(classifier, importance_type='gain')
    plt.show()

    # Optional tree plot (needs the graphviz software and pip package):
    #   fig, ax = plt.subplots(figsize=(30, 30))
    #   xgb.plot_tree(classifier, num_trees=0, ax=ax, rankdir='LR')
    #   plt.show()

    return trained

####
# predicts labels of training and test with given model
def predict_values(model, X_train, y_train, X_test, y_test):
    """Predict labels for the train and test features and print both accuracies.

    Args:
        model: fitted estimator exposing ``predict``.
        X_train: training features.
        y_train: true training labels.
        X_test: test features.
        y_test: true test labels.

    Returns:
        list: ``[y_train_pred, y_test_pred]``.
    """
    print('Predicting values...\n')

    # Predict train first, then test.
    y_train_pred, y_test_pred = (model.predict(part) for part in (X_train, X_test))

    # Accuracy of each partition against its true labels.
    model_train = accuracy_score(y_train, y_train_pred)
    model_test = accuracy_score(y_test, y_test_pred)

    summary = (f'\n XGboost train/test accuracies: '
               f'{model_train:.3f}/{model_test:.3f}')
    print(summary)

    return [y_train_pred, y_test_pred]

####
# concatenates prediction with actual target for evaluation
def evaluate_pred(X, y, y_pred):
    """Place features, true labels and predictions next to each other.

    The predictions are wrapped in a one-column ``int8`` DataFrame named
    ``nextBuyIn_pred`` that reuses the index of ``X``, then concatenated
    column-wise with ``X`` and ``y``.

    Args:
        X: feature DataFrame; its index is used for the prediction column.
        y: true labels.
        y_pred: predicted labels, one per row of ``X``.

    Returns:
        DataFrame with the columns of ``X``, then ``y``, then ``nextBuyIn_pred``.
    """
    prediction_column = pd.DataFrame(
        y_pred,
        index=X.index,
        columns=['nextBuyIn_pred'],
        dtype=np.int8,
    )
    return pd.concat([X, y, prediction_column], axis=1)

####
# executes all needed functions of the above with given training and test data and provided train method
def execute_pipeline(train_method, df, list_of_four_df_boundaries):
    """Run split -> train -> predict in one call.

    Args:
        train_method: callable ``(X_train, y_train) -> fitted model``.
        df: DataFrame passed to ``dt_train_test_split``.
        list_of_four_df_boundaries: indexable holding, in order, the train
            start, train end, test start and test end dates.

    Returns:
        list: ``[pred_train, pred_test, X_train, y_train, X_test, y_test]``.
    """
    bounds = list_of_four_df_boundaries

    # 1) date-based split into train/test and features/label
    features_train, label_train, features_test, label_test = dt_train_test_split(
        df, bounds[0], bounds[1], bounds[2], bounds[3]
    )

    # 2) fit with the supplied training routine
    fitted = train_method(features_train, label_train)

    # 3) predict both partitions (also prints the accuracies)
    pred_train, pred_test = predict_values(
        fitted, features_train, label_train, features_test, label_test
    )

    print('\nExecuted pipeline.\nEvaluate with "evaluate_pred(X, y, y_pred)"\n')
    return [pred_train, pred_test, features_train, label_train, features_test, label_test]

In [19]:
%%time
#TODO: IMPORT ORDER!

# Read the labeled feature table from parquet and print its column dtypes and memory usage.
# NOTE(review): absolute, machine-specific path - the cell only runs on this machine;
# presumably this should come from a configurable data directory - confirm.
path = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220621_01_everyCombinationPerDay_basicFeatures_labeled.parquet'
df = pd.read_parquet(path, engine='pyarrow')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244725866 entries, 0 to 244725865
Data columns (total 14 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   userID      uint16
 1   itemID      uint16
 2   brand       int16 
 3   feature_1   int8  
 4   feature_2   uint8 
 5   feature_3   int16 
 6   feature_4   int8  
 7   feature_5   int16 
 8   order       uint8 
 9   year        uint16
 10  month       uint8 
 11  dayofmonth  uint8 
 12  weekofyear  uint8 
 13  week        uint8 
dtypes: int16(3), int8(2), uint16(3), uint8(6)
memory usage: 6.4 GB
CPU times: total: 49.5 s
Wall time: 9.52 s


In [29]:
#savepath = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220621_01_everyCombinationPerDay_basicFeatures_labeled.parquet'
#df_complete.to_parquet(savepath, engine='pyarrow', compression='snappy', index=None)

In [20]:
# Train on months 6-10 and test on month 11 (both ranges inclusive); the last column of df is used as label.
X_train, y_train, X_test, y_test = mth_train_test_split(df, 6, 10, 11, 11)

Splitting dataframe...



In [25]:
# Persist the four split parts as snappy-compressed parquet files so they can be reloaded later.
# NOTE(review): absolute, machine-specific target paths - presumably should be built from a
# configurable data directory; confirm.
save_Xtrain = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220621_02_everyCombinationPerDay_basicFeatures_labeled_XTrain.parquet'
save_Xtest = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220621_02_everyCombinationPerDay_basicFeatures_labeled_XTest.parquet'
save_ytrain = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220621_02_everyCombinationPerDay_basicFeatures_labeled_yTrain.parquet'
save_ytest = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220621_02_everyCombinationPerDay_basicFeatures_labeled_yTest.parquet'

# The label parts are wrapped in a DataFrame before being written.
X_train.to_parquet(save_Xtrain, engine='pyarrow', compression='snappy', index=None)
pd.DataFrame(y_train).to_parquet(save_ytrain, engine='pyarrow', compression='snappy', index=None)
X_test.to_parquet(save_Xtest, engine='pyarrow', compression='snappy', index=None)
pd.DataFrame(y_test).to_parquet(save_ytest, engine='pyarrow', compression='snappy', index=None)

In [26]:
# Drop the reference to the full dataframe and trigger a garbage-collection pass.
# NOTE(review): X_train/X_test were produced by slicing df; if they are views, the
# underlying buffers may stay alive after this - confirm that memory actually drops.
del df
gc.collect()

1763

---

---

# Predicting Weeks

## Training & Prediction

The pipeline needs a training method, a dataframe, and the boundary dates used to split the dataframe into a training set and a test set.

In [28]:
%%time
# Fit the classifier on the training split.
# NOTE(review): the recorded run of this cell ended with "XGBoostError: bad allocation" -
# presumably the training set does not fit into memory; confirm and consider a smaller sample.
model = train_xgb(X_train, y_train)

Fitting model...



XGBoostError: bad allocation

In [None]:
# Predict both splits and print the accuracies; requires `model` from the training cell above.
pred_train, pred_test = predict_values(model, X_train, y_train, X_test, y_test)

## Evaluation

### train set

In [None]:
# Put training features, true labels and predictions side by side and preview the first 10 rows.
df_eval_train = evaluate_pred(X_train, y_train, pred_train)
df_eval_train.head(10)

### test set

In [None]:
# Put test features, true labels and predictions side by side and preview the first 10 rows.
df_eval_test = evaluate_pred(X_test, y_test, pred_test)
df_eval_test.head(10)

In [None]:
# Hit rate restricted to rows whose true label is not 0.
# NOTE(review): expects the label column of df_eval_test to be named `nextBuyInWeeks` - confirm
# against the loaded data.
rowcount = len(df_eval_test)

# Count on boolean masks instead of materialising filtered copies of the frame.
label_nonzero = df_eval_test.nextBuyInWeeks != 0
label_hit = df_eval_test.nextBuyInWeeks == df_eval_test.nextBuyIn_pred
should = int(label_nonzero.sum())
is_ = int((label_nonzero & label_hit).sum())

# Guard both ratios: an empty set or a set without non-zero labels would otherwise
# raise ZeroDivisionError; 'nan' is printed in those cases.
pct_should = should / rowcount * 100 if rowcount else float('nan')
pct_is = is_ / should * 100 if should else float('nan')

print(f'row count of set:\t\t\t\t\t {rowcount}')
print(f'rows where label is not 0:\t\t\t\t {should} \t ({pct_should:.3f} % of all rows in set)')
print(f'rows where label was predicted correctly AND not 0:\t {is_} \t ({pct_is:.3f} % of rows where label is actually not 0)')

---