In [None]:
#!python -m pip install numpy scipy matplotlib ipython jupyter pandas sympy nose
#!pip install -U scikit-learn
#!pip install xgboost
#!pip install pyarrow
#!pip install fastparquet

In [1]:
import warnings
warnings.filterwarnings("ignore")

import gc
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# `display`/`HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule that sets elements with the `.container` class to 75 % width.
display(HTML("<style>.container { width:75% !important; }</style>"))

# pandas output: print up to 250 rows in full; once a frame exceeds that,
# show 25 rows in the truncated view.
pd.set_option('display.max_rows', 250)
pd.set_option('display.min_rows', 25)

####
# Builds an auto-playing sine-wave sound (e.g. to signal that a long-running cell is done)
def playSound():
    """Build an auto-playing notification tone for the notebook.

    Synthesises three seconds of a 300 Hz sine mixed with a 200 Hz sine,
    sampled at 4410 Hz, and wraps the samples in an IPython ``Audio`` object.

    Returns:
        IPython.lib.display.Audio: created with ``autoplay=True``; return it
        as the last expression of a cell (or ``display`` it) to hear it.
    """
    from IPython.lib.display import Audio

    sample_rate = 4410
    duration_seconds = 3
    sample_count = sample_rate * duration_seconds

    # Evenly spaced time stamps covering the whole clip.
    timeline = np.linspace(0, duration_seconds, sample_count)
    tone_300, tone_200 = (np.sin(2*np.pi*freq*timeline) for freq in (300, 200))
    return Audio(tone_300 + tone_200, rate=sample_rate, autoplay=True)
    

####
# prints memory usage
def show_mem_usage(df):
    """Print the total memory footprint of ``df`` in MB.

    Sums everything ``df.memory_usage()`` reports and divides the byte count
    by 1024**2. Nothing is returned.
    """
    bytes_per_mb = 1024**2
    total_bytes = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB\n'.format(total_bytes / bytes_per_mb))

####
# separates features from label (y must be last column)
def sep_X_y(df_train, df_test):
    """Split a train and a test frame into features and label.

    The label is taken to be the LAST column of each frame; every other
    column is treated as a feature.

    Returns:
        list: ``[X_train, y_train, X_test, y_test]`` where each ``X`` is a
        DataFrame (all columns but the last) and each ``y`` is a Series
        (the last column).
    """
    separated = []
    for frame in (df_train, df_test):
        separated.append(frame.iloc[:, :-1])  # every column except the last
        separated.append(frame.iloc[:, -1])   # the last column only
    return separated

####
# split training and test set from given dataframe with dates as boundaries
def dt_train_test_split(df, dt_start_train, dt_end_train, dt_start_test, dt_end_test):
    print('Splitting dataframe...\n')
    # get indices from desired boundaries
    idx_start_train = df.date.searchsorted(pd.to_datetime(dt_start_train), side='left') # list needs to be sorted already for searchsorted
    idx_end_train = df.date.searchsorted(pd.to_datetime(dt_end_train) + pd.Timedelta(days=1), side='left')
    idx_start_test = df.date.searchsorted(pd.to_datetime(dt_start_test), side='left')
    idx_end_test = df.date.searchsorted(pd.to_datetime(dt_end_test) + pd.Timedelta(days=1), side='left')
    
    train = df.iloc[idx_start_train:idx_end_train]
    test = df.iloc[idx_start_test:idx_end_test]
    
    train.drop(columns=['date'], axis=0, inplace=True)
    test.drop(columns=['date'], axis=0, inplace=True)
    
    return sep_X_y(train, test)

####
# trains XGB model (classifier)
def train_xgb(X, y, tree_method='gpu_hist', gpu_id=0, **model_params):
    """Fit an ``XGBClassifier`` and plot its gain-based feature importance.

    Args:
        X: training features.
        y: training labels.
        tree_method: XGBoost tree construction algorithm. Defaults to
            ``'gpu_hist'`` (GPU). Pass e.g. ``'hist'`` to train on the CPU
            when the data does not fit into GPU memory.
        gpu_id: GPU device ordinal; only forwarded when ``tree_method`` is a
            GPU method (its name starts with ``'gpu'``).
        **model_params: any further keyword arguments for ``XGBClassifier``.

    Returns:
        The value returned by ``XGBClassifier.fit`` (the fitted model).

    Called as ``train_xgb(X, y)`` this builds exactly
    ``XGBClassifier(tree_method='gpu_hist', gpu_id=0)``.
    """
    print('Fitting model...\n')
    classifier_params = {'tree_method': tree_method, **model_params}
    if str(tree_method).startswith('gpu'):
        # The device ordinal is only meaningful for GPU tree methods.
        classifier_params['gpu_id'] = gpu_id
    model = XGBClassifier(**classifier_params)
    fitted_model = model.fit(X, y)

    print('Plotting feature importance for "gain". Do not rely on that.\n')
    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
    xgb.plot_importance(model, importance_type='gain')
    plt.show()

    # GRAPHVIZ (software + pip package) needed for tree plotting
    #fig, ax = plt.subplots(figsize=(30, 30))
    #xgb.plot_tree(model, num_trees=0, ax=ax, rankdir='LR')
    #plt.show()

    return fitted_model

####
# predicts labels of training and test with given model
def predict_values(model, X_train, y_train, X_test, y_test):
    """Predict labels for the train and test features and report accuracy.

    Calls ``model.predict`` on both feature sets, scores each prediction
    against its true labels with ``accuracy_score`` and prints both
    accuracies with three decimals.

    Returns:
        list: ``[y_train_pred, y_test_pred]``.
    """
    print('Predicting values...\n')
    splits = ((X_train, y_train), (X_test, y_test))

    # First predict both splits, then score them (train before test).
    predicted = [model.predict(features) for features, _ in splits]
    acc_train, acc_test = (
        accuracy_score(truth, guess)
        for (_, truth), guess in zip(splits, predicted)
    )

    print(f'\n XGboost train/test accuracies: '
          f'{acc_train:.3f}/{acc_test:.3f}')

    return predicted

####
# concatenates prediction with actual target for evaluation
def evaluate_pred(X, y, y_pred):
    """Put features, true label and predicted label side by side.

    ``y_pred`` is turned into a single ``int8`` column named
    ``nextBuyIn_pred`` that carries the index of ``X``, and is then
    concatenated column-wise after ``X`` and ``y``.

    Returns:
        pandas.DataFrame: columns of ``X``, then ``y``, then ``nextBuyIn_pred``.
    """
    prediction_column = pd.DataFrame(
        y_pred,
        index=X.index,
        columns=['nextBuyIn_pred'],
        dtype=np.int8,
    )
    return pd.concat([X, y, prediction_column], axis=1)

####
# executes all needed functions of the above with given training and test data and provided train method
def execute_pipeline(train_method, df, list_of_four_df_boundaries):
    """Run split -> train -> predict in one go.

    Args:
        train_method: callable ``train_method(X_train, y_train)`` returning a
            fitted model.
        df: frame handed to ``dt_train_test_split``.
        list_of_four_df_boundaries: indexable whose items 0..3 are
            train start, train end, test start and test end.

    Returns:
        list: ``[pred_train, pred_test, X_train, y_train, X_test, y_test]``.
    """
    # Only positions 0-3 are read, exactly as with explicit indexing.
    boundaries = [list_of_four_df_boundaries[pos] for pos in range(4)]
    X_train, y_train, X_test, y_test = dt_train_test_split(df, *boundaries)

    fitted = train_method(X_train, y_train)

    pred_train, pred_test = predict_values(fitted, X_train, y_train, X_test, y_test)

    print('\nExecuted pipeline.\nEvaluate with "evaluate_pred(X, y, y_pred)"\n')
    return [pred_train, pred_test, X_train, y_train, X_test, y_test]

In [None]:
# Connect Google Drive to Google Colab
# NOTE: the commands below are wrapped in a triple-quoted string literal, so
# nothing in this cell runs as written; remove the surrounding quotes to use it.
# The wrapped commands mount Drive at /content/drive, create /content/parquet/
# and copy the four parquet files (XTest, XTrain, yTest, yTrain) into it.
'''
from google.colab import drive
drive.mount('/content/drive')
!mkdir /content/parquet/
!cp /content/drive/MyDrive/parquet/220621_02_everyCombinationPerDay_basicFeatures_labeled_XTest.parquet /content/parquet/220621_02_everyCombinationPerDay_basicFeatures_labeled_XTest.parquet
!cp /content/drive/MyDrive/parquet/220621_02_everyCombinationPerDay_basicFeatures_labeled_XTrain.parquet /content/parquet/220621_02_everyCombinationPerDay_basicFeatures_labeled_XTrain.parquet
!cp /content/drive/MyDrive/parquet/220621_02_everyCombinationPerDay_basicFeatures_labeled_yTest.parquet /content/parquet/220621_02_everyCombinationPerDay_basicFeatures_labeled_yTest.parquet
!cp /content/drive/MyDrive/parquet/220621_02_everyCombinationPerDay_basicFeatures_labeled_yTrain.parquet /content/parquet/220621_02_everyCombinationPerDay_basicFeatures_labeled_yTrain.parquet
'''

In [2]:
%%time
path_xtrain = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\220621_02_everyCombinationPerDay_basicFeatures_labeled_XTrain.parquet'
path_ytrain = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\220621_02_everyCombinationPerDay_basicFeatures_labeled_yTrain.parquet'
path_xtest = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\220621_02_everyCombinationPerDay_basicFeatures_labeled_XTest.parquet'
path_ytest = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\220621_02_everyCombinationPerDay_basicFeatures_labeled_yTest.parquet'

X_train = pd.read_parquet(path_xtrain, engine='pyarrow')
y_train = pd.read_parquet(path_ytrain, engine='pyarrow')
X_test = pd.read_parquet(path_xtest, engine='pyarrow')
y_test = pd.read_parquet(path_ytest, engine='pyarrow')

CPU times: total: 33 s
Wall time: 6.05 s


In [3]:
%%time
# Fit the classifier on the full training set loaded in the previous cell.
# NOTE: the recorded run of this cell failed with an XGBoostError (CUDA out of
# memory: 2451629456 bytes requested, 1389494272 bytes free), so `model` was
# not created in that run.
model = train_xgb(X_train, y_train)

Fitting model...



XGBoostError: [20:35:41] C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/tree/updater_gpu_hist.cu:712: Exception in gpu_hist: [20:35:41] c:\users\administrator\workspace\xgboost-win64_release_1.6.0\src\data\../common/device_helpers.cuh:428: Memory allocation error on worker 0: bad allocation: cudaErrorMemoryAllocation: out of memory
- Free memory: 1389494272
- Requested memory: 2451629456



In [None]:
# Score the fitted model on the train and test sets. Requires `model` from the
# training cell; that cell raised in the recorded run and this cell is shown
# as unexecuted (`In [None]`).
pred_train, pred_test = predict_values(model, X_train, y_train, X_test, y_test)