### Methods & Settings

In [2]:
from IPython.display import HTML
from IPython.display import display

# Taken from https://stackoverflow.com/questions/31517194/how-to-hide-one-specific-cell-input-or-output-in-ipython-notebook
# Builds an HTML snippet (a <script> plus a "HIDE/SHOW CONTENT" link) and renders it as
# this cell's output. code_toggle() hides or shows the input area of the currently
# selected code cell depending on `code_show`, then flips the flag; it is also run once
# when the document is ready.
# NOTE(review): depends on jQuery (`$`) and the `div.cell.code_cell...` DOM classes being
# available on the page -- presumably the classic Notebook front end; confirm for JupyterLab.
tag = HTML('''<script>
code_show=false; 
function code_toggle() {
    if (code_show){
        $('div.cell.code_cell.rendered.selected div.input').hide();
    } else {
        $('div.cell.code_cell.rendered.selected div.input').show();
    }
    code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<a href="javascript:code_toggle()">HIDE/SHOW CONTENT</a>.''')
display(tag)

############### Write code below ##################

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import gc
import joblib
import math

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

from IPython.core.display import display, HTML
# Inject CSS that sets the `.container` element to 75% width.
display(HTML("<style>.container { width:75% !important; }</style>"))

# pandas display limits used by every DataFrame repr in this notebook.
pd.set_option('display.max_rows', 250)
pd.set_option('display.min_rows', 25)
pd.set_option('display.max_columns', 50)

####
# prints memory usage
def show_mem_usage(df):
    """Print the total memory footprint of ``df`` in MB.

    The figure is the sum of ``df.memory_usage()`` divided by 1024**2 and is
    printed with two decimals. Nothing is returned.
    """
    bytes_per_mb = 1024 ** 2
    total_mb = df.memory_usage().sum() / bytes_per_mb
    message = 'Memory usage of dataframe is {:.2f} MB\n'.format(total_mb)
    print(message)
    return None

####
# separates features from label (y must be last column)
def sep_X_y(df):
    """Split ``df`` into a feature frame and a target column.

    Every column except the last one goes into the feature frame; the last
    column is returned on its own as the target.

    Returns:
        A two-element list ``[X, y]``.
    """
    n_features = df.shape[1] - 1
    features = df.iloc[:, :n_features]   # all rows, columns 0 .. next-to-last
    target = df.iloc[:, n_features]      # all rows, last column only
    return [features, target]

####
# slice a dataframe to an inclusive range of months
def mth_train_test_split(df, mth_start, mth_end):
    """Return the rows of ``df`` whose ``month`` lies in ``[mth_start, mth_end]``.

    Args:
        df: DataFrame with a ``month`` column. The column must already be
            sorted ascending, because the boundaries are located with a
            binary search (``searchsorted``); unsorted input gives a wrong slice.
        mth_start: first month to include.
        mth_end: last month to include.

    Returns:
        The positional slice of ``df`` covering the requested months
        (empty when no row falls into the range).
    """
    print('Splitting dataframe...\n')

    # Use the function's own parameters; the previous version read the undefined
    # names `mth_start_train` / `mth_end_train` and ignored its arguments.
    # side='left' yields the first position whose month is >= the searched value,
    # so searching for mth_end + 1 gives the exclusive end of the range.
    idx_start = df.month.searchsorted(mth_start, side='left')
    idx_end = df.month.searchsorted(mth_end + 1, side='left')

    return df.iloc[idx_start:idx_end]

####
# trains XGB model (regressor)
def train_xgb(X, y):
    """Fit an ``XGBRegressor`` on ``(X, y)`` and plot its gain importances.

    The regressor is created with ``tree_method='gpu_hist'`` and ``gpu_id=0``.
    After fitting, the top 25 features by 'gain' importance are plotted and
    the figure is shown.

    Returns:
        Whatever ``XGBRegressor.fit`` returns (the fitted estimator).
    """
    print('Fitting model...\n')
    regressor = XGBRegressor(tree_method='gpu_hist', gpu_id=0)
    fitted_model = regressor.fit(X, y)

    # background reading on interpreting XGBoost importances
    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
    xgb.plot_importance(regressor, importance_type='gain', max_num_features=25)
    plt.show()

    # Tree plotting needs GRAPHVIZ (software + pip package), e.g.:
    #   fig, ax = plt.subplots(figsize=(30, 30))
    #   xgb.plot_tree(regressor, num_trees=0, ax=ax, rankdir='LR')
    #   plt.show()

    return fitted_model

####
# trains LinearRegression model
def train_linReg(X, y):
    """Fit a default ``LinearRegression`` on ``(X, y)``.

    Prints a short progress message before and after fitting.

    Returns:
        Whatever ``LinearRegression.fit`` returns (the fitted estimator).
    """
    print('Fitting model...')
    fitted_model = LinearRegression().fit(X, y)
    print('Done!')

    return fitted_model

####
# trains LightGBM model (regressor)
def train_lgbm(X, y):
    """Fit an ``LGBMRegressor`` on ``(X, y)`` and plot its gain importances.

    The regressor is created with ``boosting_type='gbdt'`` and ``device="gpu"``.
    After fitting, the top 25 features by 'gain' importance are plotted and
    the figure is shown.

    Returns:
        Whatever ``LGBMRegressor.fit`` returns (the fitted estimator).
    """
    print('Fitting model...\n')
    regressor = LGBMRegressor(boosting_type='gbdt', device="gpu")
    fitted_model = regressor.fit(X, y)

    print('Plotting feature importance for "gain".')
    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
    lgb.plot_importance(regressor, importance_type='gain', max_num_features=25)
    plt.show()

    return fitted_model


def train_xgb_bestHyper(X, y):
    """Fit an ``XGBRegressor`` using the tuned hyper-parameters and return it.

    Reads three notebook-level globals that must exist before the call:
    ``best_hyperparams`` (mapping with the keys 'eta', 'max_depth', 'gamma',
    'reg_alpha', 'min_child_weight', 'colsample_bytree') and ``X_test`` /
    ``y_test`` (used as the second evaluation set for early stopping).

    Args:
        X: training features.
        y: training target.

    Returns:
        The fitted regressor (return value of ``XGBRegressor.fit``).
    """
    X_train = X
    y_train = y
    space = best_hyperparams

    print('Fitting model...\n')
    model = XGBRegressor(tree_method='gpu_hist', gpu_id=0,
                    eta=space['eta'],
                    max_depth=int(space['max_depth']),
                    gamma=space['gamma'],
                    reg_alpha=int(space['reg_alpha']),
                    min_child_weight=int(space['min_child_weight']),
                    # colsample_bytree is a fraction; int() would truncate it to 0 or 1
                    colsample_bytree=float(space['colsample_bytree']))

    evaluation = [(X_train, y_train), (X_test, y_test)]

    # Fit the model that was just configured; the previous version called .fit on
    # `fitted_model`, a name that was never assigned inside this function.
    fitted_model = model.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="rmse",
            early_stopping_rounds=10, verbose=False)

    print('Plotting feature importance for "gain". Do not rely on that.\n')
    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
    xgb.plot_importance(fitted_model, importance_type='gain')
    plt.show()

    # GRAPHVIZ (software + pip package) needed for tree plotting
    #fig, ax = plt.subplots(figsize=(30, 30))
    #xgb.plot_tree(model, num_trees=0, ax=ax, rankdir='LR')
    #plt.show()

    return fitted_model

def train_dtc(X, y):
    """Fit a default ``DecisionTreeRegressor`` on ``(X, y)``.

    Prints a progress message before fitting and a reference link afterwards.

    Returns:
        Whatever ``DecisionTreeRegressor.fit`` returns (the fitted estimator).
    """
    print('Fitting model...\n')
    fitted_model = DecisionTreeRegressor().fit(X, y)

    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')

    return fitted_model

####
# predicts labels for X with the given model and reports MSE / RMSE
def predict_values(model, X, y_true):
    """Predict with ``model`` on ``X`` and print MSE and RMSE against ``y_true``.

    Args:
        model: any fitted estimator exposing ``predict(X)``.
        X: feature matrix to predict on.
        y_true: ground-truth targets aligned with ``X``.

    Returns:
        The predictions exactly as returned by ``model.predict(X)``.
    """
    print('Predicting values...')
    y_pred = model.predict(X)
    print('Done!\n')

    mse = mean_squared_error(y_true, y_pred)
    rmse = math.sqrt(mse)

    # Label the metrics with the actual estimator class; the previous hard-coded
    # "XGboost Regressor" label was also printed for LinearRegression etc.
    model_name = type(model).__name__
    # \033[1m switches the terminal to bold, \033[0m resets it so that later
    # output is not rendered bold as well.
    bold, reset = '\033[1m', '\033[0m'
    print(f'\t\t\t\t\t{bold} {model_name} MSE:  {mse:.3f}{reset}')
    print(f'\t\t\t\t\t{bold} {model_name} RMSE:  {rmse:.3f}{reset}')

    return y_pred

####
# concatenates prediction with actual target for evaluation
def concat_ytrue_ypred(X, y_true, y_pred):
    """Put features, true target and rounded prediction side by side.

    Args:
        X: feature DataFrame; its index is reused for the prediction column.
        y_true: true target, aligned with ``X``.
        y_pred: array-like of numeric predictions, one per row of ``X``.

    Returns:
        DataFrame with the columns of ``X``, then ``y_true``, then an int8
        column ``'nextBuyIn_pred'``.
    """
    # Round explicitly and saturate to the int8 range before casting. Passing raw
    # float predictions together with dtype=np.int8 to the DataFrame constructor is
    # pandas-version dependent (newer versions refuse the lossy cast), and values
    # outside the int8 range would wrap around.
    int8_info = np.iinfo(np.int8)
    rounded = np.clip(np.rint(np.asarray(y_pred, dtype=float)), int8_info.min, int8_info.max)
    df_y_pred = pd.DataFrame(rounded.astype(np.int8), columns=['nextBuyIn_pred'], index=X.index)

    # concatenate X, y, y_pred (put columns next to each other)
    df_eval = pd.concat([X, y_true, df_y_pred], axis=1)

    return df_eval

####
# executes all needed functions of the above with given training and test data and provided train method
# def execute_pipeline(train_method, df, start_mth, end_mth):
#     b = list_of_four_df_boundaries
#     # split dataframe in train/test and X/y
#     X_train, y_train, X_test, y_test = dt_train_test_split(df, b[0], b[1], b[2], b[3])
    
#     #train model
#     model = train_method(X_train, y_train)    
    
#     # make predictions
#     pred_train, pred_test = predict_values(model, X_train, y_train, X_test, y_test)
    
#     print('\nExecuted pipeline.\nEvaluate with "evaluate_pred(X, y, y_pred)"\n')
#     return [pred_train, pred_test, X_train, y_train, X_test, y_test]

ModuleNotFoundError: No module named 'lightgbm'

# <font color='purple'>Predicting Weeks w/o normalization + categories multihot (Train/Test)</font>


In [12]:
# Input CSVs for this section (absolute, machine-specific paths).
train = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\17-1_220624_4TimeRepurchaser_train.csv'
test = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\17-2_220624_4TimeRepurchaser_test.csv'

# Columns to load (given to read_csv as `usecols`); commented-out entries are excluded.
columns = [#'date',
           'userID', 
           'itemID',
           'order', 
           'brand', 
           'feature_1', 
           'feature_2', 
           'feature_3', 
           'feature_4', 
           'feature_5',
           'categories',
           'brandOrderRatio',
           'feature1OrderRatio',
           'feature2OrderRatio',
           'feature3OrderRatio',
           'feature4OrderRatio',
           'feature5OrderRatio',
           'TotalBFscore',
           'RCP',
           'MeanDiffToNxt(user)',
           'TotalItemOrders(user)',
           #'TotalItemOrders(item)',
           'date(year)',
           'date(month)',
           #'date(weekOfMonth)',
           'date(dayOfMonth)',
           'date(weekOfYear)',
           'date(dayOfYear)',
           #'nextBuyInWeeks(round)', # label
           'nextBuyInWeeks(floor)', # label
           #'nextBuyInWeekOfYear' # label; bad idea
          ]

# Explicit compact dtypes for the integer columns (given to read_csv as `dtype`).
# NOTE(review): 'date(weekOfMonth)' has an entry here although it is commented out of `columns`.
dtype = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint16,
         'nextBuyInWeeks(floor)':np.uint8
        }

# Name of the target column.
label = 'nextBuyInWeeks(floor)'

# Load both CSVs ('|' separated). The converter turns the text of a 'categories' cell into
# a list of ints by dropping its first and last character and splitting the rest on ','.
df_train = pd.read_csv(train, sep='|', usecols=columns, dtype=dtype, nrows=None, converters={
    'categories': lambda x: [int(i) for i in x[1:-1].split(',')]
})

df_test = pd.read_csv(test, sep='|', usecols=columns, dtype=dtype, nrows=None, converters={
    'categories': lambda x: [int(i) for i in x[1:-1].split(',')]
})

# add fake row for ensuring all categories from 0 to 4299 are included
# (its 'categories' cell lists 0..4299, every other column is set to 0)
df_train.loc[len(df_train)] = [0 if column != 'categories' else [cat for cat in range(0,4300)] for column in df_train.columns]
df_train.index = df_train.index + 1  # shift every index label up by one

df_test.loc[len(df_test)] = [0 if column != 'categories' else [cat for cat in range(0,4300)] for column in df_test.columns]
df_test.index = df_test.index + 1  # shift every index label up by one

# `df` is a second name for the same object as df_train (no copy is made),
# so in-place changes made through `df` also change df_train.
df = df_train

In [13]:
# Rich display of the current `df`.
df

Unnamed: 0,userID,itemID,prediction
1,0,20664,
2,0,28231,
3,13,2690,
4,15,1299,
5,15,20968,
...,...,...,...
9996,46118,20106,
9997,46124,19677,
9998,46125,12878,
9999,46127,7963,


## Preparation

In [93]:
# multi-hot-encode categories
cats = df["categories"]
mlb = MultiLabelBinarizer(sparse_output=False) # Set to True if output binary array is desired in CSR sparse format
# One indicator column per class found by the binarizer, indexed like `df`, stored as sparse uint8.
df_multi_hot = pd.DataFrame(mlb.fit_transform(cats), columns=mlb.classes_, index=df.index, dtype=np.int8).astype(pd.SparseDtype(np.uint8,0)) # NaN filled with 0

# drop fake rows from both dataframes (last row) & drop category '9999' standing for missing category
# NOTE(review): the code drops whatever the LAST row and LAST encoded column are; it assumes
# the last row is the artificial all-categories row and the last class is 9999 -- neither is checked.
df_multi_hot.drop(index=df.index[-1], axis=0, inplace=True)
df_multi_hot = df_multi_hot.iloc[:,:-1]
df.drop(index=df.index[-1], axis=0, inplace=True)

# join new binarized columns with rest of dataframe
df = df.join(df_multi_hot, how='inner')

# fail loudly if any row contains a missing value after the join
if (len(df[df.isnull().any(axis=1)]) > 0):
    raise RuntimeError('Join of multi-hot-encoded categories probably created missing values.')

# drop list of categories, since it's not needed anymore
df.drop('categories', axis=1, inplace=True)

# move the label column to the end of the dataframe (sep_X_y expects y as last column)
col = df.pop(label)
df.insert(len(df.columns), col.name, col)

# free the intermediate encoding; gc.collect() is the last expression, so its return value is shown
del df_multi_hot
gc.collect()

#df

0

In [94]:
# save column names
column_headers = list(df.columns)

# split DF in X & y (sep_X_y takes the last column as y)
X_train, y_train = sep_X_y(df)
#X_train

## Training & Prediction

## Linear Regression

Pipeline needs training method, dataframe and dates to split dataframe in training and test set.

In [95]:
# Fit on the training data; the decision-tree alternative is kept commented out.
model = train_linReg(X_train, y_train)
#model = train_dtc(X_train, y_train)

# Predict on the same data the model was fitted on, so the printed error is a training error.
y_pred = predict_values(model, X_train, y_train)

Fitting model...
Done!
Predicting values...
Done!

					[1m XGboost Regressor MSE:  7.514
					[1m XGboost Regressor RMSE:  2.741


### Evaluation

In [96]:
# NOTE: dtype_X / dtype_y are not read anywhere in this cell; they are kept
# because other cells of the notebook may rely on them.
dtype_X = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         #'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint16
        }
dtype_y = {'nextBuyInWeeks(floor)':np.uint8}

# Round predictions to whole weeks. Clip into the uint8 range before casting:
# a regression model can output negative (or very large) values, which would
# otherwise wrap around when converted to an unsigned 8-bit integer.
y_pred = (
    pd.DataFrame(y_pred, index=y_train.index)
    .round()
    .clip(lower=0, upper=np.iinfo(np.uint8).max)
    .astype(np.uint8)
)

# Plain column assignment instead of set_axis(..., inplace=True); the `inplace`
# argument of set_axis no longer exists in pandas 2.x.
y_pred.columns = ['nextBuyIn_pred']

# concatenate X, y, y_pred (columns next to each other)
df_eval = pd.concat([X_train, y_train, y_pred], axis=1)

# share of rows whose rounded prediction equals the label exactly
rowcount = len(df_eval)
should = rowcount
is_ = len(df_eval.loc[(df_eval['nextBuyInWeeks(floor)'] == df_eval.nextBuyIn_pred)])

print(f'\033[1mrow count of set:\t\t\t\t {rowcount}')
print(f'\033[1mrows where label was predicted correctly:\t {is_} \t ({is_/should*100:.3f} % of rows)')

df_eval

[1mrow count of set:				 39736
[1mrows where label was predicted correctly:	 7418 	 (18.668 % of rows)


Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(weekOfMonth),date(dayOfMonth),date(weekOfYear),date(dayOfYear),...,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,nextBuyInWeeks(floor),nextBuyIn_pred
1,276,15667,1,1201,4,0,30,0,163,0.000659,0.023826,0.044684,0.000067,0.033429,0.000773,0.911912,1.000000,4,44.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,7
2,276,28708,1,504,10,0,441,3,84,0.000978,0.020648,0.044684,0.001161,0.016194,0.004704,0.775920,1.000000,3,37.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,6
3,532,7644,1,1276,6,0,45,3,48,0.000081,0.005880,0.044684,0.000031,0.016194,0.000575,0.587077,1.000000,3,52.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,8
4,752,22963,1,1201,10,0,43,0,147,0.000659,0.020648,0.044684,0.000451,0.033429,0.000283,0.882270,1.000000,7,47.333333,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,7
5,1123,18498,1,1401,4,0,95,0,44,0.000074,0.023826,0.044684,0.000017,0.033429,0.003489,0.930692,1.000000,2,62.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,9
6,1421,20664,1,408,4,0,284,0,66,0.000455,0.023826,0.044684,0.000007,0.033429,0.001987,0.920493,1.000000,2,91.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,11
7,1524,20703,1,745,10,0,503,0,17,0.000519,0.020648,0.044684,0.002691,0.033429,0.002932,0.925138,1.000000,2,57.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,9
8,1524,22341,1,386,10,0,502,0,29,0.000194,0.020648,0.044684,0.000722,0.033429,0.000135,0.879196,1.000000,3,57.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,8
9,1524,23688,3,420,4,1,510,0,-1,0.000007,0.023826,0.003734,0.000318,0.033429,0.000000,0.531723,1.000000,7,28.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,6
10,1567,29726,1,378,10,0,421,0,3,0.000396,0.020648,0.044684,0.001179,0.033429,0.001126,0.894083,1.000000,2,80.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,10


In [97]:
# multi-hot-encode categories of the test set (same steps as for the training set)
cats = df_test["categories"]
# NOTE(review): a new binarizer is fitted on the test data here instead of reusing the one
# fitted on the training data; the encoded columns only line up with training if both fits
# find the same classes -- presumably guaranteed by the artificial 0..4299 row; confirm.
mlb = MultiLabelBinarizer(sparse_output=False) # Set to True if output binary array is desired in CSR sparse format
df_multi_hot = pd.DataFrame(mlb.fit_transform(cats), columns=mlb.classes_, index=df_test.index, dtype=np.int8).astype(pd.SparseDtype(np.uint8,0)) # NaN filled with 0

# drop fake rows from both dataframes (last row) & drop category '9999' standing for missing category
# NOTE(review): the code drops whatever the LAST row and LAST encoded column are; neither is checked.
df_multi_hot.drop(index=df_test.index[-1], axis=0, inplace=True)
df_multi_hot = df_multi_hot.iloc[:,:-1]
df_test.drop(index=df_test.index[-1], axis=0, inplace=True)

# join new binarized columns with rest of dataframe
df_test = df_test.join(df_multi_hot, how='inner')

# fail loudly if any row contains a missing value after the join
if (len(df_test[df_test.isnull().any(axis=1)]) > 0):
    raise RuntimeError('Join of multi-hot-encoded categories probably created missing values.')

# drop list of categories, since it's not needed anymore
df_test.drop('categories', axis=1, inplace=True)

# move the label column to the end of the dataframe (sep_X_y expects y as last column)
col = df_test.pop(label)
df_test.insert(len(df_test.columns), col.name, col)

# free the intermediate encoding; gc.collect() is the last expression, so its return value is shown
del df_multi_hot
gc.collect()

#df_test

0

In [98]:
# Split the prepared test frame (last column is y) and score the already fitted `model` on it.
X_test, y_test = sep_X_y(df_test)

y_pred = predict_values(model, X_test, y_test)

Predicting values...
Done!

					[1m XGboost Regressor MSE:  16.042
					[1m XGboost Regressor RMSE:  4.005


In [99]:
# NOTE: dtype_X / dtype_y are not read anywhere in this cell; they are kept
# because other cells of the notebook may rely on them.
dtype_X = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         #'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint16
        }
dtype_y = {'nextBuyInWeeks(floor)':np.uint8}

# Round predictions to whole weeks. Clip into the uint8 range before casting:
# a regression model can output negative (or very large) values, which would
# otherwise wrap around when converted to an unsigned 8-bit integer.
y_pred = (
    pd.DataFrame(y_pred, index=y_test.index)
    .round()
    .clip(lower=0, upper=np.iinfo(np.uint8).max)
    .astype(np.uint8)
)

# Plain column assignment instead of set_axis(..., inplace=True); the `inplace`
# argument of set_axis no longer exists in pandas 2.x.
y_pred.columns = ['nextBuyIn_pred']
# concatenate X, y, y_pred (columns next to each other)
df_eval = pd.concat([X_test, y_test, y_pred], axis=1)

# share of rows whose rounded prediction equals the label exactly
rowcount = len(df_eval)
should = rowcount
is_ = len(df_eval.loc[(df_eval['nextBuyInWeeks(floor)'] == df_eval.nextBuyIn_pred)])

print(f'\033[1mrow count of set:\t\t\t\t {rowcount}')
print(f'\033[1mrows where label was predicted correctly:\t {is_} \t ({is_/should*100:.3f} % of rows)')

df_eval

[1mrow count of set:				 13591
[1mrows where label was predicted correctly:	 1968 	 (14.480 % of rows)


Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(weekOfMonth),date(dayOfMonth),date(weekOfYear),date(dayOfYear),...,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,nextBuyInWeeks(floor),nextBuyIn_pred
1,21340,16599,1,888,10,0,224,3,132,0.000575,0.027466,0.059594,0.000211,0.021632,0.001015,0.727041,1.000000,4,7.500000,2020,6,3,16,25,168,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,19669,19043,1,186,10,3,27,3,39,0.002310,0.027466,0.002299,0.000486,0.021632,0.000971,0.351915,1.000000,3,10.000000,2020,6,4,22,26,174,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,2
3,5218,6168,3,1445,10,0,-1,-1,178,0.000853,0.027466,0.059594,0.000000,0.000000,0.000148,0.574956,1.000000,7,10.000000,2020,6,4,26,26,178,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3
4,6042,5423,8,449,4,0,535,3,105,0.000425,0.031841,0.059594,0.000335,0.021632,0.000401,0.752364,1.000000,22,7.500000,2020,6,5,29,27,181,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3
5,22362,6657,1,504,10,0,441,3,84,0.001255,0.027466,0.059594,0.001475,0.021632,0.006202,0.775391,1.000000,3,14.000000,2020,6,5,30,27,182,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,4
6,14428,26159,1,1065,10,0,491,3,147,0.000587,0.027466,0.059594,0.002712,0.021632,0.000371,0.739716,1.000000,4,13.500000,2020,7,1,1,27,183,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,23,3
7,33697,1782,3,6,4,3,321,0,144,0.001541,0.031841,0.002299,0.000606,0.044612,0.005556,0.564080,1.000000,9,13.500000,2020,7,1,5,27,187,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,4
8,24338,1932,3,186,4,0,319,0,144,0.002310,0.031841,0.059594,0.000246,0.044612,0.005556,0.955303,1.000000,8,15.000000,2020,7,2,6,28,188,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,4
9,44183,18804,2,1045,6,0,525,0,-1,0.000053,0.007963,0.059594,0.000131,0.044612,0.000000,0.739662,1.000000,7,13.500000,2020,7,2,7,28,189,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,4
10,26563,6524,3,961,6,0,436,3,117,0.000061,0.007963,0.059594,0.000055,0.021632,0.001072,0.590650,1.000000,15,12.333333,2020,7,2,8,28,190,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3


---

# Mean Addition to last purchase

In [18]:
# Input CSV for the mean baseline (absolute, machine-specific path).
test = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\17-2_220624_4TimeRepurchaser_test.csv'

# Columns to load (given to read_csv as `usecols`); unlike the regression sections,
# 'date' is included here. Commented-out entries are excluded.
columns = ['date',
           'userID', 
           'itemID',
           'order', 
           'brand', 
           'feature_1', 
           'feature_2', 
           'feature_3', 
           'feature_4', 
           'feature_5',
           'categories',
           'brandOrderRatio',
           'feature1OrderRatio',
           'feature2OrderRatio',
           'feature3OrderRatio',
           'feature4OrderRatio',
           'feature5OrderRatio',
           'TotalBFscore',
           'RCP',
           'MeanDiffToNxt(user)',
           'TotalItemOrders(user)',
           #'TotalItemOrders(item)',
           'date(year)',
           'date(month)',
           #'date(weekOfMonth)',
           'date(dayOfMonth)',
           'date(weekOfYear)',
           'date(dayOfYear)',
           #'nextBuyInWeeks(round)', # label
           'nextBuyInWeeks(floor)', # label
           #'nextBuyInWeekOfYear' # label; bad idea
          ]

# Explicit compact dtypes for the integer columns (given to read_csv as `dtype`).
# NOTE(review): 'date(weekOfMonth)' has an entry here although it is commented out of `columns`.
dtype = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint16,
         'nextBuyInWeeks(floor)':np.uint8
        }

# Name of the target column.
label = 'nextBuyInWeeks(floor)'

## Preparation

In [19]:
# Reload the test CSV ('|' separated). The converter turns the text of a 'categories' cell
# into a list of ints by dropping its first and last character and splitting the rest on ','.
# This rebinds `df_test`, replacing whatever an earlier section stored under that name.
df_test = pd.read_csv(test, sep='|', usecols=columns, dtype=dtype, nrows=None, converters={
    'categories': lambda x: [int(i) for i in x[1:-1].split(',')]
})


#df_test

Unnamed: 0,date,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(weekOfMonth),date(dayOfMonth),date(weekOfYear),date(dayOfYear),nextBuyInWeeks(floor)
0,2020-06-16,21340,16599,1,888,10,0,224,3,132,"[2005, 2320, 1883, 87, 1000, 3570, 4030]",0.000575,0.027466,0.059594,0.000211,0.021632,0.001015,0.727041,1.000000,4,7.500000,2020,6,3,16,25,168,1
1,2020-06-22,19669,19043,1,186,10,3,27,3,39,"[836, 2909, 545, 1668, 813, 1515, 3457, 1760, ...",0.002310,0.027466,0.002299,0.000486,0.021632,0.000971,0.351915,1.000000,3,10.000000,2020,6,4,22,26,174,6
2,2020-06-26,5218,6168,3,1445,10,0,-1,-1,178,"[2209, 3424, 2605, 1930, 308, 3688, 284, 3283,...",0.000853,0.027466,0.059594,0.000000,0.000000,0.000148,0.574956,1.000000,7,10.000000,2020,6,4,26,26,178,2
3,2020-06-29,6042,5423,8,449,4,0,535,3,105,"[2629, 1730, 471, 423, 609, 3170, 59, 2744]",0.000425,0.031841,0.059594,0.000335,0.021632,0.000401,0.752364,1.000000,22,7.500000,2020,6,5,29,27,181,1
4,2020-06-30,22362,6657,1,504,10,0,441,3,84,"[2591, 2708]",0.001255,0.027466,0.059594,0.001475,0.021632,0.006202,0.775391,1.000000,3,14.000000,2020,6,5,30,27,182,5
5,2020-07-01,14428,26159,1,1065,10,0,491,3,147,"[274, 1807, 1760, 3132, 3924, 3915, 3912, 3613...",0.000587,0.027466,0.059594,0.002712,0.021632,0.000371,0.739716,1.000000,4,13.500000,2020,7,1,1,27,183,23
6,2020-07-05,33697,1782,3,6,4,3,321,0,144,"[2977, 2844, 1059, 935, 3253, 3113, 990, 1395,...",0.001541,0.031841,0.002299,0.000606,0.044612,0.005556,0.564080,1.000000,9,13.500000,2020,7,1,5,27,187,8
7,2020-07-06,24338,1932,3,186,4,0,319,0,144,"[30, 105, 3727, 692, 1060, 3268]",0.002310,0.031841,0.059594,0.000246,0.044612,0.005556,0.955303,1.000000,8,15.000000,2020,7,2,6,28,188,2
8,2020-07-07,44183,18804,2,1045,6,0,525,0,-1,"[2953, 2079, 1490, 2205, 299, 2487, 675, 4008]",0.000053,0.007963,0.059594,0.000131,0.044612,0.000000,0.739662,1.000000,7,13.500000,2020,7,2,7,28,189,20
9,2020-07-08,26563,6524,3,961,6,0,436,3,117,"[1800, 2255, 1241]",0.000061,0.007963,0.059594,0.000055,0.021632,0.001072,0.590650,1.000000,15,12.333333,2020,7,2,8,28,190,2


In [20]:
# Baseline prediction: current week of year plus the user's mean gap to the next
# purchase divided by 7 and rounded.
# NOTE(review): dividing by 7 suggests 'MeanDiffToNxt(user)' is in days -- confirm.
df_test['meanPred'] = (df_test['date(weekOfYear)'] + round(df_test['MeanDiffToNxt(user)']/7)).astype(np.uint16)

# True target week. Widen to uint16 before adding: both columns are declared uint8 in
# the read_csv dtype map, and a uint8 + uint8 sum wraps around at 256.
df_test['y_true'] = df_test['date(weekOfYear)'].astype(np.uint16) + df_test['nextBuyInWeeks(floor)']

# share of rows where the baseline hits the true week exactly
rowcount = len(df_test)
should = rowcount
is_ = len(df_test.loc[(df_test['meanPred'] == df_test.y_true)])

print(f'\033[1mrow count of set:\t\t\t\t {rowcount}')
print(f'\033[1mrows where label was predicted correctly:\t {is_} \t ({is_/should*100:.3f} % of rows)')

df_test

Unnamed: 0,date,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(weekOfMonth),date(dayOfMonth),date(weekOfYear),date(dayOfYear),nextBuyInWeeks(floor),meanPred,y_true
0,2020-06-16,21340,16599,1,888,10,0,224,3,132,"[2005, 2320, 1883, 87, 1000, 3570, 4030]",0.000575,0.027466,0.059594,0.000211,0.021632,0.001015,0.727041,1.000000,4,7.500000,2020,6,3,16,25,168,1,26,26
1,2020-06-22,19669,19043,1,186,10,3,27,3,39,"[836, 2909, 545, 1668, 813, 1515, 3457, 1760, ...",0.002310,0.027466,0.002299,0.000486,0.021632,0.000971,0.351915,1.000000,3,10.000000,2020,6,4,22,26,174,6,27,32
2,2020-06-26,5218,6168,3,1445,10,0,-1,-1,178,"[2209, 3424, 2605, 1930, 308, 3688, 284, 3283,...",0.000853,0.027466,0.059594,0.000000,0.000000,0.000148,0.574956,1.000000,7,10.000000,2020,6,4,26,26,178,2,27,28
3,2020-06-29,6042,5423,8,449,4,0,535,3,105,"[2629, 1730, 471, 423, 609, 3170, 59, 2744]",0.000425,0.031841,0.059594,0.000335,0.021632,0.000401,0.752364,1.000000,22,7.500000,2020,6,5,29,27,181,1,28,28
4,2020-06-30,22362,6657,1,504,10,0,441,3,84,"[2591, 2708]",0.001255,0.027466,0.059594,0.001475,0.021632,0.006202,0.775391,1.000000,3,14.000000,2020,6,5,30,27,182,5,29,32
5,2020-07-01,14428,26159,1,1065,10,0,491,3,147,"[274, 1807, 1760, 3132, 3924, 3915, 3912, 3613...",0.000587,0.027466,0.059594,0.002712,0.021632,0.000371,0.739716,1.000000,4,13.500000,2020,7,1,1,27,183,23,29,50
6,2020-07-05,33697,1782,3,6,4,3,321,0,144,"[2977, 2844, 1059, 935, 3253, 3113, 990, 1395,...",0.001541,0.031841,0.002299,0.000606,0.044612,0.005556,0.564080,1.000000,9,13.500000,2020,7,1,5,27,187,8,29,35
7,2020-07-06,24338,1932,3,186,4,0,319,0,144,"[30, 105, 3727, 692, 1060, 3268]",0.002310,0.031841,0.059594,0.000246,0.044612,0.005556,0.955303,1.000000,8,15.000000,2020,7,2,6,28,188,2,30,30
8,2020-07-07,44183,18804,2,1045,6,0,525,0,-1,"[2953, 2079, 1490, 2205, 299, 2487, 675, 4008]",0.000053,0.007963,0.059594,0.000131,0.044612,0.000000,0.739662,1.000000,7,13.500000,2020,7,2,7,28,189,20,30,48
9,2020-07-08,26563,6524,3,961,6,0,436,3,117,"[1800, 2255, 1241]",0.000061,0.007963,0.059594,0.000055,0.021632,0.001072,0.590650,1.000000,15,12.333333,2020,7,2,8,28,190,2,30,30


---

# <font color='red'>PREDICTING FOR SUBMISSION // LinearRegression</font>


In [4]:
# Input CSVs for the submission run (absolute, machine-specific paths).
train = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220625_complete_feature-list_orderhistory_trainingOhneNull.csv'
predset = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220625_complete_feature-list_orderhistory_testNurNull.csv'

# Columns to load (given to read_csv as `usecols`); 'date' and 'categories' are
# excluded in this section. Commented-out entries are not loaded.
columns = [#'date',
           'userID', 
           'itemID',
           'order', 
           'brand', 
           'feature_1', 
           'feature_2', 
           'feature_3', 
           'feature_4', 
           'feature_5',
           #'categories',
           'brandOrderRatio',
           'feature1OrderRatio',
           'feature2OrderRatio',
           'feature3OrderRatio',
           'feature4OrderRatio',
           'feature5OrderRatio',
           'TotalBFscore',
           'RCP',
           'MeanDiffToNxt(user)',
           'TotalItemOrders(user)',
           #'TotalItemOrders(item)',
           'date(year)',
           'date(month)',
           #'date(weekOfMonth)',
           'date(dayOfMonth)',
           'date(weekOfYear)',
           'date(dayOfYear)',
           #'nextBuyInWeeks(round)', # label
           'nextBuyInWeeks(floor)', # label
           #'nextBuyInWeekOfYear' # label; bad idea
          ]

# Explicit compact dtypes for the integer columns (given to read_csv as `dtype`).
dtype = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         # day of year goes up to 366 and does not fit into uint8 (max 255);
         # uint16 also matches the dtype maps of the other sections
         'date(dayOfYear)':np.uint16,
         'nextBuyInWeeks(floor)':np.uint8
        }

# Name of the target column.
label = 'nextBuyInWeeks(floor)'

## Preparation

In [6]:
# Load the training and prediction CSVs ('|' separated).
# NOTE(review): 'categories' is commented out of `columns` in this section, so the
# converter below has no loaded column to apply to -- presumably ignored by pandas; confirm.
df_train = pd.read_csv(train, sep='|', usecols=columns, dtype=dtype, nrows=None, converters={
    'categories': lambda x: [int(i) for i in x[1:-1].split(',')]
})

df_test = pd.read_csv(predset, sep='|', usecols=columns, dtype=dtype, nrows=None, converters={
    'categories': lambda x: [int(i) for i in x[1:-1].split(',')]
})

# add fake row for ensuring all categories from 0 to 4299 are included
# (without a 'categories' column the appended row consists of zeros only)
df_train.loc[len(df_train)] = [0 if column != 'categories' else [cat for cat in range(0,4300)] for column in df_train.columns]
df_train.index = df_train.index + 1  # shift every index label up by one

df_test.loc[len(df_test)] = [0 if column != 'categories' else [cat for cat in range(0,4300)] for column in df_test.columns]
df_test.index = df_test.index + 1  # shift every index label up by one

# `df` is a second name for the same object as df_train (no copy is made).
df = df_train
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 896427 entries, 1 to 896427
Data columns (total 25 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   userID                 896427 non-null  int64  
 1   itemID                 896427 non-null  int64  
 2   order                  896427 non-null  int64  
 3   brand                  896427 non-null  int64  
 4   feature_1              896427 non-null  int64  
 5   feature_2              896427 non-null  int64  
 6   feature_3              896427 non-null  int64  
 7   feature_4              896427 non-null  int64  
 8   feature_5              896427 non-null  int64  
 9   brandOrderRatio        896427 non-null  float64
 10  feature1OrderRatio     896427 non-null  float64
 11  feature2OrderRatio     896427 non-null  float64
 12  feature3OrderRatio     896427 non-null  float64
 13  feature4OrderRatio     896427 non-null  float64
 14  feature5OrderRatio     896427 non-nu

In [87]:
# # multi-hot-encode categories
# cats = df["categories"]
# mlb = MultiLabelBinarizer(sparse_output=False) # Set to True if output binary array is desired in CSR sparse format
# df_multi_hot = pd.DataFrame(mlb.fit_transform(cats), columns=mlb.classes_, index=df.index, dtype=np.int8).astype(pd.SparseDtype(np.uint8,0)) # NaN filled with 0

# # drop fake rows from both dataframes (last row) & drop category '9999' standing for missing category
# df_multi_hot.drop(index=df.index[-1], axis=0, inplace=True)
# df_multi_hot = df_multi_hot.iloc[:,:-1]
# df.drop(index=df.index[-1], axis=0, inplace=True)

# # join new binarized columns with rest of dataframe
# df = df.join(df_multi_hot, how='inner')

# if (len(df[df.isnull().any(axis=1)]) > 0):
#     raise RuntimeError('Join of multi-hot-encoded categories probably created missing values.')

# # drop list of categories, since it's not needed anymore
# df.drop('categories', axis=1, inplace=True)

# # pop and append 'week' at end of dataframe
# col = df.pop(label)
# df.insert(len(df.columns), col.name, col)

# del df_multi_hot
# gc.collect()

# #df

# Remove the last row of `df` in place.
# NOTE(review): presumably this is the artificial row appended to df_train at load time
# (`df` is bound to df_train there); the corresponding row of df_test is not dropped here.
df.drop(index=df.index[-1], axis=0, inplace=True)

In [88]:
# save column names
column_headers = list(df.columns)

# split DF in X & y (sep_X_y takes the last column as y)
# NOTE(review): the pop/insert that moves the label to the end is commented out in this
# section, so this relies on the label already being the last loaded column -- confirm.
X_train, y_train = sep_X_y(df)
#X_train

## Training & Prediction

### Linear Regression

In [89]:
# Fit the linear-regression helper on the training split; the alternative
# trainer on the next line is kept disabled.
model = train_linReg(X_train, y_train)
#model = train_dtc(X_train, y_train)

# Predictions are made for the same rows the model was fitted on (in-sample).
# NOTE(review): the score printed by this call is labelled "XGboost Regressor"
# although train_linReg is used — presumably a hard-coded label inside
# predict_values; verify against the helper.
y_pred = predict_values(model, X_train, y_train)

Fitting model...
Done!
Predicting values...
Done!

					[1m XGboost Regressor MSE:  6.203
					[1m XGboost Regressor RMSE:  2.491


### Evaluation

In [90]:
# Narrow integer dtypes per feature column and for the label.
# NOTE(review): neither mapping is applied in this cell — confirm whether
# other cells depend on them.
dtype_X = {
    'userID': np.uint16,
    'itemID': np.uint16,
    'order': np.uint8,
    'brand': np.int16,
    'feature_1': np.int8,
    'feature_2': np.uint8,
    'feature_3': np.int16,
    'feature_4': np.int8,
    'feature_5': np.int16,
    'TotalItemOrders(user)': np.uint16,
    'date(year)': np.uint16,
    'date(month)': np.uint8,
    #'date(weekOfMonth)': np.uint8,
    'date(dayOfMonth)': np.uint8,
    'date(weekOfYear)': np.uint8,
    # Day of year runs up to 366, which does not fit into uint8 (max 255).
    'date(dayOfYear)': np.uint16,
}
dtype_y = {'nextBuyInWeeks(floor)': np.uint8}

# Round to whole weeks, then clip into the uint8 range before casting.
# A regressor can emit negative (or very large) values, and casting those
# straight to uint8 wraps around (e.g. -1 becomes 255 weeks).
y_pred = (
    pd.DataFrame(y_pred, index=y_train.index)
    .round()
    .clip(lower=0, upper=np.iinfo(np.uint8).max)
    .astype(np.uint8)
)

# Plain column assignment works on every pandas version;
# set_axis(..., inplace=True) was removed in pandas 2.0.
y_pred.columns = ['nextBuyIn_pred']

# Put features, true label and predicted label side by side (aligned on the index).
df_eval = pd.concat([X_train, y_train, y_pred], axis=1)

# Count the rows whose rounded prediction equals the true label.
rowcount = len(df_eval)
should = rowcount
is_ = int((df_eval['nextBuyInWeeks(floor)'] == df_eval['nextBuyIn_pred']).sum())

print(f'\033[1mrow count of set:\t\t\t\t {rowcount}')
print(f'\033[1mrows where label was predicted correctly:\t {is_} \t ({is_/should*100:.3f} % of rows)')

df_eval

[1mrow count of set:				 175112
[1mrows where label was predicted correctly:	 70545 	 (40.286 % of rows)


Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(dayOfMonth),date(weekOfYear),date(dayOfYear),nextBuyInWeeks(floor),nextBuyIn_pred
1,76,23050,1,1411,4,0,22,0,151,0.007899,0.466804,0.826492,0.008540,0.640224,0.018705,0.940897,0.259374,2,169.000000,2020,6,1,23,153,24,24
2,116,9408,1,322,4,0,536,0,144,0.012288,0.466804,0.826492,0.034208,0.640224,0.086085,0.988336,0.040000,3,114.500000,2020,6,1,23,153,22,16
3,116,25677,1,322,4,0,536,0,144,0.012288,0.466804,0.826492,0.034208,0.640224,0.086085,0.988336,0.207143,3,114.500000,2020,6,1,23,153,22,16
4,135,13660,1,157,4,0,513,0,137,0.010361,0.466804,0.826492,0.004528,0.640224,0.005142,0.933540,0.055556,2,32.000000,2020,6,1,23,153,4,5
5,135,22174,1,504,10,0,441,3,84,0.005653,0.369146,0.826492,0.005184,0.334600,0.050564,0.757337,0.454545,3,40.500000,2020,6,1,23,153,4,6
6,202,26940,1,1258,4,0,487,3,44,0.001213,0.466804,0.826492,0.024224,0.334600,0.059138,0.816166,0.123867,3,77.000000,2020,6,1,23,153,11,11
7,240,7318,1,1335,6,0,421,3,6,0.000549,0.152436,0.826492,0.021874,0.334600,0.002002,0.633827,0.279570,2,71.000000,2020,6,1,23,153,10,10
8,240,26645,1,648,10,0,358,3,24,0.001713,0.369146,0.826492,0.009509,0.334600,0.004315,0.735008,0.106439,2,71.000000,2020,6,1,23,153,10,10
9,244,10341,1,1025,6,0,198,0,17,0.002099,0.152436,0.826492,0.000869,0.640224,0.078879,0.810581,0.228986,3,109.000000,2020,6,1,23,153,15,16
10,276,15667,1,1201,4,0,30,0,163,0.012826,0.466804,0.826492,0.001585,0.640224,0.017563,0.939354,0.138325,8,51.750000,2020,6,1,23,153,8,8


## !! Prediction on the prediction set

In [91]:
# # multi-hot-encode categories
# cats = df_test["categories"]
# mlb = MultiLabelBinarizer(sparse_output=False) # Set to True if output binary array is desired in CSR sparse format
# df_multi_hot = pd.DataFrame(mlb.fit_transform(cats), columns=mlb.classes_, index=df_test.index, dtype=np.int8).astype(pd.SparseDtype(np.uint8,0)) # NaN filled with 0

# # drop fake rows from both dataframes (last row) & drop category '9999' standing for missing category
# df_multi_hot.drop(index=df_test.index[-1], axis=0, inplace=True)
# df_multi_hot = df_multi_hot.iloc[:,:-1]
# df_test.drop(index=df_test.index[-1], axis=0, inplace=True)

# # join new binarized columns with rest of dataframe
# df_test = df_test.join(df_multi_hot, how='inner')

# if (len(df_test[df_test.isnull().any(axis=1)]) > 0):
#     raise RuntimeError('Join of multi-hot-encoded categories probably created missing values.')

# # drop list of categories, since it's not needed anymore
# df_test.drop('categories', axis=1, inplace=True)

# # pop and append 'week' at end of dataframe
# col = df_test.pop(label)
# df_test.insert(len(df_test.columns), col.name, col)

# del df_multi_hot
# gc.collect()

# #df_test

# Remove the final row in place; the disabled code above calls it a "fake" row.
# Not idempotent: every re-run of this cell removes one more row.
df_test.drop(df_test.index[-1], inplace=True)

In [92]:
# Split the prediction frame into features and the label column (last column).
X_test, y_test = sep_X_y(df_test)
# Trigger a garbage-collection pass before predicting on the larger frame.
gc.collect()
# Predict with the model fitted earlier; this overwrites the previous y_pred.
y_pred = model.predict(X_test)

In [107]:
# Narrow integer dtypes per feature column and for the label.
# NOTE(review): neither mapping is applied in this cell — confirm whether
# other cells depend on them.
dtype_X = {
    'userID': np.uint16,
    'itemID': np.uint16,
    'order': np.uint8,
    'brand': np.int16,
    'feature_1': np.int8,
    'feature_2': np.uint8,
    'feature_3': np.int16,
    'feature_4': np.int8,
    'feature_5': np.int16,
    'TotalItemOrders(user)': np.uint16,
    'date(year)': np.uint16,
    'date(month)': np.uint8,
    #'date(weekOfMonth)': np.uint8,
    'date(dayOfMonth)': np.uint8,
    'date(weekOfYear)': np.uint8,
    # Day of year runs up to 366, which does not fit into uint8 (max 255).
    'date(dayOfYear)': np.uint16,
}
dtype_y = {'nextBuyInWeeks(floor)': np.uint8}

# Round to whole weeks, then clip into the uint8 range before casting.
# A regressor can emit negative (or very large) values, and casting those
# straight to uint8 wraps around (e.g. -1 becomes 255 weeks).
y_pred = (
    pd.DataFrame(y_pred, index=y_test.index)
    .round()
    .clip(lower=0, upper=np.iinfo(np.uint8).max)
    .astype(np.uint8)
)

# Plain column assignment works on every pandas version;
# set_axis(..., inplace=True) was removed in pandas 2.0.
y_pred.columns = ['nextBuyIn_pred']

# Put features, label column and predicted label side by side (aligned on the index).
df_eval = pd.concat([X_test, y_test, y_pred], axis=1)

# Count the rows whose rounded prediction equals the label column.
# NOTE(review): the message below says "predicted 0", which only matches this
# comparison if the label column is a constant 0 placeholder here — confirm.
rowcount = len(df_eval)
should = rowcount
is_ = int((df_eval['nextBuyInWeeks(floor)'] == df_eval['nextBuyIn_pred']).sum())

print(f'\033[1mrow count of set:\t\t\t\t {rowcount}')
print(f'\033[1mrows where label was predicted 0:\t {is_} \t ({is_/should*100:.3f} % of rows)')

df_eval

[1mrow count of set:				 896426
[1mrows where label was predicted 0:	 408566 	 (45.577 % of rows)


Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(dayOfMonth),date(weekOfYear),date(dayOfYear),nextBuyInWeeks(floor),nextBuyIn_pred
1,4,18860,1,603,10,0,536,3,147,0.001617,0.369146,0.826492,0.034208,0.334600,0.004699,0.747174,0.029126,0,0.0,2020,6,1,23,153,0,1
2,4,30779,1,406,10,1,503,3,17,0.010606,0.369146,0.100607,0.062924,0.334600,0.078879,0.448239,0.078261,0,0.0,2020,6,1,23,153,0,1
3,20,18613,2,1111,4,3,444,3,11,0.011587,0.466804,0.056881,0.005516,0.334600,0.003090,0.410125,0.000000,0,0.0,2020,6,1,23,153,0,1
4,55,9547,1,671,10,0,506,0,17,0.001884,0.369146,0.826492,0.004327,0.640224,0.078879,0.917668,0.090395,0,0.0,2020,6,1,23,153,0,1
5,55,10844,1,1180,10,0,192,0,96,0.021251,0.369146,0.826492,0.002211,0.640224,0.002630,0.888944,0.047619,0,0.0,2020,6,1,23,153,0,1
6,55,17912,1,342,6,0,190,0,96,0.002499,0.152436,0.826492,0.000650,0.640224,0.002630,0.773546,0.113636,0,0.0,2020,6,1,23,153,0,1
7,55,24763,1,186,6,0,207,0,17,0.042279,0.152436,0.826492,0.004937,0.640224,0.078879,0.832124,0.087538,0,0.0,2020,6,1,23,153,0,1
8,76,2787,1,1324,10,0,421,3,3,0.008698,0.369146,0.826492,0.021874,0.334600,0.023124,0.753586,0.182783,0,0.0,2020,6,1,23,153,0,1
9,76,26645,1,648,10,0,358,3,24,0.001713,0.369146,0.826492,0.009509,0.334600,0.004315,0.735008,0.106439,0,0.0,2020,6,1,23,153,0,1
10,89,6287,1,1455,6,2,455,0,122,0.000002,0.152436,0.016020,0.002230,0.640224,0.028051,0.390886,0.000000,0,0.0,2020,6,1,23,153,0,1


In [108]:
# Copy the calendar parts into the column names pd.to_datetime expects
# ('year', 'month', 'day'), widened to uint16.
for part, source in (('day', 'date(dayOfMonth)'),
                     ('month', 'date(month)'),
                     ('year', 'date(year)')):
    df_eval[part] = df_eval[source].astype(np.uint16)

# Assemble one timestamp per row from the three helper columns.
df_eval['date'] = pd.to_datetime(df_eval[['year', 'month', 'day']])
df_eval

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(dayOfMonth),date(weekOfYear),date(dayOfYear),nextBuyInWeeks(floor),nextBuyIn_pred,day,month,year,date
1,4,18860,1,603,10,0,536,3,147,0.001617,0.369146,0.826492,0.034208,0.334600,0.004699,0.747174,0.029126,0,0.0,2020,6,1,23,153,0,1,1,6,2020,2020-06-01
2,4,30779,1,406,10,1,503,3,17,0.010606,0.369146,0.100607,0.062924,0.334600,0.078879,0.448239,0.078261,0,0.0,2020,6,1,23,153,0,1,1,6,2020,2020-06-01
3,20,18613,2,1111,4,3,444,3,11,0.011587,0.466804,0.056881,0.005516,0.334600,0.003090,0.410125,0.000000,0,0.0,2020,6,1,23,153,0,1,1,6,2020,2020-06-01
4,55,9547,1,671,10,0,506,0,17,0.001884,0.369146,0.826492,0.004327,0.640224,0.078879,0.917668,0.090395,0,0.0,2020,6,1,23,153,0,1,1,6,2020,2020-06-01
5,55,10844,1,1180,10,0,192,0,96,0.021251,0.369146,0.826492,0.002211,0.640224,0.002630,0.888944,0.047619,0,0.0,2020,6,1,23,153,0,1,1,6,2020,2020-06-01
6,55,17912,1,342,6,0,190,0,96,0.002499,0.152436,0.826492,0.000650,0.640224,0.002630,0.773546,0.113636,0,0.0,2020,6,1,23,153,0,1,1,6,2020,2020-06-01
7,55,24763,1,186,6,0,207,0,17,0.042279,0.152436,0.826492,0.004937,0.640224,0.078879,0.832124,0.087538,0,0.0,2020,6,1,23,153,0,1,1,6,2020,2020-06-01
8,76,2787,1,1324,10,0,421,3,3,0.008698,0.369146,0.826492,0.021874,0.334600,0.023124,0.753586,0.182783,0,0.0,2020,6,1,23,153,0,1,1,6,2020,2020-06-01
9,76,26645,1,648,10,0,358,3,24,0.001713,0.369146,0.826492,0.009509,0.334600,0.004315,0.735008,0.106439,0,0.0,2020,6,1,23,153,0,1,1,6,2020,2020-06-01
10,89,6287,1,1455,6,2,455,0,122,0.000002,0.152436,0.016020,0.002230,0.640224,0.028051,0.390886,0.000000,0,0.0,2020,6,1,23,153,0,1,1,6,2020,2020-06-01


### Calculate the predicted weekOfYear of the next purchase, then convert it to the week of February (1-4)

In [109]:
#df_eval['weekOfYear_pred'] = (df_eval['date(weekOfYear)'] + df_eval['nextBuyIn_pred']) % 53
#df_eval['weekOfYear_pred'] = (df_eval['date(weekOfYear)'] + df_eval['nextBuyIn_pred']) % 53 if (df_eval['date(weekOfYear)'] + df_eval['nextBuyIn_pred']) < 53 else ((df_eval['date(weekOfYear)'] + df_eval['nextBuyIn_pred']) % 53) + 1

# Date calculation: add the predicted number of weeks to the order date and
# read the ISO week of the resulting timestamp.
# Series.dt.weekofyear is deprecated (removed in pandas 2.0); isocalendar().week
# yields the same ISO week number. It comes back as UInt32, so it is cast to
# int64 to keep the dtype the old accessor produced.
predicted_purchase_date = df_eval['date'] + pd.to_timedelta(df_eval['nextBuyIn_pred'], unit='W')
df_eval['weekOfYear_pred'] = predicted_purchase_date.dt.isocalendar().week.astype(np.int64)

# The helper date columns are no longer needed once the week is derived.
df_eval.drop(['date', 'year', 'month', 'day'], axis=1, inplace=True)
df_eval

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(dayOfMonth),date(weekOfYear),date(dayOfYear),nextBuyInWeeks(floor),nextBuyIn_pred,weekOfYear_pred
1,4,18860,1,603,10,0,536,3,147,0.001617,0.369146,0.826492,0.034208,0.334600,0.004699,0.747174,0.029126,0,0.0,2020,6,1,23,153,0,1,24
2,4,30779,1,406,10,1,503,3,17,0.010606,0.369146,0.100607,0.062924,0.334600,0.078879,0.448239,0.078261,0,0.0,2020,6,1,23,153,0,1,24
3,20,18613,2,1111,4,3,444,3,11,0.011587,0.466804,0.056881,0.005516,0.334600,0.003090,0.410125,0.000000,0,0.0,2020,6,1,23,153,0,1,24
4,55,9547,1,671,10,0,506,0,17,0.001884,0.369146,0.826492,0.004327,0.640224,0.078879,0.917668,0.090395,0,0.0,2020,6,1,23,153,0,1,24
5,55,10844,1,1180,10,0,192,0,96,0.021251,0.369146,0.826492,0.002211,0.640224,0.002630,0.888944,0.047619,0,0.0,2020,6,1,23,153,0,1,24
6,55,17912,1,342,6,0,190,0,96,0.002499,0.152436,0.826492,0.000650,0.640224,0.002630,0.773546,0.113636,0,0.0,2020,6,1,23,153,0,1,24
7,55,24763,1,186,6,0,207,0,17,0.042279,0.152436,0.826492,0.004937,0.640224,0.078879,0.832124,0.087538,0,0.0,2020,6,1,23,153,0,1,24
8,76,2787,1,1324,10,0,421,3,3,0.008698,0.369146,0.826492,0.021874,0.334600,0.023124,0.753586,0.182783,0,0.0,2020,6,1,23,153,0,1,24
9,76,26645,1,648,10,0,358,3,24,0.001713,0.369146,0.826492,0.009509,0.334600,0.004315,0.735008,0.106439,0,0.0,2020,6,1,23,153,0,1,24
10,89,6287,1,1455,6,2,455,0,122,0.000002,0.152436,0.016020,0.002230,0.640224,0.028051,0.390886,0.000000,0,0.0,2020,6,1,23,153,0,1,24


In [96]:
# Source column in df_eval -> column name in df_final (order is the output order).
final_column_names = {
    'userID': 'userID',
    'itemID': 'itemID',
    'date(year)': 'year',
    'date(month)': 'month',
    'date(dayOfMonth)': 'day',
    'date(weekOfYear)': 'weekOfYear',
    'nextBuyIn_pred': 'nextBuyIn_pred',
    'weekOfYear_pred': 'weekOfYear_pred',
}

# Select and rename the carried-over columns, then append the user's mean
# purchase gap both in rounded weeks and in its original unit (days).
df_final = (
    df_eval[list(final_column_names)]
    .rename(columns=final_column_names)
    .assign(
        meanDiffWeeks=df_eval['MeanDiffToNxt(user)'].apply(lambda days: round(days / 7)),
        meanDiffDays=df_eval['MeanDiffToNxt(user)'],
    )
)

In [97]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
subm_path = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\submission.csv'

# Read the '|'-separated submission file and attach the prediction columns
# for each (userID, itemID) pair; pairs without a match keep NaN.
df_submission = (
    pd.read_csv(subm_path, sep='|')
    .merge(df_final, how='left', on=['userID', 'itemID'])
)

In [98]:
# calculate week of February from predicted weekOfYear
def getFebWeek(weekOfYear):
    """Map a week-of-year number to a February week label.

    Weeks 5, 6, 7 and 8 map to 1, 2, 3 and 4 respectively; every other
    value (including NaN) maps to 0.
    """
    # Walk the four accepted weeks in order and return the 1-based position of the match.
    for feb_week, year_week in enumerate((5, 6, 7, 8), start=1):
        if weekOfYear == year_week:
            return feb_week
    return 0

In [99]:
# Convert every predicted week of the year into the February week label via getFebWeek.
df_submission['prediction'] = df_submission['weekOfYear_pred'].map(getFebWeek)
df_submission

Unnamed: 0,userID,itemID,prediction,year,month,day,weekOfYear,nextBuyIn_pred,weekOfYear_pred,meanDiffWeeks
0,0,20664,0,2020,12,11,50,12,9,14
1,0,28231,2,2021,1,25,4,2,6,5
2,13,2690,3,2020,12,24,52,8,7,10
3,15,1299,1,2021,1,14,2,3,5,6
4,15,20968,3,2021,1,25,4,3,7,5
5,20,8272,0,2020,10,27,44,8,52,8
6,24,11340,0,2020,12,27,52,10,9,11
7,34,21146,0,2020,11,13,46,7,53,8
8,34,31244,0,2021,1,13,2,8,10,11
9,46,31083,0,2021,1,6,1,9,10,12


In [100]:
# path = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220625_submission_01.csv'
# df_submission.to_csv(path, index=False, sep='|')

In [101]:
# How many rows received a non-zero prediction?
nonzero_mask = df_submission['prediction'] != 0
no_zeros = int(nonzero_mask.sum())
print(f'{no_zeros} rows where no 0 was predicted')

# Rows that repeat an earlier (userID, itemID) pair; the first occurrence is not counted.
duplicateRows = df_submission.loc[df_submission.duplicated(subset=['userID', 'itemID'])]
print(f'{len(duplicateRows)} duplicate rows')

2893 rows where no 0 was predicted
0 duplicate rows


---

## Create new Dataframe from predictions to predict again

In [110]:
# Work on an independent deep copy so df_submission itself stays untouched.
df_pred_purchases = df_submission.copy(deep=True)

In [111]:
# Build one timestamp per row from the year/month/day columns.
df_pred_purchases['date'] = pd.to_datetime(df_pred_purchases[['year', 'month', 'day']])
# Move the freshly added 'date' column to the front of the frame.
first_column = df_pred_purchases.pop('date')
df_pred_purchases.insert(0, 'date', first_column)
df_pred_purchases.head(3)

Unnamed: 0,date,userID,itemID,prediction,year,month,day,weekOfYear,nextBuyIn_pred,weekOfYear_pred,meanDiffWeeks
0,2020-12-11,0,20664,0,2020,12,11,50,12,9,14
1,2021-01-25,0,28231,2,2021,1,25,4,2,6,5
2,2020-12-24,13,2690,3,2020,12,24,52,8,7,10


In [112]:
# Drop the listed prediction and calendar columns in place.
obsolete_columns = [
    'prediction',
    'year',
    'month',
    'day',
    'weekOfYear',
    'nextBuyIn_pred',
    'weekOfYear_pred',
    'meanDiffWeeks',
]
df_pred_purchases.drop(columns=obsolete_columns, inplace=True)
df_pred_purchases.head(3)

Unnamed: 0,date,userID,itemID
0,2020-12-11,0,20664
1,2021-01-25,0,28231
2,2020-12-24,13,2690


In [113]:
# New frame with the evaluation columns minus the calendar parts, the label
# column and the derived predicted week; df_eval itself is not modified.
df_features = df_eval.drop(columns=[
    'date(year)',
    'date(month)',
    'date(dayOfMonth)',
    'date(weekOfYear)',
    'date(dayOfYear)',
    'nextBuyInWeeks(floor)',
    'weekOfYear_pred',
])
df_features.head(3)

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),nextBuyIn_pred
1,4,18860,1,603,10,0,536,3,147,0.001617,0.369146,0.826492,0.034208,0.3346,0.004699,0.747174,0.029126,0,0.0,1
2,4,30779,1,406,10,1,503,3,17,0.010606,0.369146,0.100607,0.062924,0.3346,0.078879,0.448239,0.078261,0,0.0,1
3,20,18613,2,1111,4,3,444,3,11,0.011587,0.466804,0.056881,0.005516,0.3346,0.00309,0.410125,0.0,0,0.0,1


In [114]:
# Left-join the feature columns onto the purchases by (userID, itemID).
df_pred_purchases = pd.merge(
    df_pred_purchases,
    df_features,
    how='left',
    on=['userID', 'itemID'],
)
df_pred_purchases.head(3)

Unnamed: 0,date,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),nextBuyIn_pred
0,2020-12-11,0,20664,1,408,4,0,284,0,66,0.010779,0.466804,0.826492,0.000383,0.640224,0.036075,0.946785,0.25,3,94.5,12
1,2021-01-25,0,28231,2,193,4,3,468,3,108,0.010567,0.466804,0.056881,0.005503,0.3346,0.019841,0.417777,0.054054,4,33.0,2
2,2020-12-24,13,2690,1,406,4,3,491,0,66,0.010606,0.466804,0.056881,0.037829,0.640224,0.036075,0.590236,0.333333,4,67.0,8


In [115]:
# Move every purchase date forward by the predicted number of weeks, giving
# the date of the predicted next purchase.
df_pred_purchases['date'] = df_pred_purchases['date'] + pd.to_timedelta(df_pred_purchases['nextBuyIn_pred'], unit='W')

# Re-derive the calendar feature columns from the shifted date.
df_pred_purchases['date(year)'] = df_pred_purchases['date'].dt.year
df_pred_purchases['date(month)'] = df_pred_purchases['date'].dt.month
df_pred_purchases['date(dayOfMonth)'] = df_pred_purchases['date'].dt.day
# Series.dt.weekofyear is deprecated (removed in pandas 2.0); isocalendar().week
# yields the same ISO week number. It comes back as UInt32, so it is cast to
# int64 to keep the dtype the old accessor produced.
df_pred_purchases['date(weekOfYear)'] = df_pred_purchases['date'].dt.isocalendar().week.astype(np.int64)
df_pred_purchases['date(dayOfYear)'] = df_pred_purchases['date'].dt.dayofyear

# Constant 0 in the label column for these predicted purchases.
df_pred_purchases['nextBuyInWeeks(floor)'] = 0

# The former `df_pred_purchases.sort_values(by=['date'])` call was removed:
# sort_values returns a new frame and the result was discarded, so it never
# changed the row order. The frame deliberately stays in its existing order.

# The helper columns are no longer needed once the calendar features exist.
df_pred_purchases.drop(['nextBuyIn_pred', 'date'], axis=1, inplace=True)

df_pred_purchases.head(3)

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(dayOfMonth),date(weekOfYear),date(dayOfYear),nextBuyInWeeks(floor)
0,0,20664,1,408,4,0,284,0,66,0.010779,0.466804,0.826492,0.000383,0.640224,0.036075,0.946785,0.25,3,94.5,2021,3,5,9,64,0
1,0,28231,2,193,4,3,468,3,108,0.010567,0.466804,0.056881,0.005503,0.3346,0.019841,0.417777,0.054054,4,33.0,2021,2,8,6,39,0
2,13,2690,1,406,4,3,491,0,66,0.010606,0.466804,0.056881,0.037829,0.640224,0.036075,0.590236,0.333333,4,67.0,2021,2,18,7,49,0


In [116]:
# Print the column list, non-null counts and dtypes of the rebuilt frame.
df_pred_purchases.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   userID                 10000 non-null  int64  
 1   itemID                 10000 non-null  int64  
 2   order                  10000 non-null  int64  
 3   brand                  10000 non-null  int64  
 4   feature_1              10000 non-null  int64  
 5   feature_2              10000 non-null  int64  
 6   feature_3              10000 non-null  int64  
 7   feature_4              10000 non-null  int64  
 8   feature_5              10000 non-null  int64  
 9   brandOrderRatio        10000 non-null  float64
 10  feature1OrderRatio     10000 non-null  float64
 11  feature2OrderRatio     10000 non-null  float64
 12  feature3OrderRatio     10000 non-null  float64
 13  feature4OrderRatio     10000 non-null  float64
 14  feature5OrderRatio     10000 non-null  float64
 15  Tot

#### Scores, orders, means etc. do not change for the predicted purchases. Possibly irrelevant...?