In [3]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import gc
import joblib

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:75% !important; }</style>"))

pd.set_option('display.max_rows', 250)
pd.set_option('display.min_rows', 25)

In [2]:
userItemCombinations = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\submission.csv'

columns = [
    'userID', 
    'itemID'
          ]

dtype = {
    'userID':np.uint16,
    'itemID':np.uint16
        }

In [4]:
df = pd.read_csv(userItemCombinations, usecols=columns, sep='|', dtype=dtype)

# create datestrings for every day in order period
dates = [date.strftime('%Y-%m-%d') for date in pd.date_range(start='2020-06-01', end='2021-02-28')]
len(dates)

273

In [5]:
# collect one dataframe for every day with all user/item combinations
one_df_per_date_list = []
for date in dates:        
    df_date = df.copy()
    df_date['date'] = date
    
    col = df_date.pop("date")
    df_date.insert(0, col.name, col)
    one_df_per_date_list.append(df_date)
    
one_df_per_date_list[0]

Unnamed: 0,date,userID,itemID
0,2020-06-01,0,20664
1,2020-06-01,0,28231
2,2020-06-01,13,2690
3,2020-06-01,15,1299
4,2020-06-01,15,20968
5,2020-06-01,20,8272
6,2020-06-01,24,11340
7,2020-06-01,34,21146
8,2020-06-01,34,31244
9,2020-06-01,46,31083


In [9]:
df_complete_dates = pd.concat(one_df_per_date_list, axis=0, join='outer')
len(df_complete_dates)

2730000

In [10]:
# gelabelte orders mit riesenliste vereinen
orderList = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220619_02_orders_wo_notOrderedItems.csv'

columns = [
    'date',
    'userID', 
    'itemID'
          ]

dtype = {
    'userID':np.uint32,
    'itemID':np.uint32
        }

df_orders = pd.read_csv(orderList, usecols=columns, sep='|', dtype=dtype, converters={'date': pd.to_datetime})

In [11]:
#06 Jun
from datetime import datetime, timedelta, date
df_orders.loc[(df_orders['date'] >= datetime(2020,6,1)) & (df_orders['date'] <= datetime(2020,6,7)), ['week']] = 1
df_orders.loc[(df_orders['date'] >= datetime(2020,6,8)) & (df_orders['date'] <= datetime(2020,6,14)), ['week']] = 2
df_orders.loc[(df_orders['date'] >= datetime(2020,6,15)) & (df_orders['date'] <= datetime(2020,6,21)), ['week']] = 3
df_orders.loc[(df_orders['date'] >= datetime(2020,6,22)) & (df_orders['date'] <= datetime(2020,6,30)), ['week']] = 4

#07 Jul
df_orders.loc[(df_orders['date'] >= datetime(2020,7,1)) & (df_orders['date'] <= datetime(2020,7,7)), ['week']] = 1
df_orders.loc[(df_orders['date'] >= datetime(2020,7,8)) & (df_orders['date'] <= datetime(2020,7,14)), ['week']] = 2
df_orders.loc[(df_orders['date'] >= datetime(2020,7,15)) & (df_orders['date'] <= datetime(2020,7,21)), ['week']] = 3
df_orders.loc[(df_orders['date'] >= datetime(2020,7,22)) & (df_orders['date'] <= datetime(2020,7,31)), ['week']] = 4

#08 Aug
df_orders.loc[(df_orders['date'] >= datetime(2020,8,1)) & (df_orders['date'] <= datetime(2020,8,7)), ['week']] = 1
df_orders.loc[(df_orders['date'] >= datetime(2020,8,8)) & (df_orders['date'] <= datetime(2020,8,14)), ['week']] = 2
df_orders.loc[(df_orders['date'] >= datetime(2020,8,15)) & (df_orders['date'] <= datetime(2020,8,21)), ['week']] = 3
df_orders.loc[(df_orders['date'] >= datetime(2020,8,22)) & (df_orders['date'] <= datetime(2020,8,31)), ['week']] = 4

#09 Sep
df_orders.loc[(df_orders['date'] >= datetime(2020,9,1)) & (df_orders['date'] <= datetime(2020,9,7)), ['week']] = 1
df_orders.loc[(df_orders['date'] >= datetime(2020,9,8)) & (df_orders['date'] <= datetime(2020,9,14)), ['week']] = 2
df_orders.loc[(df_orders['date'] >= datetime(2020,9,15)) & (df_orders['date'] <= datetime(2020,9,21)), ['week']] = 3
df_orders.loc[(df_orders['date'] >= datetime(2020,9,22)) & (df_orders['date'] <= datetime(2020,9,30)), ['week']] = 4

#10 Oct
df_orders.loc[(df_orders['date'] >= datetime(2020,10,1)) & (df_orders['date'] <= datetime(2020,10,7)), ['week']] = 1
df_orders.loc[(df_orders['date'] >= datetime(2020,10,8)) & (df_orders['date'] <= datetime(2020,10,14)), ['week']] = 2
df_orders.loc[(df_orders['date'] >= datetime(2020,10,15)) & (df_orders['date'] <= datetime(2020,10,21)), ['week']] = 3
df_orders.loc[(df_orders['date'] >= datetime(2020,10,22)) & (df_orders['date'] <= datetime(2020,10,31)), ['week']] = 4

#11 Nov
df_orders.loc[(df_orders['date'] >= datetime(2020,11,1)) & (df_orders['date'] <= datetime(2020,11,7)), ['week']] = 1
df_orders.loc[(df_orders['date'] >= datetime(2020,11,8)) & (df_orders['date'] <= datetime(2020,11,14)), ['week']] = 2
df_orders.loc[(df_orders['date'] >= datetime(2020,11,15)) & (df_orders['date'] <= datetime(2020,11,21)), ['week']] = 3
df_orders.loc[(df_orders['date'] >= datetime(2020,11,22)) & (df_orders['date'] <= datetime(2020,11,30)), ['week']] = 4

#12 Dec
df_orders.loc[(df_orders['date'] >= datetime(2020,12,1)) & (df_orders['date'] <= datetime(2020,12,7)), ['week']] = 1
df_orders.loc[(df_orders['date'] >= datetime(2020,12,8)) & (df_orders['date'] <= datetime(2020,12,14)), ['week']] = 2
df_orders.loc[(df_orders['date'] >= datetime(2020,12,15)) & (df_orders['date'] <= datetime(2020,12,21)), ['week']] = 3
df_orders.loc[(df_orders['date'] >= datetime(2020,12,22)) & (df_orders['date'] <= datetime(2020,12,31)), ['week']] = 4

#01 Jan
df_orders.loc[(df_orders['date'] >= datetime(2021,1,1)) & (df_orders['date'] <= datetime(2021,1,7)), ['week']] = 1
df_orders.loc[(df_orders['date'] >= datetime(2021,1,8)) & (df_orders['date'] <= datetime(2021,1,14)), ['week']] = 2
df_orders.loc[(df_orders['date'] >= datetime(2021,1,15)) & (df_orders['date'] <= datetime(2021,1,21)), ['week']] = 3
df_orders.loc[(df_orders['date'] >= datetime(2021,1,22)) & (df_orders['date'] <= datetime(2021,1,31)), ['week']] = 4

In [12]:
df_orders['week'] = df_orders['week'].astype(np.uint8)
df_orders['date'] = df_orders['date'].apply(lambda x: x.strftime('%Y-%m-%d'))

In [13]:
df_orders

Unnamed: 0,date,userID,itemID,week
0,2020-06-01,38769,3477,1
1,2020-06-01,42535,30474,1
2,2020-06-01,42535,15833,1
3,2020-06-01,42535,20131,1
4,2020-06-01,42535,4325,1
5,2020-06-01,42535,12919,1
6,2020-06-01,29737,9139,1
7,2020-06-01,29737,5237,1
8,2020-06-01,29737,11535,1
9,2020-06-01,43683,18733,1


In [14]:
df_complete_dates = df_complete_dates.merge(df_orders, how='left', on=['date', 'userID', 'itemID'])

In [15]:
df_complete_dates['week'] = df_complete_dates['week'].fillna(0)
df_complete_dates['week'] = df_complete_dates['week'].astype(np.uint8)

In [17]:
df_complete_dates[:2000]

Unnamed: 0,date,userID,itemID,week
0,2020-06-01,0,20664,0
1,2020-06-01,0,28231,0
2,2020-06-01,13,2690,0
3,2020-06-01,15,1299,0
4,2020-06-01,15,20968,0
5,2020-06-01,20,8272,0
6,2020-06-01,24,11340,0
7,2020-06-01,34,21146,0
8,2020-06-01,34,31244,0
9,2020-06-01,46,31083,0


In [18]:
df_complete_dates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2730033 entries, 0 to 2730032
Data columns (total 4 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   date    object
 1   userID  uint16
 2   itemID  uint16
 3   week    uint8 
dtypes: object(1), uint16(2), uint8(1)
memory usage: 54.7+ MB


In [19]:
path = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220625_01_every_day_all-possible-submiss-purchases_labeled.csv'
df_complete_dates.to_csv(path, sep='|', header=True, index=False, quotechar='"', decimal='.')