In [1]:
import warnings
warnings.filterwarnings("ignore")

import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import gc

In [2]:
# Read csv
df_data = pd.read_csv('E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\complete_dataset_labeled_wLastPurchaseDates.csv', sep='|')
#df_data.head()

In [None]:
df_data.info()

In [None]:
# Convert categories to data type category
#df_data["categories"] = df_data["categories"].astype("category")

# Preprocessing

In [3]:
# Drop columns
df_data.drop('Unnamed: 0', axis=1, inplace=True)
df_data.drop('purchaseDates', axis=1, inplace=True)
df_data.drop('date', axis=1, inplace=True)
df_data.drop('order', axis=1, inplace=True)

df_data.head()

Unnamed: 0,userID,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,week,lastPurchaseDate
0,38769,3477,186,6,0,196,0,45,"[74, 4109, 3867, 803, 4053]",1,2222-03-03
1,42535,30474,193,10,3,229,3,132,"[3459, 3738, 679, 1628, 4072]",1,2222-03-03
2,42535,15833,1318,4,1,455,0,108,"[2973, 2907, 2749, 3357]",1,2222-03-03
3,42535,20131,347,4,0,291,3,44,"[30, 1515, 1760, 2932, 1287, 2615, 3727, 2450,...",1,2222-03-03
4,42535,4325,539,6,0,303,0,45,"[3104, 1772, 2029, 1274, 3915, 888, 1118, 3882...",1,2222-03-03


### Split datetime into separate columns

In [4]:
# Convert 'lastPurchaseDate' to datetime
df_data['lastPurchaseDate'] = pd.to_datetime(df_data['lastPurchaseDate'], format="%Y-%m-%d")

# Split 'date' into seperate columns
df_data['day'] = df_data['lastPurchaseDate'].dt.day
df_data['month'] = df_data['lastPurchaseDate'].dt.month
df_data['year'] = df_data['lastPurchaseDate'].dt.year

df_data.drop('lastPurchaseDate', axis=1, inplace=True)

In [None]:
"""
# Move new columns to the front
col = df_data.pop("year")
df_data.insert(1, col.name, col)

col = df_data.pop("month")
df_data.insert(2, col.name, col)

col = df_data.pop("day")
df_data.insert(3, col.name, col)
"""

In [None]:
# Show table
#df_data.head()

### Multi-Hot-Encoding for categories

In [5]:
df_data['categories'] = df_data['categories'].str.replace("[", "")
df_data['categories'] = df_data['categories'].str.replace("]", "")

#df_data_chunk_1 = df_data.iloc[:500000,:]
#df_data_chunk_2 = df_data.iloc[500000:,:]

In [6]:
df_multi_hot = df_data["categories"].str.get_dummies(',')
df_combined = df_data.join(df_multi_hot, how='inner')

In [None]:
"""
# WARNING: This Cell takes a couple of minutes to compute; For computation without chunks, ~50GB RAM is needed 

# Multi-hot-encoding
df_multi_hot = df_data_chunk_1["categories"].str.get_dummies(',')

# Combine df_data and df_multi_hot
df_combined_1 = df_data_chunk_1.join(df_multi_hot, how='inner')

# Put week to end
#col = df_combined_1.pop("week")
#df_combined_1.insert(len(df_combined_1.columns), col.name, col)

# delete variables to free up RAM
del col
del df_multi_hot

# Multi-hot-encoding
df_multi_hot = df_data_chunk_2["categories"].str.get_dummies(',')

# Combine df_data and df_multi_hot
df_combined_2 = df_data_chunk_2.join(df_multi_hot, how='inner')

# Put week to end
#col = df_combined_2.pop("week")
#df_combined_2.insert(len(df_combined_2.columns), col.name, col)

# delete variables to free up RAM
del col
del df_multi_hot

# Combine df_combined_1 and df_combined_2
df_combined_final = pd.concat([df_combined_1, df_combined_2])
"""

In [12]:
del df_multi_hot

In [7]:
# Put week to end
col = df_combined.pop("week")
df_combined.insert(len(df_combined.columns), col.name, col)

In [9]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1069416 entries, 0 to 1069415
Columns: 4485 entries, userID to week
dtypes: int64(4484), object(1)
memory usage: 35.7+ GB


In [None]:
df_combined.head()

In [None]:
# Check if we have any missing values
df_combined[df_combined.isnull().any(axis=1)]

### Datatypes for XGBoost

XGBoost natively supports continuous data but not categorical data. In order to use categorical data with XGBoost, we have to use One-Hot-Encoding which converts a column of categorical values into muliple columns of binary values.

# Modeling

In [10]:
df_combined.head()

Unnamed: 0,userID,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,day,...,968,970,972,975,981,990,994,995,996,week
0,38769,3477,186,6,0,196,0,45,"74, 4109, 3867, 803, 4053",3,...,0,0,0,0,0,0,0,0,0,1
1,42535,30474,193,10,3,229,3,132,"3459, 3738, 679, 1628, 4072",3,...,0,0,0,0,0,0,0,0,0,1
2,42535,15833,1318,4,1,455,0,108,"2973, 2907, 2749, 3357",3,...,0,0,0,0,0,0,0,0,0,1
3,42535,20131,347,4,0,291,3,44,"30, 1515, 1760, 2932, 1287, 2615, 3727, 2450, ...",3,...,0,0,0,0,0,0,0,0,0,1
4,42535,4325,539,6,0,303,0,45,"3104, 1772, 2029, 1274, 3915, 888, 1118, 3882,...",3,...,0,0,0,0,0,0,0,0,0,1


In [11]:
df_combined.drop('categories', axis=1, inplace=True)

In [None]:
df_combined.head()

In [13]:
# Comma is being used to extract a specific column from a 2D array.
# X = data.iloc[:,:-1]
# X = all rows, all columns except the last one 

X = df_combined.iloc[:,0:-1]
X

Unnamed: 0,userID,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,day,month,...,964,968,970,972,975,981,990,994,995,996
0,38769,3477,186,6,0,196,0,45,3,3,...,0,0,0,0,0,0,0,0,0,0
1,42535,30474,193,10,3,229,3,132,3,3,...,0,0,0,0,0,0,0,0,0,0
2,42535,15833,1318,4,1,455,0,108,3,3,...,0,0,0,0,0,0,0,0,0,0
3,42535,20131,347,4,0,291,3,44,3,3,...,0,0,0,0,0,0,0,0,0,0
4,42535,4325,539,6,0,303,0,45,3,3,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1069411,22114,29567,1445,3,0,-1,-1,-1,3,3,...,0,0,0,0,0,0,0,0,0,0
1069412,22114,21068,193,4,0,453,3,108,3,3,...,0,0,0,0,0,0,0,0,0,0
1069413,10900,18270,1383,6,0,537,0,46,3,3,...,0,0,0,0,0,0,0,0,0,0
1069414,17894,31265,1137,4,0,398,0,144,3,3,...,0,0,0,0,0,0,0,0,0,0


In [14]:
y = df_combined.iloc[:,-1]
y

0          1
1          1
2          1
3          1
4          1
          ..
1069411    4
1069412    4
1069413    4
1069414    4
1069415    4
Name: week, Length: 1069416, dtype: int64

In [15]:
# Split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
data_dmatrix = xgb.DMatrix(data=X, label=y)

In [None]:
model1 = XGBClassifier()
model1.fit(X_train, y_train)

In [None]:
model1.score(X_train,y_train), model1.score(X_test,y_test)

In [None]:
model = XGBClassifier(learning_rate=0.1)
model.fit(X_train, y_train)

In [None]:
model.score(X_train,y_train), model.score(X_test,y_test)

In [None]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
#model = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
#model.fit(X_train, y_train)

In [None]:
#xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
#                max_depth = 5, alpha = 10, n_estimators = 10)

In [None]:
#xg_reg.fit(X_train,y_train)

#preds = xg_reg.predict(X_test)

In [None]:
# Je größer der RMSE ist, desto schlechter ist die Anpassung des Modells.
#rmse = np.sqrt(mean_squared_error(y_test, preds))
#print("RMSE: %f" % (rmse))

In [None]:
#!conda list

In [None]:
# WARNING: This Cell takes a couple of minutes to compute; For computation without chunks, ~50GB RAM is needed 

import gc

df_chunk = []
df_split = np.array_split(df_data, 7)

df_split_1 = df_split[0]
df_split_2 = df_split[1]
df_split_3 = df_split[2]
df_split_4 = df_split[3]
df_split_5 = df_split[4]
df_split_6 = df_split[5]
df_split_7 = df_split[6]

# Convert categorical variables into columns
# For-loop not possible without 50GB RAM

df_chunk.append(df_split_1["categories"].str.get_dummies(',')) 
del df_split_1
df_chunk.append(df_split_2["categories"].str.get_dummies(',')) 
del df_split_2
df_chunk.append(df_split_3["categories"].str.get_dummies(',')) 
del df_split_3
df_chunk.append(df_split_4["categories"].str.get_dummies(',')) 
del df_split_4
df_chunk.append(df_split_5["categories"].str.get_dummies(',')) 
del df_split_5
df_chunk.append(df_split_6["categories"].str.get_dummies(',')) 
del df_split_6
df_chunk.append(df_split_7["categories"].str.get_dummies(',')) 
del df_split_7

