In [1]:
import warnings
warnings.filterwarnings("ignore")

import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Read csv
df_data = pd.read_csv('E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\complete_dataset_labeled_wLastPurchaseDates_wWeek0.csv', sep='|')
#df_data.head()

In [None]:
#df_data.info()

In [None]:
# Convert categories to data type category
#df_data["categories"] = df_data["categories"].astype("category")

# Preprocessing

In [None]:
# Drop columns
#df_data.drop('Unnamed: 0', axis=1, inplace=True)
#df_data.drop('purchaseDates', axis=1, inplace=True)
#df_data.drop('date', axis=1, inplace=True)
#df_data.drop('order', axis=1, inplace=True)

#df_data.head()

### Split datetime into separate columns

In [None]:
# Convert 'lastPurchaseDate' to datetime
df_data['lastPurchaseDate'] = pd.to_datetime(df_data['lastPurchaseDate'], format="%Y-%m-%d")

# Split 'date' into seperate columns
df_data['day'] = df_data['lastPurchaseDate'].dt.day
df_data['month'] = df_data['lastPurchaseDate'].dt.month
df_data['year'] = df_data['lastPurchaseDate'].dt.year

df_data.drop('lastPurchaseDate', axis=1, inplace=True)

In [None]:
"""
# Move new columns to the front
col = df_data.pop("year")
df_data.insert(1, col.name, col)

col = df_data.pop("month")
df_data.insert(2, col.name, col)

col = df_data.pop("day")
df_data.insert(3, col.name, col)
"""

In [None]:
# Show table
#df_data.head()

### Multi-Hot-Encoding for categories

In [3]:
# If you run out of memory while encoding the whole dataframe at once you can 
# split the dataframe into chunks beforehand 

#df_data_chunk1 = df_data.iloc[:50000,:]
df_data_chunk1 = df_data.sample(250000)
#df_chunk2 = df.iloc[500000:,:]

# Replace symbols in 'categories'
df_data_chunk1['categories'] = df_data_chunk1['categories'].str.replace("[", "")
df_data_chunk1['categories'] = df_data_chunk1['categories'].str.replace("]", "")

In [None]:
df_data_chunk1.head(20)

In [5]:
# WARNING: This Cell takes a couple of minutes to compute; ~50GB RAM needed to compute without chunks

# Convert categorical variable into dummy/indicator variables
df_multi_hot = df_data_chunk1["categories"].str.get_dummies(',')
df_multi_hot

Unnamed: 0,1,1000,1001,1002,1003,1004,1006,1007,1008,1009,...,956,964,968,970,975,981,990,994,995,996
274288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
359470,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
340254,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1028481,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
908691,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
381492,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
286771,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50230,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Combine df_data and df_multi_hot
#pd.merge(df_data_chunk1, df_multi_hot, how='inner', left_index=True, right_index=True)
df_combined = df_data_chunk1.join(df_multi_hot, how='inner')

# delete df_multi_hot to free up RAM
#del df_multi_hot

In [7]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250000 entries, 274288 to 760925
Columns: 4285 entries, userID to 996
dtypes: int64(4283), object(2)
memory usage: 8.0+ GB


In [None]:
df_combined.head()

In [8]:
#df_combined_chunk1 = df_combined.iloc[:500000,:]
col = df_combined.pop("week")
df_combined.insert(3786 , col.name, col)

#df_combined_chunk2 = df_combined.iloc[500000:,:]

In [None]:
df_combined.head()

In [None]:
# Check if we have any missing values
df_combined[df_combined.isnull().any(axis=1)]

### Datatypes for XGBoost

XGBoost natively supports continuous data but not categorical data. In order to use categorical data with XGBoost, we have to use One-Hot-Encoding which converts a column of categorical values into muliple columns of binary values.

# Modeling

In [None]:
df_combined.head()

In [9]:
df_combined.drop('categories', axis=1, inplace=True)
df_combined.drop('lastPurchaseDate', axis=1, inplace=True)
df_combined.drop('Unique count(date)', axis=1, inplace=True)
#df_combined.drop('year', axis=1, inplace=True)
#df_combined.drop('month', axis=1, inplace=True)
#df_combined.drop('day', axis=1, inplace=True)


In [None]:
df_combined.head()

In [10]:
# Comma is being used to extract a specific column from a 2D array.
# X = data.iloc[:,:-1]
# X = all rows, all columns except the last one 

X = df_combined.iloc[:,0:-1]
X

Unnamed: 0,userID,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,1,1000,...,955,956,964,968,970,975,981,990,994,995
274288,5577,2446,539,10,0,4,3,84,0,0,...,0,0,0,0,0,0,0,0,0,0
359470,12830,15667,1201,4,0,30,0,163,0,0,...,0,0,0,0,0,0,0,0,0,0
340254,36024,31849,569,6,0,220,0,17,0,0,...,0,0,0,0,0,0,0,0,0,0
1028481,16849,1322,827,10,0,377,0,39,0,0,...,0,0,0,0,0,0,0,0,0,0
908691,15015,29345,393,4,0,275,3,151,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991544,12357,2787,1324,10,0,421,3,3,0,0,...,0,0,0,0,0,0,0,0,0,0
381492,20462,2323,6,10,3,321,0,87,0,0,...,0,0,0,0,0,0,0,0,0,0
286771,5015,7728,186,10,0,27,3,38,0,0,...,0,0,0,0,0,0,0,0,0,0
50230,13737,19166,748,6,0,308,0,122,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
y = df_combined.iloc[:,-1]
y

274288     0
359470     0
340254     0
1028481    0
908691     0
          ..
991544     0
381492     0
286771     0
50230      0
760925     0
Name: 996, Length: 250000, dtype: int64

In [12]:
# Split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
X_train

In [None]:
X_test

In [13]:
data_dmatrix = xgb.DMatrix(data=X, label=y)

In [None]:
model1 = XGBClassifier()
model1.fit(X_train, y_train)



In [None]:
model1.score(X_train,y_train), model1.score(X_test,y_test)

In [None]:
#prediction for XGBoost
y_pred2 = model1.predict(X_test)#Accuracy
from sklearn import metrics
print('Accuracy Score:', metrics.accuracy_score(y_test,y_pred2))

In [None]:
model2_250k = XGBClassifier()
model2_250k.fit(X_train, y_train)

In [None]:
model2_250k.score(X_train,y_train), model2_250k.score(X_test,y_test)

In [None]:
#prediction for XGBoost
y_pred2 = model1.predict(X_test)#Accuracy
from sklearn import metrics
print('Accuracy Score:', metrics.accuracy_score(y_test,y_pred2))

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier = classifier.fit(X_train,y_train)

In [None]:
#prediction for decision tree
y_pred1 = classifier.predict(X_test)#Accuracy
from sklearn import metrics
print('Accuracy Score:', metrics.accuracy_score(y_test,y_pred1))

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred1)
cm

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred2)
cm

In [None]:
# make predictions for test data
y_pred = model1.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
model = XGBClassifier(learning_rate=0.1)
model.fit(X_train, y_train)

In [None]:
model.score(X_train,y_train), model.score(X_test,y_test)

In [None]:
#model = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
#model.fit(X_train, y_train)

In [None]:
#xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
#                max_depth = 5, alpha = 10, n_estimators = 10)

In [None]:
#xg_reg.fit(X_train,y_train)

#preds = xg_reg.predict(X_test)

In [None]:
# Je größer der RMSE ist, desto schlechter ist die Anpassung des Modells.
#rmse = np.sqrt(mean_squared_error(y_test, preds))
#print("RMSE: %f" % (rmse))

In [None]:
#!conda list