NAMA : SALIM ADISETO
NIM  : A11.2022.14761
DATASET : https://www.kaggle.com/datasets/heeraldedhia/groceries-dataset

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
dataset = pd.read_csv('Groceries_dataset.csv')

# Convert 'Date' column to a datetime format and extract day, month, year
dataset['Date'] = pd.to_datetime(dataset['Date'], format='%d-%m-%Y')
dataset['Day'] = dataset['Date'].dt.day
dataset['Month'] = dataset['Date'].dt.month
dataset['Year'] = dataset['Date'].dt.year

# Drop the original 'Date' column after extracting useful features
dataset = dataset.drop(columns=['Date'])

In [3]:
X = dataset.drop(columns=['itemDescription']).values  # Exclude the target column from X
y = dataset['itemDescription'].values  # Target column

print(X)
print(y)

[[1808   21    7 2015]
 [2552    5    1 2015]
 [2300   19    9 2015]
 ...
 [1097   16    4 2014]
 [1510    3   12 2014]
 [1521   26   12 2014]]
['tropical fruit' 'whole milk' 'pip fruit' ... 'cake bar'
 'fruit/vegetable juice' 'cat food']


In [4]:
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(X[:, 1:3])  # We impute the 'Day' and 'Month' columns if needed
X[:, 1:3] = imputer.transform(X[:, 1:3])

print(X)

[[1808   21    7 2015]
 [2552    5    1 2015]
 [2300   19    9 2015]
 ...
 [1097   16    4 2014]
 [1510    3   12 2014]
 [1521   26   12 2014]]


In [5]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)

print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 155060 stored elements and shape (38765, 3901)>
  Coords	Values
  (0, 786)	1.0
  (0, 3898)	21.0
  (0, 3899)	7.0
  (0, 3900)	2015.0
  (1, 1504)	1.0
  (1, 3898)	5.0
  (1, 3899)	1.0
  (1, 3900)	2015.0
  (2, 1263)	1.0
  (2, 3898)	19.0
  (2, 3899)	9.0
  (2, 3900)	2015.0
  (3, 180)	1.0
  (3, 3898)	12.0
  (3, 3899)	12.0
  (3, 3900)	2015.0
  (4, 1981)	1.0
  (4, 3898)	1.0
  (4, 3899)	2.0
  (4, 3900)	2015.0
  (5, 3838)	1.0
  (5, 3898)	14.0
  (5, 3899)	2.0
  (5, 3900)	2015.0
  (6, 3406)	1.0
  :	:
  (38758, 3900)	2014.0
  (38759, 2299)	1.0
  (38759, 3898)	6.0
  (38759, 3899)	5.0
  (38759, 3900)	2014.0
  (38760, 3377)	1.0
  (38760, 3898)	8.0
  (38760, 3899)	10.0
  (38760, 3900)	2014.0
  (38761, 997)	1.0
  (38761, 3898)	23.0
  (38761, 3899)	2.0
  (38761, 3900)	2014.0
  (38762, 93)	1.0
  (38762, 3898)	16.0
  (38762, 3899)	4.0
  (38762, 3900)	2014.0
  (38763, 499)	1.0
  (38763, 3898)	3.0
  (38763, 3899)	12.0
  (38763, 3900)	2014.0
  (38764,

In [6]:
le = LabelEncoder()
y = le.fit_transform(y)

print(y)

[156 164 109 ...  17  64  24]


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train)
print(X_test)
print(y_train)
print(y_test)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 124048 stored elements and shape (31012, 3901)>
  Coords	Values
  (0, 598)	1.0
  (0, 3898)	8.0
  (0, 3899)	8.0
  (0, 3900)	2015.0
  (1, 3740)	1.0
  (1, 3898)	13.0
  (1, 3899)	9.0
  (1, 3900)	2015.0
  (2, 1364)	1.0
  (2, 3898)	8.0
  (2, 3899)	1.0
  (2, 3900)	2014.0
  (3, 1757)	1.0
  (3, 3898)	1.0
  (3, 3899)	4.0
  (3, 3900)	2014.0
  (4, 672)	1.0
  (4, 3898)	21.0
  (4, 3899)	8.0
  (4, 3900)	2015.0
  (5, 715)	1.0
  (5, 3898)	29.0
  (5, 3899)	8.0
  (5, 3900)	2015.0
  (6, 2550)	1.0
  :	:
  (31005, 3900)	2014.0
  (31006, 1329)	1.0
  (31006, 3898)	7.0
  (31006, 3899)	1.0
  (31006, 3900)	2014.0
  (31007, 3135)	1.0
  (31007, 3898)	10.0
  (31007, 3899)	8.0
  (31007, 3900)	2015.0
  (31008, 3412)	1.0
  (31008, 3898)	18.0
  (31008, 3899)	6.0
  (31008, 3900)	2015.0
  (31009, 751)	1.0
  (31009, 3898)	9.0
  (31009, 3899)	4.0
  (31009, 3900)	2014.0
  (31010, 3883)	1.0
  (31010, 3898)	28.0
  (31010, 3899)	1.0
  (31010, 3900)	2015.0
  (31011, 

In [8]:
sc = StandardScaler(with_mean=False)  # Set with_mean=False to handle sparse matrices
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

print(X_train)
print(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 124048 stored elements and shape (31012, 3901)>
  Coords	Values
  (0, 598)	47.075931310860945
  (0, 3898)	0.9074973750863624
  (0, 3899)	2.3404486959067383
  (0, 3900)	4036.241630567228
  (1, 3740)	66.56790472934951
  (1, 3898)	1.474683234515339
  (1, 3899)	2.6330047828950804
  (1, 3900)	4036.241630567228
  (2, 1364)	55.69739960321159
  (2, 3898)	0.9074973750863624
  (2, 3899)	0.2925560869883423
  (2, 3900)	4034.2385329838203
  (3, 1757)	88.05680058348115
  (3, 3898)	0.1134371718857953
  (3, 3899)	1.1702243479533692
  (3, 3900)	4034.2385329838203
  (4, 672)	50.846177048329864
  (4, 3898)	2.3821806096017015
  (4, 3899)	2.3404486959067383
  (4, 3900)	4036.241630567228
  (5, 715)	58.70926730995939
  (5, 3898)	3.2896779846880637
  (5, 3899)	2.3404486959067383
  (5, 3900)	4036.241630567228
  (6, 2550)	44.03692219258333
  :	:
  (31005, 3900)	4034.2385329838203
  (31006, 1329)	66.56790472934951
  (31006, 3898)	0.7940602032005671
  