In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor


In [34]:
# Read "Data Bakery Analyze.csv" into a DataFrame and show it as the cell output.
DATA_PATH = "Data Bakery Analyze.csv"
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,ticket_number,date,time,article,Quantity,unit_price_euro,day,amount,Index
0,150045,2021-01-02,09:32,TRADITIONAL BAGUETTE,3,1.20,Saturday,3.60,1.0
1,150046,2021-01-02,09:37,TRADITIONAL BAGUETTE,6,1.20,Saturday,7.20,2.0
2,150049,2021-01-02,09:40,CROISSANT,2,1.10,Saturday,2.20,3.0
3,150049,2021-01-02,09:40,TRADITIONAL BAGUETTE,1,1.20,Saturday,1.20,4.0
4,150051,2021-01-02,09:46,PAIN,1,1.15,Saturday,1.15,5.0
...,...,...,...,...,...,...,...,...,...
3090,150142,2021-01-02,15:58,SPECIAL BREAD,8,2.40,Saturday,19.20,
3091,150143,2021-01-02,10:14,SPECIAL BREAD,10,2.00,Saturday,20.00,
3092,150144,2021-01-02,18:42,PAIN AU CHOCOLAT,9,1.50,Saturday,13.50,
3093,150145,2021-01-02,15:54,CROISSANT,4,1.20,Saturday,4.80,


MODELLING DATA

In [35]:
# One LabelEncoder per categorical column. They are kept as separate, named
# objects so the fitted mappings can be reused to encode new inputs later on.
le_article = LabelEncoder()
le_day = LabelEncoder()

# Fit each encoder and store the integer codes in new columns; the original
# text columns ('article', 'day') are left as they are.
df['article_encoded'] = le_article.fit_transform(df['article'])
df['day_encoded'] = le_day.fit_transform(df['day'])

# Convert 'time' (HH:MM strings) into minutes since midnight.
# The column is parsed once and the parsed Series is reused for both the hour
# and the minute component instead of calling pd.to_datetime twice.
parsed_time = pd.to_datetime(df['time'], format='%H:%M')
df['time_minutes'] = parsed_time.dt.hour * 60 + parsed_time.dt.minute


MODELLING USING RANDOM FOREST

In [36]:
# Feature matrix and regression target.
feature_columns = ['time_minutes', 'article_encoded', 'day_encoded', 'unit_price_euro']
X = df[feature_columns]
y = df['Quantity']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestRegressor(random_state=42)

# Hyperparameter grid explored by the search.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated grid search over the grid above (n_jobs=-1: use all processors).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Take the best estimator found by the search and score it on X_test / y_test.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared: {r2:.2f}')


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Mean Squared Error: 0.91
R-squared: 0.25


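THE RESULT USING THE RANDOM FOREST MODEL
( = THE MSE IS 0.91, WHICH LOOKS GOOD IN ABSOLUTE TERMS, BUT THE R2 VALUE IS VERY LOW AT 0.25, I.E. THE MODEL EXPLAINS ONLY ABOUT 25% OF THE VARIANCE IN QUANTITY ON THE TEST SET)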

TRYING ANOTHER MODEL (XGBOOST)

In [37]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Hyperparameter grid explored by the search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7, 10],
    min_child_weight=[1, 3, 5],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
)

# XGBoost regressor with a squared-error objective and a fixed seed.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# 5-fold cross-validated grid search; candidates are scored by negative MSE.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Take the best estimator found by the search and score it on X_test / y_test.
# Note: this rebinds best_model, y_pred, mse and r2.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared: {r2:.2f}')


Fitting 5 folds for each of 324 candidates, totalling 1620 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'learning_rate': 0.01, 'max_depth': 10, 'min_child_weight': 5, 'n_estimators': 200, 'subsample': 0.8}
Mean Squared Error: 0.93
R-squared: 0.22


In [38]:
# Display the frame again; it now also carries the article_encoded,
# day_encoded and time_minutes columns added earlier in the notebook.
df

Unnamed: 0,ticket_number,date,time,article,Quantity,unit_price_euro,day,amount,Index,article_encoded,day_encoded,time_minutes
0,150045,2021-01-02,09:32,TRADITIONAL BAGUETTE,3,1.20,Saturday,3.60,1.0,69,2,572
1,150046,2021-01-02,09:37,TRADITIONAL BAGUETTE,6,1.20,Saturday,7.20,2.0,69,2,577
2,150049,2021-01-02,09:40,CROISSANT,2,1.10,Saturday,2.20,3.0,18,2,580
3,150049,2021-01-02,09:40,TRADITIONAL BAGUETTE,1,1.20,Saturday,1.20,4.0,69,2,580
4,150051,2021-01-02,09:46,PAIN,1,1.15,Saturday,1.15,5.0,48,2,586
...,...,...,...,...,...,...,...,...,...,...,...,...
3090,150142,2021-01-02,15:58,SPECIAL BREAD,8,2.40,Saturday,19.20,,63,2,958
3091,150143,2021-01-02,10:14,SPECIAL BREAD,10,2.00,Saturday,20.00,,63,2,614
3092,150144,2021-01-02,18:42,PAIN AU CHOCOLAT,9,1.50,Saturday,13.50,,49,2,1122
3093,150145,2021-01-02,15:54,CROISSANT,4,1.20,Saturday,4.80,,18,2,954


In [40]:
# Three hand-written example rows to score. Article and day names are turned
# into integer codes with the encoders fitted on the training data.
example_articles = ['TRADITIONAL BAGUETTE', 'SPECIAL BREAD', 'KOUIGN AMANN']
example_days = ['Saturday', 'Monday', 'Saturday']

# Columns are listed in the same order as the training feature matrix.
datapredict = pd.DataFrame({
    'time_minutes': [621, 673, 782],                           # minutes since midnight
    'article_encoded': le_article.transform(example_articles),  # article as integer code
    'day_encoded': le_day.transform(example_days),              # day as integer code
    'unit_price_euro': [1.20, 2.40, 2.1],                       # unit price of the article
})

# best_model is whatever estimator was bound to that name most recently in the
# kernel (the XGBoost model if the cells were run top to bottom).
predicted_quantity = best_model.predict(datapredict)
print(f'Prediksi Kuantitas Terjual: {predicted_quantity}')

Prediksi Kuantitas Terjual: [1.7088407 1.4665405 1.6407534]
