## load library yang akan digunakan

In [None]:
import pandas as pd
import kagglehub
import os
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

  from .autonotebook import tqdm as notebook_tqdm


## load dataset

In [2]:
path = kagglehub.dataset_download("prepinstaprime/europe-bike-store-sales")

data = pd.read_csv(os.path.join(path, 'Sales.csv'))
df = pd.DataFrame(data.values)
df.columns = data.columns
df

Unnamed: 0,Date,Day,Month,Year,Customer_Age,Age_Group,Customer_Gender,Country,State,Product_Category,Sub_Category,Product,Order_Quantity,Unit_Cost,Unit_Price,Profit,Cost,Revenue
0,2013-11-26,26,November,2013,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
1,2015-11-26,26,November,2015,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
2,2014-03-23,23,March,2014,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45,120,1366,1035,2401
3,2016-03-23,23,March,2016,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45,120,1188,900,2088
4,2014-05-15,15,May,2014,47,Adults (35-64),F,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45,120,238,180,418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113031,2016-04-12,12,April,2016,41,Adults (35-64),M,United Kingdom,England,Clothing,Vests,"Classic Vest, S",3,24,64,112,72,184
113032,2014-04-02,2,April,2014,18,Youth (<25),M,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,655,528,1183
113033,2016-04-02,2,April,2016,18,Youth (<25),M,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,655,528,1183
113034,2014-03-04,4,March,2014,37,Adults (35-64),F,France,Seine (Paris),Clothing,Vests,"Classic Vest, L",24,24,64,684,576,1260


## formatting data

In [7]:
# Pastikan kolom numerik bertipe numerik (int/float)
numeric_cols = ['Customer_Age', 'Order_Quantity', 'Unit_Cost', 'Unit_Price', 'Profit', 'Cost', 'Revenue']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows dengan missing value pada kolom yang akan digunakan untuk clustering
df = df.dropna(subset=numeric_cols)

df


Unnamed: 0,Date,Day,Month,Year,Customer_Age,Age_Group,Customer_Gender,Country,State,Product_Category,Sub_Category,Product,Order_Quantity,Unit_Cost,Unit_Price,Profit,Cost,Revenue
0,2013-11-26,26,November,2013,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
1,2015-11-26,26,November,2015,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
2,2014-03-23,23,March,2014,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45,120,1366,1035,2401
3,2016-03-23,23,March,2016,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45,120,1188,900,2088
4,2014-05-15,15,May,2014,47,Adults (35-64),F,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45,120,238,180,418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113031,2016-04-12,12,April,2016,41,Adults (35-64),M,United Kingdom,England,Clothing,Vests,"Classic Vest, S",3,24,64,112,72,184
113032,2014-04-02,2,April,2014,18,Youth (<25),M,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,655,528,1183
113033,2016-04-02,2,April,2016,18,Youth (<25),M,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,655,528,1183
113034,2014-03-04,4,March,2014,37,Adults (35-64),F,France,Seine (Paris),Clothing,Vests,"Classic Vest, L",24,24,64,684,576,1260


## ubah index dataframe

In [18]:
df_train = df.reset_index(drop=True)
df_train


Unnamed: 0,Date,Day,Month,Year,Customer_Age,Age_Group,Customer_Gender,Country,State,Product_Category,Sub_Category,Product,Order_Quantity,Unit_Cost,Unit_Price,Profit,Cost,Revenue
0,2013-11-26,26,November,2013,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
1,2015-11-26,26,November,2015,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
2,2014-03-23,23,March,2014,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45,120,1366,1035,2401
3,2016-03-23,23,March,2016,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45,120,1188,900,2088
4,2014-05-15,15,May,2014,47,Adults (35-64),F,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45,120,238,180,418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113031,2016-04-12,12,April,2016,41,Adults (35-64),M,United Kingdom,England,Clothing,Vests,"Classic Vest, S",3,24,64,112,72,184
113032,2014-04-02,2,April,2014,18,Youth (<25),M,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,655,528,1183
113033,2016-04-02,2,April,2016,18,Youth (<25),M,Australia,Queensland,Clothing,Vests,"Classic Vest, M",22,24,64,655,528,1183
113034,2014-03-04,4,March,2014,37,Adults (35-64),F,France,Seine (Paris),Clothing,Vests,"Classic Vest, L",24,24,64,684,576,1260


## menentukan kolom yang digunakan sebagai class target forecasting

In [None]:
y = df['Revenue'].values
y

array([ 950,  950, 2401, ..., 1183, 1260, 1207], shape=(113036,))

## menentukan kolom apa saja yang digunakan sebagai fitur-fitur

In [20]:
columns = numeric_cols
features = df[columns].values
features

array([[  19,    8,   45, ...,  590,  360,  950],
       [  19,    8,   45, ...,  590,  360,  950],
       [  49,   23,   45, ..., 1366, 1035, 2401],
       ...,
       [  18,   22,   24, ...,  655,  528, 1183],
       [  37,   24,   24, ...,  684,  576, 1260],
       [  37,   23,   24, ...,  655,  552, 1207]], shape=(113036, 7))

## menyiapkan data fitur untuk proses training

In [21]:

scaler = StandardScaler().fit(features)
x = scaler.transform(features)
x

array([[-1.53505582, -0.40804595, -0.40429793, ...,  0.67186191,
        -0.12354319,  0.14943954],
       [-1.53505582, -0.40804595, -0.40429793, ...,  0.67186191,
        -0.12354319,  0.14943954],
       [ 1.18680119,  1.16069397, -0.40429793, ...,  2.38154444,
         0.63928745,  1.25784403],
       ...,
       [-1.62578439,  1.05611131, -0.44249134, ...,  0.81506985,
         0.06631689,  0.32742593],
       [ 0.09805838,  1.26527663, -0.44249134, ...,  0.87896263,
         0.12056262,  0.38624546],
       [ 0.09805838,  1.16069397, -0.44249134, ...,  0.81506985,
         0.09343975,  0.34575929]], shape=(113036, 7))

## melakukan pelatihan menggunakan SVR scikit learn untuk membuat model forecasting 

In [22]:

SVR_model = SVR(kernel='linear', C=100, gamma=.001).fit(x, y)
SVR_model

0,1,2
,kernel,'linear'
,degree,3
,gamma,0.001
,coef0,0.0
,tol,0.001
,C,100
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


# testing

## buat data uji dengan menggunakan dataframe yang sudah digunakan sebelumnya, ambil 3 data sercara acak untuk data uji.

In [23]:
d_test = df_train.sample(n=3)
d_test

Unnamed: 0,Date,Day,Month,Year,Customer_Age,Age_Group,Customer_Gender,Country,State,Product_Category,Sub_Category,Product,Order_Quantity,Unit_Cost,Unit_Price,Profit,Cost,Revenue
78938,2014-04-17,17,April,2014,25,Young Adults (25-34),F,Germany,Nordrhein-Westfalen,Accessories,Tires and Tubes,Touring Tire Tube,22,2,5,52,44,96
27469,2015-09-08,8,September,2015,33,Young Adults (25-34),M,United States,California,Clothing,Gloves,"Half-Finger Gloves, M",2,9,24,29,18,47
80119,2014-01-11,11,January,2014,36,Adults (35-64),F,United States,California,Accessories,Tires and Tubes,Patch Kit/8 Patches,16,1,2,15,16,31


## menentukan kolom yang digunakan sebagai class target

In [24]:
y_true = d_test['Revenue'].values
y_true

array([96, 47, 31])

## menentukan kolom yang digunakan untuk fitur pada data uji

In [25]:
features_test = d_test[list(columns)].values
features_test

array([[25, 22,  2,  5, 52, 44, 96],
       [33,  2,  9, 24, 29, 18, 47],
       [36, 16,  1,  2, 15, 16, 31]])

## menyiapkan data fitur untuk proses testing

In [26]:

scaler_test = StandardScaler().fit(features_test)
testdata = scaler_test.transform(features_test)
testdata

array([[-1.3641205 ,  1.03422447, -0.56195149, -0.54750873,  1.3111818 ,
         1.41131261,  1.37419232],
       [ 0.35897908, -1.35244738,  1.40487872,  1.40299112, -0.19667727,
        -0.62725005, -0.39779251],
       [ 1.00514142,  0.31822291, -0.84292723, -0.85548239, -1.11450453,
        -0.78406256, -0.97639981]])

## melakukan prediksi menggunakan model klasifikasi atau classifier

In [27]:
predict = SVR_model.predict(testdata)
predict

array([2581.86386321,  155.2657279 , -474.02729634])

## Man Absolute Deviation (MAD)

In [28]:
mean_absolute_error(y_true, predict)

1033.052295818189