In [2]:
import pandas as pd
import numpy as np

In [49]:
df = pd.read_csv("df_train.csv")  # Sesuaikan dengan nama file
df['date'] = pd.to_datetime(df['date'])  # Pastikan kolom date dalam format datetime
df = df.sort_values(by='date')  # Urutkan berdasarkan tanggal

In [50]:
df_agg = df.groupby(['date']).agg({
    'total_orders': 'sum',  # Total order di semua warehouse
    'sales': 'sum',  # Total sales (TARGET)
    'sell_price_main': 'mean',  # Harga rata-rata
    'total_discount': 'mean',  # Rata-rata diskon
    'shops_closed': 'sum',  # Berapa banyak toko yang tutup pada hari itu
    'total_holidays': 'sum'  # Total hari libur nasional
}).reset_index()

In [51]:
df_agg['date'] = pd.to_datetime(df_agg['date'])
df_agg['dayofweek'] = df_agg['date'].dt.dayofweek  # Hari dalam seminggu (0 = Senin)
df_agg['month'] = df_agg['date'].dt.month  # Bulan dalam tahun
df_agg['weekofyear'] = df_agg['date'].dt.isocalendar().week  # Minggu dalam tahun
df_agg['year'] = df_agg['date'].dt.year

In [52]:
from sklearn.preprocessing import MinMaxScaler

feature_cols = ['total_orders', 'sell_price_main', 'total_discount', 'shops_closed', 'total_holidays', 'dayofweek', 'month', 'weekofyear', 'year']
target_col = ['sales']

scaler = MinMaxScaler()


In [53]:
import numpy as np

def create_sequences(data, lookback=60, horizon=14):
    X, y = [], []
    for i in range(len(data) - lookback - horizon + 1):
        X.append(data[i:(i + lookback), :-1])  # Semua fitur kecuali sales
        y.append(data[i + lookback:i + lookback + horizon, -1])  # Sales 14 hari ke depan
    return np.array(X), np.array(y)

data_values = df_agg[feature_cols + target_col].values  
X, y = create_sequences(data_values, lookback=60, horizon=14)

X = scaler.fit_transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)
y = scaler.fit_transform(y)
print(f"Shape X: {X.shape}")  # (samples, 60, fitur)
print(f"Shape y: {y.shape}")  # (samples, 14) -> memprediksi 14 hari ke depan

Shape X: (1329, 60, 9)
Shape y: (1329, 14)


In [54]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

print(f"Training Data Shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Validation Data Shape: X_val={X_val.shape}, y_val={y_val.shape}")

Training Data Shape: X_train=(1063, 60, 9), y_train=(1063, 14)
Validation Data Shape: X_val=(266, 60, 9), y_val=(266, 14)


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(y.shape[1]))

model.compile(optimizer='adam', loss='mse')

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model
loss = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss}")

Epoch 1/50


  super().__init__(**kwargs)


[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - loss: 0.0994 - val_loss: 0.0251
Epoch 2/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 0.0125 - val_loss: 0.0173
Epoch 3/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - loss: 0.0082 - val_loss: 0.0165
Epoch 4/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.0078 - val_loss: 0.0168
Epoch 5/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - loss: 0.0070 - val_loss: 0.0158
Epoch 6/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - loss: 0.0071 - val_loss: 0.0148
Epoch 7/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - loss: 0.0069 - val_loss: 0.0150
Epoch 8/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - loss: 0.0065 - val_loss: 0.0161
Epoch 9/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [55]:
from sklearn.ensemble import RandomForestRegressor

# Define the Random Forest model
model2 = RandomForestRegressor(n_estimators=150, random_state=42)

# Train the model
model2.fit(X_train.reshape(X_train.shape[0], -1), y_train)

# Evaluate the model
y_pred = model2.predict(X_val.reshape(X_val.shape[0], -1))
mse = np.mean((y_pred - y_val) ** 2)
print(f"Validation MSE: {mse}")

Validation MSE: 0.01555309318293217


NameError: name 'X_train' is not defined

In [121]:
val_weights = pd.read_csv("test_weights.csv")
val_weights = val_weights['weight'].values

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Ensure X_train and y_train have the same number of samples
X_train_flat = X_train.reshape(X_train.shape[0], -1)
y_train_flat = y_train[:, 0]  # Use the first column of y_train to match the number of samples

# Define the SVR model
model3 = SVR(kernel='linear', C=100, gamma=0.01, epsilon=.1)
# Train the model
model3.fit(X_train_flat, y_train_flat)

# Evaluate the model
y_pred_svr = model3.predict(X_val.reshape(X_val.shape[0], -1))
mse_svr = mean_squared_error(y_val[:, 0], y_pred_svr)

print(f"Validation MSE for SVR: {mse_svr:.4f}")


In [31]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

# Flatten the training data
X_train_flat = X_train.reshape(X_train.shape[0], -1)
y_train_flat = y_train[:, 0]

# Flatten the validation data
X_val_flat = X_val.reshape(X_val.shape[0], -1)

# Define and train the XGBoost model
model4 = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.1)
model4.fit(X_train_flat, y_train_flat)

# Predict using the validation data
y_pred_xgb = model4.predict(X_val_flat)

# Evaluate with MAE
mae_xgb = mean_absolute_error(y_val[:, 0], y_pred_xgb)
print(f"MAE XGBoost: {mae_xgb:.4f}")

MAE XGBoost: 0.0684


In [56]:
df_test = pd.read_csv("df_test.csv")  # Sesuaikan dengan nama file

In [33]:
df_test.isnull().sum()

unique_id          0
date               0
warehouse          0
total_orders       0
sell_price_main    0
total_discount     0
shops_closed       0
total_holidays     0
dtype: int64

In [57]:
df_test['date'] = pd.to_datetime(df_test['date'])
df_test['dayofweek'] = df_test['date'].dt.dayofweek  # Hari dalam seminggu (0 = Senin)
df_test['month'] = df_test['date'].dt.month  # Bulan dalam tahun
df_test['weekofyear'] = df_test['date'].dt.isocalendar().week  # Minggu dalam tahun
df_test['year'] = df_test['date'].dt.year  # Tahun


In [35]:
df_test.isnull().sum()

unique_id          0
date               0
warehouse          0
total_orders       0
sell_price_main    0
total_discount     0
shops_closed       0
total_holidays     0
dayofweek          0
month              0
weekofyear         0
year               0
dtype: int64

In [58]:
feature_cols1 = ["total_orders", "sell_price_main", "total_discount",
                "shops_closed", "total_holidays",
                "dayofweek", "month", "weekofyear", "year"]


In [59]:
X_test_values = df_test[feature_cols1].values


In [38]:
X_test_values.shape

(47021, 9)

In [15]:
X_test_values = scaler.transform(X_test_values)



ValueError: X has 9 features, but MinMaxScaler is expecting 10 features as input.

In [60]:
def create_test_sequences(data, lookback=60):
	X = []
	for i in range(len(data) - lookback + 1):
		X.append(data[i:(i + lookback), :])
	return np.array(X)

lookback = 60

# Padding dengan nilai rata-rata atau nol di awal
pad_value = np.mean(X_test_values, axis=0)  # Bisa juga pakai np.zeros(X_test_values.shape[1])
X_test_padded = np.vstack([np.tile(pad_value, (lookback - 1, 1)), X_test_values])

# Membuat sequence tanpa kehilangan data
X_test_seq = create_test_sequences(X_test_padded, lookback)
print(X_test_seq.shape)  # Harus tetap (47021, lookback, fitur)


(47021, 60, 9)


In [62]:
X_test_seq = X_test_seq.astype(np.float32)

# Jika X_test_seq memiliki 3 dimensi (samples, timesteps, features), ubah ke 2D
if len(X_test_seq.shape) == 3:
    X_test_seq_reshaped = X_test_seq.reshape(X_test_seq.shape[0] * X_test_seq.shape[1], X_test_seq.shape[2])
else:
    X_test_seq_reshaped = X_test_seq  # Jika sudah 2D, langsung gunakan

X_test_seq_reshaped = scaler.transform(X_test_seq_reshaped)
X_test_seq_reshaped = X_test_seq_reshaped.reshape(X_test_seq.shape[0], X_test_seq.shape[1], X_test_seq.shape[2])

# Prediksi menggunakan SVR
predicted_sales = model4.predict(X_test_seq_reshaped.reshape(X_test_seq_reshaped.shape[0], -1))

# Periksa kembali bentuk hasil prediksi
print(f"Shape of predicted_sales: {predicted_sales.shape}")

ValueError: X has 9 features, but MinMaxScaler is expecting 14 features as input.

In [42]:
df_test['id'] = df_test['unique_id'].astype(str) + "_" + df_test['date'].astype(str)


In [43]:
# Cek apakah ada missing values di df_test
print(df_test.isnull().sum())

# Cek apakah ada missing values di predicted_sales
print(np.isnan(predicted_sales).sum())

# Cek apakah ada baris duplikat di df_test
print("Duplicates in df_test:", df_test.duplicated().sum())

unique_id          0
date               0
warehouse          0
total_orders       0
sell_price_main    0
total_discount     0
shops_closed       0
total_holidays     0
dayofweek          0
month              0
weekofyear         0
year               0
id                 0
dtype: int64
0
Duplicates in df_test: 0


In [44]:
missing_rows = df_test.iloc[len(predicted_sales):]
print(missing_rows)


Empty DataFrame
Columns: [unique_id, date, warehouse, total_orders, sell_price_main, total_discount, shops_closed, total_holidays, dayofweek, month, weekofyear, year, id]
Index: []


In [45]:
print(f"Shape of df_test: {df_test.shape}")  
print(f"Shape of predicted_sales: {predicted_sales.shape}")


Shape of df_test: (47021, 13)
Shape of predicted_sales: (47021,)


In [46]:
df_test.shape

(47021, 13)

In [47]:
# Ensure the predicted_sales array is reshaped to match the length of df_test
predicted_sales_reshaped = predicted_sales[:len(df_test)].flatten()[:len(df_test)]

df_submission = pd.DataFrame({
    "id": df_test['id'],  
    "sales_hat": predicted_sales_reshaped  # Ensure 1D array
})


In [24]:
df_submission.to_csv("submission.csv", index=False)