In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv('df_train.csv')

In [3]:
df_train.shape

(4007419, 8)

In [4]:
df_test = pd.read_csv('df_test.csv')

In [5]:
# Compute the correlation matrix for 'sales' and other columns excluding 'date'
corr_sales = df_train.drop(columns=['date']).corr()['sales'].drop('sales')

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(8, 6))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap
sns.heatmap(corr_sales.to_frame(), annot=True, cmap=cmap, center=0, linewidths=.5, cbar_kws={"shrink": .5})

plt.title('Correlation of Sales with Other Features in df_train')
plt.show()


KeyboardInterrupt: 

In [6]:
df = pd.read_csv("df_train.csv")  # Sesuaikan dengan nama file
df['date'] = pd.to_datetime(df['date'])  # Pastikan kolom date dalam format datetime
df = df.sort_values(by='date')  # Urutkan berdasarkan tanggal

In [7]:
df_agg = df.groupby(['date']).agg({
    'total_orders': 'sum',  # Total order di semua warehouse
    'sales': 'sum',  # Total sales (TARGET)
    'sell_price_main': 'mean',  # Harga rata-rata
    'total_discount': 'mean',  # Rata-rata diskon
    'shops_closed': 'sum',  # Berapa banyak toko yang tutup pada hari itu
    'total_holidays': 'sum'  # Total hari libur nasional
}).reset_index()

In [8]:
df_agg['date'] = pd.to_datetime(df_agg['date'])
df_agg['dayofweek'] = df_agg['date'].dt.dayofweek  # Hari dalam seminggu (0 = Senin)
df_agg['month'] = df_agg['date'].dt.month  # Bulan dalam tahun
df_agg['weekofyear'] = df_agg['date'].dt.isocalendar().week  # Minggu dalam tahun
df_agg['year'] = df_agg['date'].dt.year

In [10]:
from sklearn.preprocessing import MinMaxScaler

feature_cols = ['total_orders', 'sell_price_main', 'total_discount', 'shops_closed', 'total_holidays', 'dayofweek', 'month', 'weekofyear', 'year']
target_col = ['sales']

scaler = MinMaxScaler()

In [11]:
def create_sequences(data, lookback=30, horizon=14):
    X, y = [], []
    for i in range(len(data) - lookback - horizon + 1):
        X.append(data[i:(i + lookback), :-1])  # Semua fitur kecuali sales
        y.append(data[i + lookback:i + lookback + horizon, -1])  # Sales 14 hari ke depan
    return np.array(X), np.array(y)
df_agg[feature_cols + target_col] = scaler.fit_transform(df_agg[feature_cols + target_col])
data_values = df_agg[feature_cols + target_col].values  
X, y = create_sequences(data_values, lookback=60, horizon=14)

print(f"Shape X: {X.shape}")  # (samples, 60, fitur)
print(f"Shape y: {y.shape}")  # (samples, 14) -> memprediksi 14 hari ke depan

Shape X: (1329, 60, 9)
Shape y: (1329, 14)


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

print(f"Training Data Shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Validation Data Shape: X_val={X_val.shape}, y_val={y_val.shape}")

Training Data Shape: X_train=(1063, 60, 9), y_train=(1063, 14)
Validation Data Shape: X_val=(266, 60, 9), y_val=(266, 14)


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Inisialisasi model regresi linear
model = LinearRegression()

# Melatih model pada data training
model.fit(X_train.reshape(X_train.shape[0], -1), y_train)

# Memprediksi pada data validation
y_pred = model.predict(X_val.reshape(X_val.shape[0], -1))

# Menghitung Mean Squared Error (MSE)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.008654684471258068


In [14]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Define the Decision Tree model
model_dt = DecisionTreeRegressor(random_state=42)

# Train the model
model_dt.fit(X_train.reshape(X_train.shape[0], -1), y_train)

# Evaluate the model
y_pred_dt = model_dt.predict(X_val.reshape(X_val.shape[0], -1))
mse_dt = mean_squared_error(y_val, y_pred_dt)
print(f"Validation MSE for Decision Tree: {mse_dt:.4f}")

Validation MSE for Decision Tree: 0.0209


In [15]:
df_test['date'] = pd.to_datetime(df_test['date'])
df_test['dayofweek'] = df_test['date'].dt.dayofweek  # Hari dalam seminggu (0 = Senin)
df_test['month'] = df_test['date'].dt.month  # Bulan dalam tahun
df_test['weekofyear'] = df_test['date'].dt.isocalendar().week  # Minggu dalam tahun
df_test['year'] = df_test['date'].dt.year 

In [16]:
X_test_values = df_test[feature_cols].values

In [20]:
def create_test_sequences(data, lookback=30):
	X = []
	for i in range(len(data) - lookback + 1):
		X.append(data[i:(i + lookback), :])
	return np.array(X)

lookback = 60

# Padding dengan nilai rata-rata atau nol di awal
pad_value = np.mean(X_test_values, axis=0)  # Bisa juga pakai np.zeros(X_test_values.shape[1])
X_test_padded = np.vstack([np.tile(pad_value, (lookback - 1, 1)), X_test_values])

# Membuat sequence tanpa kehilangan data
X_test_seq = create_test_sequences(X_test_padded, lookback)
print(X_test_seq.shape)  # Harus tetap (47021, lookback, fitur)


(47021, 60, 9)


In [21]:
X_test_seq = X_test_seq.astype(np.float32)

# Jika X_test_seq memiliki 3 dimensi (samples, timesteps, features), ubah ke 2D
if len(X_test_seq.shape) == 3:
    X_test_seq_reshaped = X_test_seq.reshape(X_test_seq.shape[0] * X_test_seq.shape[1], X_test_seq.shape[2])
else:
    X_test_seq_reshaped = X_test_seq  # Jika sudah 2D, langsung gunakan

X_test_seq_reshaped = X_test_seq_reshaped.reshape(X_test_seq.shape[0], X_test_seq.shape[1], X_test_seq.shape[2])

# Prediksi menggunakan SVR
predicted_sales = model_dt.predict(X_test_seq_reshaped.reshape(X_test_seq_reshaped.shape[0], -1))

# Periksa kembali bentuk hasil prediksi
print(f"Shape of predicted_sales: {predicted_sales.shape}")

Shape of predicted_sales: (47021, 14)


In [22]:
df_test['id'] = df_test['unique_id'].astype(str) + "_" + df_test['date'].astype(str)

In [23]:
# Ensure the predicted_sales array is reshaped to match the length of df_test
predicted_sales_reshaped = predicted_sales[:len(df_test)].flatten()[:len(df_test)]

df_submission = pd.DataFrame({
    "id": df_test['id'],  
    "sales_hat": predicted_sales_reshaped  # Ensure 1D array
})

In [24]:
df_submission.to_csv("cek3.csv", index=False)