In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

In [16]:
data = pd.read_csv('train.csv')

data['num_sold']=data['num_sold'].fillna(data.groupby(['country','store','product'])['num_sold'].transform('mean'))
# Interpolacja w obrębie grupy 'country' i 'store' za pomocą transformacji
data['num_sold'] = data.groupby(['country', 'store'])['num_sold'].transform(
    lambda group: group.interpolate(method='linear', limit_direction='both', axis=0)
)

data["date"] = pd.to_datetime(data["date"])
data["year"] = data["date"].dt.year
data["month"] = data["date"].dt.month
data["day_of_week"] = data["date"].dt.dayofweek
data["is_weekend"] = data["day_of_week"].isin([5, 6]).astype(int)
data["quarter"] = data["date"].dt.quarter

# 3. Uzupełnienie brakujących wartości
# Wypełniamy medianą grupując po kraju, sklepie i produkcie
data["num_sold"] = data.groupby(["country", "store", "product"])["num_sold"].transform(lambda x: x.fillna(x.median()))

# 4. Dodanie lag_features (opóźnione wartości num_sold)
data = data.sort_values(by=["country", "store", "product", "date"])
# data["lag_1"] = data.groupby(["country", "store", "product"])["num_sold"].shift(1)
# data["lag_7"] = data.groupby(["country", "store", "product"])["num_sold"].shift(7)
# data["lag_30"] = data.groupby(["country", "store", "product"])["num_sold"].shift(30)
data.fillna(0, inplace=True)  # Wypełniamy brakujące wartości zerami dla lagów

# 5. Kodowanie zmiennych kategorycznych
le_country = LabelEncoder()
data["country_encoded"] = le_country.fit_transform(data["country"])

le_store = LabelEncoder()
data["store_encoded"] = le_store.fit_transform(data["store"])

le_product = LabelEncoder()
data["product_encoded"] = le_product.fit_transform(data["product"])

# 6. Normalizacja cech
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(data[["year", "month", "day_of_week", "is_weekend", "quarter"]])
scaled_features_df = pd.DataFrame(scaled_features, columns=["year_scaled", "month_scaled", "day_of_week_scaled", "is_weekend_scaled", "quarter_scaled"])
data = pd.concat([data.reset_index(drop=True), scaled_features_df], axis=1)

# 7. Przygotowanie zbioru treningowego i testowego
train_data = data #[data["year"] < 2017]
# test_data = data[data["year"] == 2016]

X_train = train_data[["year_scaled", "month_scaled", "day_of_week_scaled", "is_weekend_scaled", "quarter_scaled", "country_encoded", "store_encoded", "product_encoded"]]
y_train = train_data["num_sold"]

# X_test = test_data[["year_scaled", "month_scaled", "day_of_week_scaled", "is_weekend_scaled", "quarter_scaled", "lag_1_scaled", "lag_7_scaled", "lag_30_scaled", "country_encoded", "store_encoded", "product_encoded"]]
# y_test = test_data["num_sold"]

# 8. Trening modelu XGBoost
model = XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
model.fit(X_train, y_train)

# 9. Predykcja i ocena modelu
# predictions = model.predict(X_test)
# mape = mean_absolute_percentage_error(y_test, predictions)
# print(f"MAPE: {mape:.3f}")




In [18]:
# Załaduj nowe dane testowe
new_data = pd.read_csv('test.csv')  # Twoje dane testowe z lat 2017-2019
new_data["date"] = pd.to_datetime(new_data["date"])

# Wyciągnij cechy daty
new_data["year"] = new_data["date"].dt.year
new_data["month"] = new_data["date"].dt.month
new_data["day_of_week"] = new_data["date"].dt.dayofweek
new_data["is_weekend"] = new_data["day_of_week"].isin([5, 6]).astype(int)
new_data["quarter"] = new_data["date"].dt.quarter
new_data["num_sold"] = None
# Połącz dane historyczne i nowe dane testowe
combined_data = pd.concat([data, new_data], ignore_index=True)

# Sortuj dane przed obliczaniem lagów
combined_data = combined_data.sort_values(by=["country", "store", "product", "date"])

# Dodaj lag features
# combined_data["lag_1"] = combined_data.groupby(["country", "store", "product"])["num_sold"].shift(1)
# combined_data["lag_7"] = combined_data.groupby(["country", "store", "product"])["num_sold"].shift(7)
# combined_data["lag_30"] = combined_data.groupby(["country", "store", "product"])["num_sold"].shift(30)

# Wypełnij brakujące lag wartości zerami (tylko dla nowych danych testowych)
# combined_data.loc[combined_data["num_sold"].isna(), ["lag_1", "lag_7", "lag_30"]] = 0

# Zakoduj zmienne kategoryczne
combined_data["country_encoded"] = le_country.transform(combined_data["country"])
combined_data["store_encoded"] = le_store.transform(combined_data["store"])
combined_data["product_encoded"] = le_product.transform(combined_data["product"])

# Skalowanie cech
scaled_features = scaler.transform(combined_data[["year", "month", "day_of_week", "is_weekend", "quarter"]])
scaled_features_df = pd.DataFrame(scaled_features, columns=["year_scaled", "month_scaled", "day_of_week_scaled", "is_weekend_scaled", "quarter_scaled"])

# Dodaj znormalizowane cechy do danych
combined_data = pd.concat([combined_data.reset_index(drop=True), scaled_features_df], axis=1)

# Wybierz tylko nowe dane testowe
new_test_data = combined_data[combined_data["num_sold"].isna()]
new_test_data = new_test_data.drop_duplicates(subset=["id"])
# Przygotuj cechy do predykcji
X_new_test = new_test_data[["year_scaled", "month_scaled", "day_of_week_scaled", "is_weekend_scaled", "quarter_scaled", "country_encoded", "store_encoded", "product_encoded"]]
X_new_test = X_new_test.loc[:, ~X_new_test.columns.duplicated()]

# Dokonaj predykcji
new_test_data["num_sold"] = model.predict(X_new_test)

# Wynik
predicted_values = new_test_data[["id", "date", "country", "store", "product", "num_sold"]]
predicted_values.to_csv("predicted_sales.csv", index=False)


In [20]:
df = pd.read_csv('predicted_sales.csv')
df = df[['id','num_sold']]
df.to_csv("first_test.csv", index=False)

In [22]:
df

Unnamed: 0,id,num_sold
0,230130,542.31537
1,230220,542.31537
2,230310,542.31537
3,230400,542.31537
4,230490,542.31537
...,...,...
98545,328314,1024.84220
98546,328404,1024.84220
98547,328494,1024.84220
98548,328584,1024.84220
