In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
# Read the semicolon-delimited training and test CSV files into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path, sep=';') for csv_path in ('train.csv', 'test.csv')
)

In [None]:
# Report how many training rows there are and how many lack a 'num_sold' target
row_count = len(train_df)
missing_target_count = train_df['num_sold'].isnull().sum()
print(f"Training dataset size: {row_count}")
print(f"Number of null values in 'num_sold': {missing_target_count}")

Training dataset size: 230130
Number of null values in 'num_sold': 8871


In [None]:
# Impute missing 'num_sold' values with the mean 'num_sold' of the same
# (store, product) group. transform('mean') broadcasts each group's mean back
# onto its rows, so no helper column or merge is needed.
# NOTE(review): this runs before the train/validation split below, so some
# validation targets are imputed values rather than observed sales.
group_mean_sold = train_df.groupby(['store', 'product'])['num_sold'].transform('mean')
train_df = train_df.assign(num_sold=train_df['num_sold'].fillna(group_mean_sold))

In [None]:
# Replace the 'date' column (day.month.year strings) with numeric
# 'year', 'month' and 'day' columns, modifying both frames in place
for df in (train_df, test_df):
    parsed_dates = pd.to_datetime(df['date'], format='%d.%m.%Y')
    for part in ('year', 'month', 'day'):
        df[part] = getattr(parsed_dates.dt, part)
    # The raw date is no longer needed once its components are extracted
    del df['date']

In [None]:
# One-hot encode the categorical columns of both frames.
# Each frame is encoded independently, so the resulting column sets can differ.
categorical_columns = ['country', 'store', 'product']
train_df, test_df = (
    pd.get_dummies(frame, columns=categorical_columns)
    for frame in (train_df, test_df)
)

In [None]:
# Give the test frame exactly the training frame's columns, in the same order:
# columns missing from test are created filled with 0, extra test columns are
# dropped, and 'num_sold' is set to a 0 placeholder.
test_df = (
    test_df
    .reindex(columns=train_df.columns, fill_value=0)
    .assign(num_sold=0)
)

In [None]:
# Separate the target column from the feature columns
target = 'num_sold'
y = train_df[target]
X = train_df.drop(target, axis=1)
X_test = test_df.drop(target, axis=1)

In [None]:
# Hold out 20% of the training rows as a validation set (seeded for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate values for each RandomForestRegressor hyperparameter explored by the search
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

In [None]:
# Tune the random forest with a randomized search over rf_params,
# scoring each sampled candidate with 5-fold cross-validation on the training split
rf = RandomForestRegressor(random_state=42)
rf_cv = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_params,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
rf_cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




In [None]:
# Report the winning hyperparameters and their search score.
# best_score_ is the mean cross-validated score of the best candidate. The search
# was created without a `scoring` argument, so the estimator's default scorer is
# used (R^2 for regressors). It is NOT the hold-out validation MAPE computed later,
# so the label names the metric explicitly to avoid confusing the two.
print("Best parameters:", rf_cv.best_params_)
print("Best score (mean CV R^2):", rf_cv.best_score_)

Best parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 20, 'bootstrap': True}
Best score (validation): 0.9761107653256879


In [None]:
# Retrieve the best model found by the search. No additional training happens
# here: the search was created without a `refit` argument, and with the default
# refit=True RandomizedSearchCV has already refit this estimator on the data
# passed to fit().
best_rf = rf_cv.best_estimator_

In [None]:
# Score the tuned model on the held-out validation split using MAPE
y_val_pred = best_rf.predict(X_val)
val_mape = mean_absolute_percentage_error(y_true=y_val, y_pred=y_val_pred)
print(f"Validation MAPE: {val_mape:.2%}")

Validation MAPE: 15.29%


In [None]:
# Predict on the test set and convert the predictions to whole units.
# A bare astype(int) truncates toward zero (e.g. 9.9 -> 9), which shifts every
# non-integer prediction downward; np.rint rounds to the nearest integer first
# (exact halves go to the nearest even value).
y_test_pred = best_rf.predict(X_test)
y_test_pred_int = np.rint(y_test_pred).astype(int)

In [None]:
# Overwrite the placeholder 'num_sold' column of the test frame with the
# integer predictions
test_df['num_sold'] = y_test_pred_int

In [None]:
# Write the test frame (features plus predicted 'num_sold') to CSV without the index
output_file = "test_with_predictions_rf.csv"
test_df.to_csv(path_or_buf=output_file, index=False)
print(f"Predictions saved to '{output_file}'")

Predictions saved to 'test_with_predictions_rf.csv'
