In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/playground-series-s5e1/sample_submission.csv
/kaggle/input/playground-series-s5e1/train.csv
/kaggle/input/playground-series-s5e1/test.csv


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error

# Load the data
train_data = pd.read_csv('/kaggle/input/playground-series-s5e1/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s5e1/test.csv')

# Handle missing values in the target column 'num_sold'
train_data = train_data.dropna(subset=['num_sold'])

# Convert 'date' column to datetime format
train_data['date'] = pd.to_datetime(train_data['date'])
test_data['date'] = pd.to_datetime(test_data['date'])

# Extract date features
for df in [train_data, test_data]:
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['weekday'] = df['date'].dt.weekday

# Drop the original 'date' column
train_data = train_data.drop(columns=['date'])
test_data = test_data.drop(columns=['date'])

# Separate features and target variable
X = train_data.drop(columns=['num_sold'])
y = train_data['num_sold']

# Align train and test datasets
combined_data = pd.concat([X, test_data], axis=0)

# One-hot encode categorical features
categorical_columns = ['country', 'store', 'product']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_features = ohe.fit_transform(combined_data[categorical_columns])

# Create a DataFrame for the encoded features
encoded_feature_names = ohe.get_feature_names_out(categorical_columns)
ohe_df = pd.DataFrame(ohe_features, columns=encoded_feature_names, index=combined_data.index)

# Concatenate encoded features with the original dataset
combined_data = pd.concat([combined_data.drop(columns=categorical_columns), ohe_df], axis=1)

# Split combined data back into train and test sets
X = combined_data.iloc[:len(X)]
test_features = combined_data.iloc[len(X):]

# Split the training data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Validation
val_predictions = model.predict(X_val)
val_mape = mean_absolute_percentage_error(y_val, val_predictions)
print(f"Validation MAPE: {val_mape}")

# Make predictions on the test set
test_predictions = model.predict(test_features)

# Prepare submission file
test_data['num_sold'] = test_predictions
submission = test_data[['id', 'num_sold']]
submission.to_csv('submission.csv', index=False)
print("Submission file created.")


Validation MAPE: 0.7784617036751882
Submission file created.
