In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

# Directory holding the input CSV files. Kept in one place so the notebook can
# be pointed at another copy of the data by editing a single line.
DATA_DIR = "/kaggle/input/store-sales-time-series-forecasting"

# Table with the target column and the table of rows to predict.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# Lookup tables that are joined onto train/test further down.
transactions = pd.read_csv(f"{DATA_DIR}/transactions.csv")
stores = pd.read_csv(f"{DATA_DIR}/stores.csv")
oil = pd.read_csv(f"{DATA_DIR}/oil.csv")
holidays_events = pd.read_csv(f"{DATA_DIR}/holidays_events.csv")

In [None]:
# Mean-value imputer; it is fitted and applied to the test features further down.
imputer = SimpleImputer(strategy='mean')

In [None]:
# Preview the first rows of the oil table.
oil.head()

In [None]:
# Preview the first rows of the training table as loaded, before any joins.
display(train.head())

In [None]:
# Column names, dtypes and non-null counts of the training table.
train.info()

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# Preview the first rows of the test table as loaded, before any joins.
display(test.head())

In [None]:
# Column names, dtypes and non-null counts of the test table.
test.info()

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

In [None]:
# A left join on 'date' repeats every matching train/test row once per
# holidays_events row for that date. If a date carries more than one event,
# training rows get duplicated and the test frame no longer has one row per
# 'id'. Keep a single event row per date before joining. Which event survives
# is arbitrary (the first listed); the feature list used for the model in this
# notebook does not include any holiday column.
holidays_by_date = holidays_events.drop_duplicates(subset='date', keep='first')


def _attach_lookups(frame):
    """Left-join the store, transaction, oil and holiday columns onto ``frame``.

    The joins run in the order stores -> transactions -> oil -> holidays, so
    overlapping column names receive the default pandas ``_x`` / ``_y``
    suffixes.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with at least the ``store_nbr`` and ``date`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the lookup columns appended and exactly as many rows
        as ``frame``.

    Raises
    ------
    AssertionError
        If a lookup table has duplicate join keys and the joins therefore
        changed the number of rows.
    """
    merged = pd.merge(frame, stores, how='left', on='store_nbr')
    merged = pd.merge(merged, transactions, how='left', on=['date', 'store_nbr'])
    merged = pd.merge(merged, oil, how='left', on='date')
    merged = pd.merge(merged, holidays_by_date, how='left', on='date')
    # Left joins must only add columns, never rows.
    assert len(merged) == len(frame), (
        f"lookup joins changed the row count from {len(frame)} to {len(merged)}"
    )
    return merged


train = _attach_lookups(train)
test = _attach_lookups(test)

In [None]:
# Feature engineering: add the weekday number parsed from the 'date' column
# to both tables (pandas numbers Monday as 0 and Sunday as 6).
for frame in (train, test):
    frame['day_of_week'] = pd.to_datetime(frame['date']).dt.weekday

In [None]:
# Check the training table after the joins and the day_of_week column were added.
train.head()

In [None]:
# Check the test table after the joins and the day_of_week column were added.
test.head()

In [None]:
# train['date'] is still the string column read from the CSV, so passing it to
# plt.plot makes matplotlib build a categorical axis with one tick per
# distinct string. Drawing one line through every raw row is also unreadable.
# Plot the mean promotion count per calendar day on a real datetime axis
# instead. The mean is unaffected if all rows of a day are repeated equally.
daily_promo = train.groupby(pd.to_datetime(train['date']))['onpromotion'].mean()

plt.plot(daily_promo.index, daily_promo.values)
plt.xlabel('date')
plt.ylabel('onpromo')
plt.show()

In [None]:
# Training target on the log1p scale, i.e. log(1 + sales). Predictions of a
# model fitted on this column must be mapped back with np.expm1 to be sales.
train['log_sales'] = np.log1p(train['sales'])

In [None]:
# Same promotion-over-time view as the earlier plotting cell, repeated here.
# train['date'] holds strings, which matplotlib would treat as categories, so
# the rows are reduced to a mean promotion count per day and drawn on a
# datetime axis.
daily_promo = train.groupby(pd.to_datetime(train['date']))['onpromotion'].mean()

plt.plot(daily_promo.index, daily_promo.values)
plt.xlabel('date')
plt.ylabel('onpromo')
plt.show()

In [None]:
# Promotion count of every training row against its weekday number.
fig, ax = plt.subplots()
ax.scatter(train['day_of_week'], train['onpromotion'])
plt.show()

In [None]:
# Columns fed to the model; the same list is used to build both the training
# and the test design matrices.
features = ['store_nbr', 'onpromotion', 'day_of_week', 'cluster', 'dcoilwtico']

In [None]:
# Test design matrix, built from the same feature list as the training one.
X_test = pd.get_dummies(test[features])

# Training design matrix: encode, then discard every row that still has a
# missing feature value. The surviving index labels select the matching
# log-scale targets so features and targets stay aligned.
encoded_rows = pd.get_dummies(train[features]).dropna()
log_targets = train.loc[encoded_rows.index, 'log_sales']

# Random 80/20 split into fitting and validation parts (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    encoded_rows,
    log_targets,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the gradient-boosting regressor; fit() returns the estimator itself.
model = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows, on the same scale as y_train.
y_predict = model.predict(X_valid)

# The targets are log1p(sales), so the root of the mean squared error on this
# scale is the root mean squared logarithmic error of the sales.
validation_mse = mean_squared_error(y_valid, y_predict)
rmsle = np.sqrt(validation_mse)

In [None]:
# Report the validation score (fixes the missing space in "Error on").
print(f'Root Mean Squared Logarithmic Error on validation set: {rmsle}')

In [None]:
features_train = ['onpromotion', 'cluster', 'dcoilwtico', 'store_nbr', 'day_of_week']

X_Test_encoded = pd.get_dummies(X_test)

# Any column the model was trained on but the test matrix lacks is added as a
# constant 0 column, so the test matrix offers every training column.
missing_cols = set(X_train.columns).difference(X_Test_encoded.columns)
X_Test_encoded = X_Test_encoded.assign(**{col: 0 for col in missing_cols})

In [None]:
# Put the test columns in exactly the training column order (this also drops
# any column the model never saw), then report the remaining gaps per column.
X_Test_encoded = X_Test_encoded.loc[:, X_train.columns]
null_counts = X_Test_encoded.isna().sum()
print(null_counts)

In [None]:
# Fill the gaps in the test features with column means.
# NOTE(review): the imputer is fitted on the test features themselves, so the
# means come from the test rows, not from the training data.
imputed_values = imputer.fit_transform(X_Test_encoded)

# fit_transform returns a plain array; wrap it again with the column labels.
X_Test_encoded = pd.DataFrame(imputed_values, columns=X_Test_encoded.columns)
print(X_Test_encoded.isna().sum())

In [None]:
# Neither of these two names is read again by the cells shown in this notebook.
features_test = ['store_nbr', 'onpromotion', 'day_of_week', 'cluster', 'dcoilwtico']
X_Test = X_test.loc[:, features]

# Predictions on the test set. The model was fitted on the log1p(sales)
# target, so these values are on that log scale.
y_predict_test = model.predict(X_Test_encoded)

In [None]:
# The model was trained on log1p(sales), so y_predict_test is on the log
# scale. Undo the transform with expm1 before writing it out as 'sales';
# otherwise the file holds log values labelled as sales. A log-scale
# prediction below zero would map to negative sales, so the result is
# clipped at zero.
predicted_sales = np.clip(np.expm1(y_predict_test), 0, None)

submission = pd.DataFrame({
    'id': test['id'],
    'sales': predicted_sales,
})
submission.to_csv('s.csv', index=False)

In [None]:
# Preview the first rows of the submission table that was written to s.csv.
display(submission.head())