## Store Sales - Time Series Forecasting

In [None]:
# Import libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load every CSV file of the competition into its own DataFrame.
df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"/kaggle/input/store-sales-time-series-forecasting/train.csv",            # df1
        r"/kaggle/input/store-sales-time-series-forecasting/test.csv",             # df2
        r"/kaggle/input/store-sales-time-series-forecasting/stores.csv",           # df3
        r"/kaggle/input/store-sales-time-series-forecasting/transactions.csv",     # df4
        r"/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv",  # df5
        r"/kaggle/input/store-sales-time-series-forecasting/oil.csv",              # df6
    )
)

In [None]:
# Preview the first rows of the training frame (train.csv).
df1.head()

In [None]:
# Structural overview of the training frame: columns, dtypes and non-null counts.
df1.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the training frame.
df1.describe()

In [None]:
# How many training rows exist for each store number.
df1['store_nbr'].value_counts()

### Data visualization and processing

In [None]:
# Sales against store number over the whole training frame.
sns.lineplot(data=df1, x="store_nbr", y="sales")

In [None]:
# Sales against store number for a 300-row slice of the training frame,
# with one line per product family.
sample_rows = df1.iloc[10000:10300]
sns.lineplot(data=sample_rows, x="store_nbr", y="sales", hue="family")
plt.show()

In [None]:
# Line plot of the oil table (oil.csv). No x/y is given, so the whole frame
# is handed to seaborn as wide-form data.
# NOTE(review): presumably meant to show the oil price over time - passing an
# explicit x/y with parsed dates would make that intent clear; confirm columns.
sns.lineplot(data=df6, palette="tab10", linewidth=2.5)

In [None]:
# Line plot of the transactions table (transactions.csv). No x/y is given, so
# the whole frame is handed to seaborn as wide-form data.
# NOTE(review): presumably meant to show transactions over time - an explicit
# x/y would make that intent clear; confirm column names.
sns.lineplot(data=df4, palette="tab10", linewidth=2.5)

In [None]:
# Distribution of the target variable.
# `sns.distplot` is deprecated (the notice is hidden here because warnings are
# silenced at the top of the notebook). `histplot` with a KDE overlay on a
# density scale is the documented replacement for the same figure.
sns.histplot(df1['sales'], kde=True, stat="density", kde_kws=dict(cut=3))
plt.show()

In [None]:
# Stack the train and test frames and discard the row identifier.
# NOTE(review): presumably the test rows carry no `sales` value, in which case
# the later `df.dropna()` removes them again - confirm that this is intended.
df = pd.concat([df1, df2]).drop(columns='id')

In [None]:
# Columns treated as categorical identifiers.
categorical_cols = ['store_nbr', 'family']

# Parse the date strings into real timestamps so the .dt accessor can be used.
df['date'] = pd.to_datetime(df['date'])

# Report how many distinct values each categorical column holds.
for col in categorical_cols:
    print(f"Unique values in {col}: {df[col].nunique()}")

In [None]:
# Feature Engineering: calendar attributes derived from the parsed date column.
date_parts = df['date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['day'] = date_parts.day
df['day_of_week'] = date_parts.dayofweek  # Monday=0 ... Sunday=6
# Saturday (5) and Sunday (6) are flagged as weekend with 1, all other days 0.
df['weekend'] = (df['day_of_week'] >= 5).astype(int)

In [None]:
# Data Visualization on sales: every row's sales value plotted against its date.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(df['date'], df['sales'], label='Sales')
ax.set_title('Sales Trends Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Sales')
ax.legend()
plt.show()

In [None]:
# Relationship between the number of promoted items and sales; points are
# semi-transparent so dense regions stay readable.
plt.scatter(x=df['onpromotion'], y=df['sales'], alpha=0.5)
plt.show()

In [None]:
# Average sales for each calendar month, shown as a bar chart.
p = df.groupby('month')['sales'].mean()
p.plot.bar()
plt.title('sales by month')

In [None]:
# Count the missing values in every column of the combined frame.
df.isna().sum()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Re-check the per-column missing-value counts after the rows were dropped.
df.isna().sum()

In [None]:
# The raw timestamp is no longer needed once the calendar features exist.
df = df.drop(columns='date')

In [None]:
# Encoder that turns text labels into integer codes; it is fitted on the
# `family` column in the next cell.
lab = LabelEncoder()

In [None]:
# Replace the textual family names with the integer codes learned by `lab`,
# then display the resulting frame.
df['family'] = lab.fit_transform(df['family'])
df

In [None]:
# Pairwise relationship grid for the columns at positions 1 to 4 of the frame.
# NOTE(review): this is drawn from every remaining row, which may be slow on a
# large frame - consider plotting a random sample instead.
sns.pairplot(df.iloc[:,1:5])

### Split data for testing and training

In [None]:
# Target vector: the sales figure of each row.
y = df['sales']
# Feature matrix: every remaining column.
x = df.drop(columns=['sales'])

In [None]:
# Hold out the most recent 25% of the observations instead of a random sample.
# With a shuffled split the model is trained on days that come *after* some of
# the days it is evaluated on, which leaks future information and inflates the
# score of a forecasting model.
# The raw date column has already been dropped, so the calendar features that
# were derived from it are used to put the rows in chronological order
# (np.lexsort is stable: rows of the same day keep their relative order).
chronological = np.lexsort((x['day'], x['month'], x['year']))
X_train, X_test, Y_train, Y_test = train_test_split(
    x.iloc[chronological],
    y.iloc[chronological],
    test_size=0.25,
    shuffle=False,  # keep time order: first 75% train, last 25% test
)

In [None]:
# Helper functions that make it easy to train and evaluate any model
def kfolds(model, model_name):
    """Print the mean 10-fold cross-validation score of ``model``.

    The estimator is cross-validated on the module-level feature matrix ``x``
    and target ``y``. The per-fold scores are averaged and printed as a
    percentage, prefixed with ``model_name``. Nothing is returned.

    Args:
        model: Estimator accepted by ``cross_val_score``.
        model_name: Label used in the printed message.
    """
    fold_scores = cross_val_score(model, x, y, cv=10)
    model_score = fold_scores.mean()
    print(f"{model_name} score on cross validation: {model_score * 100}%")
def model_train(model, model_name):
    """Fit ``model`` on the training split and print its train and test scores.

    Uses the module-level ``X_train``/``Y_train`` for fitting and scoring, and
    ``X_test``/``Y_test`` for the hold-out score. Both values come from the
    estimator's own ``score`` method and are printed as percentages. The
    estimator is fitted in place; nothing is returned.

    Args:
        model: Estimator exposing ``fit`` and ``score``.
        model_name: Label used in the printed message.
    """
    model.fit(X_train, Y_train)

    # Score on the data seen during fitting, then on the held-out split.
    model_train_score = model.score(X_train, Y_train)
    model_test_score = model.score(X_test, Y_test)

    print(f"{model_name} model score on Train= {model_train_score *100}%\n{model_name} model score on Test= {model_test_score*100}%")

In [None]:
# Baseline: linear regression with default settings.
model = LinearRegression()
# Fit on the training split and print the train/test scores.
model_train(model,'LinearRegression')
# Print the mean 10-fold cross-validation score on the full x / y.
kfolds(model,'LinearRegression')

In [None]:
# Gradient-boosted trees (XGBoost) with default hyperparameters.
model_XG = XGBRegressor()
# Fit on the training split and print the train/test scores.
model_train(model_XG, "XGBRegressor")
# Print the mean 10-fold cross-validation score on the full x / y.
kfolds(model_XG, "XGBRegressor")

In [None]:
# Single decision tree with default hyperparameters.
# NOTE(review): the variable name suggests a random forest ("RFR"), but the
# estimator created here is a DecisionTreeRegressor.
model_RFR = DecisionTreeRegressor()
# Fit on the training split and print the train/test scores.
model_train(model_RFR,'DecisionTreeRegressor')
# Print the mean 10-fold cross-validation score on the full x / y.
kfolds(model_RFR,'DecisionTreeRegressor')

In [None]:
# Ridge regression with default settings.
# The estimator is a plain Ridge model - no polynomial feature expansion is
# applied anywhere - so the printed label names the model that is actually
# being scored (it previously read 'PolynomialFeatures').
mode_ploy = Ridge()
# Fit on the training split and print the train/test scores.
model_train(mode_ploy,'Ridge')
# Print the mean 10-fold cross-validation score on the full x / y.
kfolds(mode_ploy,'Ridge')

### We note that the best model is XGBRegressor

In [None]:
# Predictions of the fitted XGBoost model for the held-out split.
pred_xgb = model_XG.predict(X_test)

In [None]:
# Display the predicted values.
pred_xgb

In [None]:
from sklearn.metrics import mean_squared_error

### Save prediction results to csv file

In [None]:
# Hold-out error of the fitted XGBoost model, in the units of `sales`.
mse_xgb = mean_squared_error(Y_test, pred_xgb)
xgb_rmse = np.sqrt(mse_xgb)
print("rmse_xgb:", xgb_rmse)

# Build the submission from the competition test set (df2).
# Writing `pred_xgb` would only save predictions for rows held out from the
# training data, without any identifier linking them to the test rows.
# NOTE(review): assumes test.csv has the `id`, `date` and `family` columns
# that train.csv has - confirm against the data.
test_features = df2.copy()
test_dates = pd.to_datetime(test_features['date'])
test_features['year'] = test_dates.dt.year
test_features['month'] = test_dates.dt.month
test_features['day'] = test_dates.dt.day
test_features['day_of_week'] = test_dates.dt.dayofweek
test_features['weekend'] = test_features['day_of_week'].isin([5, 6]).astype(int)
# Reuse the encoder fitted on the training families so the codes agree.
test_features['family'] = lab.transform(test_features['family'])
# Same columns, in the same order, as the matrix the model was trained on.
test_features = test_features[x.columns]

test_pred = model_XG.predict(test_features)
# NOTE(review): forecasts below zero are clipped to zero on the assumption
# that sales cannot be negative - confirm.
test_pred = np.clip(test_pred, 0, None)

submission = pd.DataFrame({'id': df2['id'], 'sales': test_pred})
print(submission)
submission.to_csv('submission1s.csv', index=False)