<font size= '10'><b>Load the data</font></b>

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from datetime import datetime, timedelta
# Load the merged cleaned data from Task 1
data = pd.read_csv('C:/Users/User/Desktop/Week-4/notebooks/cleaned_data.csv') 
print(data.columns)


Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval', 'Year', 'Month', 'Day'],
      dtype='object')


<font size= '10'><b>Convert Date to datetime</font></b>

In [8]:
data['Date'] = pd.to_datetime(data['Date'])

data['Weekday'] = data['Date'].dt.weekday
data['Weekend'] = (data['Weekday'] >= 5).astype(int)  # 1 if weekend, 0 if weekday
data['DayOfMonth'] = data['Date'].dt.day
data['Is_Beginning_Month'] = (data['DayOfMonth'] <= 10).astype(int)
data['Is_Mid_Month'] = ((data['DayOfMonth'] > 10) & (data['DayOfMonth'] <= 20)).astype(int)
data['Is_End_Month'] = (data['DayOfMonth'] > 20).astype(int)

data.fillna(0, inplace=True)

numeric_features = ['Open', 'Promo', 'CompetitionDistance', 'DayOfMonth', 'Weekday']
categorical_features = ['Store', 'StateHoliday']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [4]:
print(X_train.columns)


Index(['Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday',
       'CompetitionDistance', 'Year', 'Month', 'Day'],
      dtype='object')


<font size= '6'><b>Building Models with Sklearn Pipelines</font></b>

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Define features and target
features = numeric_features + categorical_features
X = data[features]
y = data['Sales']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Fit the model
pipeline.fit(X_train, y_train)


<font size= '6'><b>Choosing a Loss Function</font></b>

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'MSE: {mse}, MAE: {mae}')

<font size= '6'><b>Post Prediction Analysis</font></b>

In [None]:
import matplotlib.pyplot as plt

# Get feature importances
importances = pipeline.named_steps['model'].feature_importances_
feature_names = numeric_features + list(pipeline.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_features))

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_names, importances)
plt.xlabel('Importance')
plt.title('Feature Importance')
plt.show()

<font size= '6'><b>Serialize Models</font></b>

In [None]:
import joblib
import datetime

# Get current timestamp
timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
filename = f"model_{timestamp}.pkl"

# Save the model
joblib.dump(pipeline, filename)
