In [None]:
# This cell only needs to be run once
!pip install scikit-learn
!brew install libomp
!pip install lightgbm

In [None]:
# This cell needs to be run every time the notebook is opened
import lightgbm as lgb
import sklearn as sklearn
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the CSV, aggregate revenue per (date, venue), and derive the model features
# in one pipeline. The groupby sorts its keys, so the resulting rows are ordered by
# business_date and then venue_xref_id.
df = (
    pd.read_csv("Q3_a.csv", parse_dates=["business_date"])
    # Total sales per day per venue
    .groupby(["business_date", "venue_xref_id"])["sales_revenue_with_tax"]
    .sum()
    .reset_index()
    # Make sure the date column is datetime before using the .dt accessor
    .assign(business_date=lambda d: pd.to_datetime(d["business_date"]))
    .assign(
        # Monday = 0, Sunday = 6
        day_of_week=lambda d: d["business_date"].dt.dayofweek,
        # Revenue from the previous row of the same venue, i.e. the previous date
        # on which that venue has data (not necessarily the previous calendar day)
        prev_day_sales=lambda d: d.groupby("venue_xref_id")["sales_revenue_with_tax"].shift(1),
    )
    # The first row of every venue has no previous value; drop rows with missing data
    .dropna()
)

### Part (a): Seasonality attributes

In [4]:
# Train/Test Split
# Features: weekday index and the lagged revenue; target: the revenue itself.
X, y = df[["day_of_week", "prev_day_sales"]], df["sales_revenue_with_tax"]

# shuffle=False preserves row order: the first 80% of rows become the training set
# and the final 20% the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [6]:
# Train LightGBM
# Gradient-boosted regressor with 100 trees and a learning rate of 0.1.
model = lgb.LGBMRegressor(
    learning_rate=0.1,
    n_estimators=100,
)
# Fit on the training portion only; the fitted estimator is echoed as the cell result.
model.fit(X=X_train, y=y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000325 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 262
[LightGBM] [Info] Number of data points in the train set: 73411, number of used features: 2
[LightGBM] [Info] Start training from score 3578.175910


In [10]:
# This is a sample of how the input data is formatted: the held-out feature frame,
# with the two columns the model was trained on (day_of_week and prev_day_sales).
print(X_test)

       day_of_week  prev_day_sales
74011            4         3131.10
74012            4         1086.05
74013            4          947.75
74014            4          971.37
74015            4         1487.16
...            ...             ...
92360            1         1647.50
92361            1          699.01
92362            1         1405.70
92363            1          555.00
92364            1         1648.68

[18353 rows x 2 columns]


In [None]:
# This is how the model can be used to get the output:
# pass a feature frame (here the held-out X_test) to predict() and print the resulting array.
y_pred = model.predict(X_test)
print(y_pred)
# Once you have the output array, you can use the input array and output array to make visualizations.

[4689.68501309 1763.51389503 1617.46126185 ... 1485.74855663  792.71763318
 1676.20027187]


In [None]:
import ipywidgets as widgets
from IPython.display import display

# Date pickers for the start and end of the range to inspect
start_date = widgets.DatePicker(description="Start Date:")
end_date = widgets.DatePicker(description="End Date:")

# Dedicated output area for the callback. Text and figures produced inside a widget
# callback are not reliably routed to the cell's output area (this depends on the
# front end), so they are captured here explicitly and shown below the pickers.
plot_output = widgets.Output()


def on_date_selected(change):
    """Redraw the actual-vs-predicted sales chart for the selected date range.

    Registered as an observer on both date pickers, so it runs whenever either
    value changes. Nothing happens until both pickers hold a value.

    Parameters
    ----------
    change : dict
        Change notification passed by the widget framework; it is not used,
        the current picker values are read directly.

    Notes
    -----
    ``df`` has one row per (date, venue), so actual and predicted revenue are
    summed per date before plotting; the chart shows daily totals across venues.
    The selected range may include dates the model was trained on, in which case
    the fit will look better than it would on unseen data.
    """
    # Wait until both ends of the range have been chosen
    if not (start_date.value and end_date.value):
        return

    start = pd.to_datetime(start_date.value)
    end = pd.to_datetime(end_date.value)

    with plot_output:
        # Replace the previous message/figure instead of stacking a new one below it
        plot_output.clear_output(wait=True)

        # Filter data for the selected date range (inclusive on both ends)
        filtered_df = df[(df["business_date"] >= start) & (df["business_date"] <= end)]

        if filtered_df.empty:
            print("No data available for the selected range.")
            return

        # Predict per (date, venue) row, then aggregate to one total per date so
        # each date contributes a single point to each line
        X_selected = filtered_df[["day_of_week", "prev_day_sales"]]
        daily_totals = (
            filtered_df[["business_date", "sales_revenue_with_tax"]]
            .assign(predicted_sales=model.predict(X_selected))
            .groupby("business_date")[["sales_revenue_with_tax", "predicted_sales"]]
            .sum()
        )

        # Plot actual vs predicted sales
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.plot(
            daily_totals.index,
            daily_totals["sales_revenue_with_tax"],
            label="Actual Sales",
            marker="o",
        )
        ax.plot(
            daily_totals.index,
            daily_totals["predicted_sales"],
            label="Predicted Sales",
            marker="x",
            linestyle="dashed",
        )
        ax.set_xlabel("Date")
        ax.set_ylabel("Sales Revenue")
        ax.set_title("Sales Prediction vs Actual Sales")
        ax.legend()
        ax.tick_params(axis="x", labelrotation=45)
        ax.grid()
        plt.show()


# Attach event listeners
start_date.observe(on_date_selected, names="value")
end_date.observe(on_date_selected, names="value")

# Display widgets, with the output area directly beneath them
display(start_date, end_date, plot_output)