### Time series model to run Prophet
- Objectives: To run a prediction on the uploaded dataset, and append the predictions to the df. Function should be consolidated into a single python function (.py file)

In [None]:
import pandas as pd
from prophet import Prophet
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from flask import jsonify

df = pd.read_csv('final_mock_data.csv')
df['Date'] = pd.to_datetime(df['Date'])
df.head(3)

Unnamed: 0,Date,campaign_id,channel,age_group,ad_spend,views,leads,new_accounts,country,revenue
0,2022-01-02,January_2022_1,Influencer,18-24,458,13262.086957,265.241739,53.048348,Singapore,597.391304
1,2022-01-02,January_2022_1,Influencer,18-24,231,6688.956522,133.77913,26.755826,Malaysia,301.304348
2,2022-01-02,January_2022_1,Influencer,18-24,271,7847.217391,156.944348,31.38887,Indonesia,353.478261


### Combined function

In [112]:
def prophet(from_date, to_date):
    
    df = pd.read_csv('final_mock_data.csv')
    df['Date'] = pd.to_datetime(df['Date'])
    df = df[df['Date'].isin(pd.date_range(start=from_date, end=to_date))]
    time_period = 4
    # takes in df_grouped, makes df_plotting

    df_grouped = df.resample("M", on='Date').agg({
        'ad_spend': 'sum',
        "new_accounts": "sum",
        "revenue": "sum",
        }).reset_index()

    df_plotting = df_grouped[['Date', 'revenue', 'ad_spend', 'new_accounts']]
    df_plotting.rename(columns={'Date': 'ds'}, inplace=True)

    def prophet_forecast(df_plotting, df_grouped, predict_col, time_period):

        # Prepare data for Prophet
        df_prophet = df_grouped.rename(columns={'Date': 'ds', predict_col: 'y'})
        df_grouped.rename(columns={'Date': 'ds'}, inplace=True) #for subsequent merging

        # Initialize Prophet model
        model = Prophet()
        model.fit(df_prophet)

        # Create future dataframe for prediction
        future = model.make_future_dataframe(periods=time_period, freq='M')

        # Make predictions
        forecast = model.predict(future)
        forecast_snippet = forecast[['ds', 'yhat']][-time_period:]

        # Add future months to df_plotting
        last_date = df_grouped['ds'].max()
        future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=time_period, freq='M')
        future_df = pd.DataFrame({'ds': future_dates})

        # Only add empty values for columns that don't already exist
        for col in ['revenue', 'ad_spend', 'new_accounts']:
            if col not in future_df.columns:
                future_df[col] = np.nan

        df_plotting = pd.concat([df_plotting, future_df], ignore_index=True)

        # Update predictions in df_plotting
        for idx, row in forecast_snippet.iterrows():
            df_plotting.loc[df_plotting['ds'] == row['ds'], predict_col] = row['yhat']

        return df_plotting

    # Get individual forecasts
    df_plotting_rev = prophet_forecast(df_plotting,df_grouped, 'revenue', 4)
    df_plotting_ad = prophet_forecast(df_plotting,df_grouped, 'ad_spend', 4)
    df_plotting_accounts = prophet_forecast(df_plotting,df_grouped, 'new_accounts', 4)

    # Combine the forecasts
    df_plotting_combined = pd.DataFrame()
    df_plotting_combined['ds'] = df_plotting_rev['ds']
    df_plotting_combined['revenue'] = df_plotting_rev['revenue'] 
    df_plotting_combined['ad_spend'] = df_plotting_ad['ad_spend']
    df_plotting_combined['new_accounts'] = df_plotting_accounts['new_accounts']

    df_plotting = df_plotting_combined

    return df_plotting

df2 = prophet('2022-01-01', '2024-09-30')
print(df2)


  df_grouped = df.resample("M", on='Date').agg({
16:12:00 - cmdstanpy - INFO - Chain [1] start processing
16:12:00 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
  future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=time_period, freq='M')
16:12:00 - cmdstanpy - INFO - Chain [1] start processing
16:12:00 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
  future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=time_period, freq='M')
16:12:00 - cmdstanpy - INFO - Chain [1] start processing
16:12:00 - cmdstanpy - INFO - Chain [1] done processing


           ds       revenue       ad_spend   new_accounts
0  2022-01-31  9.912437e+05  716465.000000   88022.442794
1  2022-02-28  7.855560e+05  567633.000000   69757.373214
2  2022-03-31  8.092905e+05  585829.000000   71864.997089
3  2022-04-30  8.495003e+05  570309.000000   75435.628571
4  2022-05-31  1.085712e+06  728958.000000   96411.181460
5  2022-06-30  8.745030e+05  587230.000000   77655.865043
6  2022-07-31  1.193128e+06  729732.000000  105949.723351
7  2022-08-31  9.460965e+05  578979.000000   84013.371068
8  2022-09-30  9.540424e+05  584161.000000   84718.968835
9  2022-10-31  1.375859e+06  725155.000000  122176.272605
10 2022-11-30  1.090929e+06  576348.000000   96874.482531
11 2022-12-31  1.106950e+06  582228.000000   98297.203554
12 2023-01-31  1.303748e+06  721973.000000  115772.821596
13 2023-02-28  1.009867e+06  561171.000000   89676.192618
14 2023-03-31  1.034901e+06  574177.000000   91899.199755
15 2023-04-30  1.276257e+06  713323.000000  113331.611681
16 2023-05-31 

  dates = pd.date_range(
  future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=time_period, freq='M')


### Walk forward validation to test Prophet

In [None]:
# Define walk-forward validation parameters
initial_train_size = 12  # Use the first 12 months for initial training
test_window = 1          # Predict 1 month ahead in each iteration

mape_list = []  # Store MAPE for each step
rmse_list = []  # Store RMSE for each step
actuals = []
predictions = []

df_prophet = df_grouped.rename(columns={'Date': 'ds', 'revenue': 'y'}) #prediction by months

# Walk-forward loop
for i in range(initial_train_size, len(df_prophet) - test_window + 1):
    train = df_prophet.iloc[:i]  # Expand training set
    test = df_prophet.iloc[i:i + test_window]  # Next time step
    
    # Train Prophet model
    model = Prophet()
    model.add_regressor('ad_spend') 
    model.fit(train)
    
    # Prepare future dataframe
    future = test[['ds', 'ad_spend']]
    
    # Predict
    forecast = model.predict(future)
    y_pred = forecast['yhat'].values[0]
    y_true = test['y'].values[0]
    
    # Store actual and predicted values
    actuals.append(y_true)
    predictions.append(y_pred)
    
    # Compute RMSE
    rmse = np.sqrt(mean_squared_error([y_true], [y_pred]))
    rmse_list.append(rmse)

    # Compute MAPE (avoid division by zero)
    if y_true != 0:
        mape = np.abs((y_true - y_pred) / y_true) * 100
    else:
        mape = np.nan  # Ignore cases where actual value is zero

    mape_list.append(mape)

# Calculate average RMSE
avg_rmse = np.mean(rmse_list)
avg_mape = np.nanmean(mape_list)  # Ignore NaN values in MAPE

# Print results
print(f"Average RMSE: {avg_rmse:.2f}")
print(f"Average MAPE: {avg_mape:.2f}%") #14.32% with ad_spend, 22.22% without ad_spend

13:49:57 - cmdstanpy - INFO - Chain [1] start processing
13:49:57 - cmdstanpy - INFO - Chain [1] done processing
13:49:57 - cmdstanpy - INFO - Chain [1] start processing
13:49:57 - cmdstanpy - INFO - Chain [1] done processing
13:49:57 - cmdstanpy - INFO - Chain [1] start processing
13:49:57 - cmdstanpy - INFO - Chain [1] done processing
13:49:57 - cmdstanpy - INFO - Chain [1] start processing
13:49:57 - cmdstanpy - INFO - Chain [1] done processing
13:49:57 - cmdstanpy - INFO - Chain [1] start processing
13:49:57 - cmdstanpy - INFO - Chain [1] done processing
13:49:57 - cmdstanpy - INFO - Chain [1] start processing
13:49:57 - cmdstanpy - INFO - Chain [1] done processing
13:49:57 - cmdstanpy - INFO - Chain [1] start processing
13:49:57 - cmdstanpy - INFO - Chain [1] done processing
13:49:57 - cmdstanpy - INFO - Chain [1] start processing
13:49:57 - cmdstanpy - INFO - Chain [1] done processing
13:49:57 - cmdstanpy - INFO - Chain [1] start processing
13:49:57 - cmdstanpy - INFO - Chain [1]

Average RMSE: 141551.26
Average MAPE: 14.32%


### Transformation of results for plotting
- retain: Date, ad_spend, revenue

In [5]:
# apply prophet on entire dataset, predicting 2 months ahead, append into a df
future = model.make_future_dataframe(periods=2, freq='M')
forecast = model.predict(future)

# merge with original df, and print df


  dates = pd.date_range(


ValueError: Regressor 'ad_spend' missing from dataframe