# <center>AdEase Case Study</center>

# Introduction
- AdEase is an ads and marketing-based company helping businesses elicit maximum clicks @ minimum cost.
- AdEase is an ad infrastructure to help businesses promote themselves easily, effectively, and economically
- AdEase is trying to understand the per page view report for different wikipedia
pages for 550 days, and forecasting the number of views so that you can predict and optimize the ad placement for your clients.
- By leveraging data science and time series, Ad Ease can forecast page visits for different languages.

# What is expected?
- You are working in the Data Science team of Ad ease trying to understand the per page view report for different wikipedia pages for 550 days, and forecasting the number of views so that you can predict and optimize the ad placement for your clients. You are provided with the data of 145k wikipedia pages and daily view count for each of them. Your clients belong to different regions and need data on how their ads will perform on pages in different languages.

## 1. Data Ingestion

- Read data from gdrive

In [None]:
import os
import gdown
import zipfile

# file_id = "1AbCDEfGhIJklMNopQRstuVWxyz12345"
# output_path = "train_1.csv"          # rename if needed
# gdown.download(f"https://drive.google.com/uc?id={file_id}", output_path, quiet=False)

# Google Drive id of the zipped dataset and the local file name it is saved as.
zip_id = "11uLnI8MB1BSMzzI4ox1jK7jbAwsxdbqo"
zip_path = "train_1.zip"

# Reuse an archive that is already on disk instead of fetching it again.
if os.path.exists(zip_path):
    print(f"{zip_path} already exists. Skipping download.")
else:
    gdown.download(f"https://drive.google.com/uc?id={zip_id}", zip_path, quiet=False)

# The archive is unpacked straight into the working directory (no subfolder).
# The presence of the main CSV is the marker that extraction already happened.
expected_file = "train_1.csv"
if os.path.exists(expected_file):
    print(f"{expected_file} already present. Skipping extraction.")
else:
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(path=".")
    print("Extraction complete to current directory.")

## 2.Libraries
Required Libraries

In [None]:
# libraries to analyze data
import numpy as np
import pandas as pd

# libraries to visualize data
import matplotlib.pyplot as plt
import seaborn as sns

import re

import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

from sklearn.metrics import (
    mean_squared_error as mse,
    mean_absolute_error as mae,
    mean_absolute_percentage_error as mape
)


from statsmodels.tsa.arima.model import ARIMA

## 3. Import Data

In [None]:
# read the file into a pandas dataframe
df = pd.read_csv('train_1.csv')
# look at the datatypes of the columns
print('*************************************************')
print(df.info())
print('*************************************************\n')
print('*************************************************')
print(f'Shape of the dataset is {df.shape}')
print('*************************************************\n')
print('*************************************************')
print(f'Number of nan/null values in each column: \n{df.isna().sum()}')
print('*************************************************\n')


In [None]:
# Number of distinct values in every column.
unique_counts = df.nunique()
print(f'Number of unique values in each column: \n{unique_counts}')
print('*************************************************\n')
print('*************************************************')
# Tally of duplicated vs. non-duplicated rows.
duplicate_tally = df.duplicated().value_counts()
print(f'Duplicate entries: \n{duplicate_tally}')

In [None]:
# Preview the first 20 rows of the page-view table.
df.head(20)

In [None]:
# Summary statistics with pandas' default column selection.
df.describe()

In [None]:
# Summary (count / unique / top / freq) restricted to the object-dtype columns.
df.describe(include='object')

### Observation
- There are **145063** entries with 551 columns,
- Which means there are 145063 wikipedia pages with views for 550 days
- There are null/missing values in each of the dates
- But there are no **duplicates**
- There are **145063** unique wikipedia pages

 reading Exog_Campaign_eng file containing flag for each date indicating 
 if those dates had a campaign/significant event which could have influenced
 the page views

In [None]:
file_id = "1GvWoXIxe1RaMWMSp1nNOw46Nxh_7vdzE"
output_path = "Exog_Campaign_eng"          # rename if needed
# Download only when the file is missing, mirroring the guard used for the
# main dataset, so re-running the notebook does not fetch it again.
if not os.path.exists(output_path):
    gdown.download(f"https://drive.google.com/uc?id={file_id}", output_path, quiet=False)
else:
    print(f"{output_path} already exists. Skipping download.")

In [None]:

# Read the campaign/event flags into a pandas DataFrame.
exog_en = pd.read_csv('Exog_Campaign_eng')
# Column dtypes. DataFrame.info() prints its own report and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
print('*************************************************')
exog_en.info()
print('*************************************************\n')
print('*************************************************')
print(f'Shape of the dataset is {exog_en.shape}')
print('*************************************************\n')
print('*************************************************')
print(f'Number of nan/null values in each column: \n{exog_en.isna().sum()}')
print('*************************************************\n')
print('*************************************************')
print(f'Number of unique values in each column: \n{exog_en.nunique()}')
print('*************************************************\n')
print('*************************************************')
print(f'Duplicate entries: \n{exog_en.duplicated().value_counts()}')

In [None]:
# Preview the first rows of the campaign flag table.
exog_en.head()

### Observation
- For every **550** entries in **Exog_Campaign_eng** there are corresponding 550 days in the **train_1.csv** dataset
- **No** null/missing values
- **2** unique values - 1 and 0

## 4. EDA

### 4.1 Date Columns

In [None]:
# All columns except the first one are treated as the per-date view counts.
data_columns = df.columns[1:]
# Count the missing values for every date and plot that count over time.
null_counts_by_date = df[data_columns].isna().sum()
null_counts_by_date.plot(figsize=(12, 6))
plt.show()

### Observation
- The number of null values keeps decreasing over time (towards later dates)
- We can infer that pages which are launched recently will not have views prior to launch
- We can fill those values with zeros.

In [None]:
# Treat a missing daily count as zero views (see the observation above).
df[data_columns] = df[data_columns].fillna(0)

In [None]:
# Re-count missing values per column after the fill.
df.isna().sum()

### 4.2 Extract information from page column

like
- page name
- Language
- domain
- Device type used to access data
- access origin

### 4.2.1 Extracting page name from Page column

In [None]:
# Look at 10 randomly sampled Page identifiers to see their format.
df.Page.sample(10)

The page column contains data in the below format: \
**SPECIFIC NAME _ LANGUAGE.wikipedia.org _ ACCESS TYPE _ ACCESS ORIGIN** \
having information about the page name, the domain, the device type used to access the
page, and also the request origin (spider or browser agent).

In [None]:
# def extract_name(page):
#     pattern = r'(.{0,})_(.{2}).wikipedia.org_'
#     result = re.findall(pattern, page)
#     if len(result) == 1:
#         return result[0][0]
#     else:
#         return 'unknown'
# df['name'] = df['Page'].apply(extract_name)
# df[['Page', 'name']].head(10)

**Why did we comment out the code above?**
- `re.findall` in the code above scans the entire page string and returns a list of every match
- `re.search` returns only the first match, which is sufficient here and faster

In [None]:
def extract_page_name(page):
    """Return the article-name part of a ``Page`` identifier.

    Identifiers have the form ``NAME_PROJECT_ACCESS_AGENT``, for example
    ``The_Beatles_en.wikipedia.org_desktop_all-agents``. The name may itself
    contain underscores, so it is everything before the last three
    underscore-separated fields, not everything before the first underscore.

    Parameters
    ----------
    page : str
        Raw value from the ``Page`` column.

    Returns
    -------
    The article name, or ``page`` unchanged when it is not a string or does
    not have at least three trailing underscore-separated fields.
    """
    # Non-string input (e.g. NaN) is passed through untouched.
    if not isinstance(page, str):
        return page
    # Greedy name, then exactly three underscore-free fields up to the end.
    match = re.search(r'^(.*)_[^_]+_[^_]+_[^_]+$', page)
    return match.group(1) if match else page
    
# Store the extracted article name in a new column and compare it with the
# raw identifier for the first 10 rows.
df['name'] = df.Page.apply(extract_page_name)
df[['Page', 'name']].head(10)

### 4.2.2 Extracting Language from Page column

re.search(r'_\w{2}\.wikipedia\.org')

In [None]:
def extract_langugage(page):
    """Return the two-character language code of a Wikipedia ``Page`` identifier.

    The code is the ``xx`` in ``_xx.wikipedia.org``.

    Parameters
    ----------
    page : str
        Raw value from the ``Page`` column.

    Returns
    -------
    str
        The language code, or ``'un'`` when ``page`` is not a string or does
        not contain a ``_xx.wikipedia.org`` segment.
    """
    # Explicit checks replace the former bare ``except``, which also hid
    # unrelated errors.
    if not isinstance(page, str):
        return 'un'
    match = re.search(r'_(\w{2})\.wikipedia\.org', page)
    return match.group(1) if match else 'un'
# Derive the language code for every page and list the distinct codes found.
df['language'] = df.Page.apply(extract_langugage)
print(df['language'].unique())

In [None]:
# Preview the table with the newly added columns.
df.head(10)

In [None]:
# Language code -> display name ('un' marks pages whose language is unknown).
language_name_mapping = {
    'zh': 'Chinese',
    'fr': 'French',
    'en': 'English',
    'un': 'unknown',
    'ru': 'Russian',
    'de': 'German',
    'ja': 'Japanese',
    'es': 'Spanish',
}
# replace() leaves values that are not keys untouched, so re-running this cell
# keeps the already-translated names. With map() a second run would turn every
# value into NaN. A code missing from the mapping stays visible as its raw
# code instead of silently becoming NaN.
df['language'] = df['language'].replace(language_name_mapping)
df['language'].value_counts().plot(kind='bar', title='Number of pages by language')
plt.show()

In [None]:
## % pages of different languages
# Share of pages per language, as a percentage rounded to two decimals.
language_share = df['language'].value_counts(normalize=True) * 100
language_share.round(2)

### Observation
- Maximum number of pages are in English with 16.62%
- Followed by Japanese with 14.08%

### 4.2.3 Extracting access type

In [None]:
def extract_accessType(page):
    """Return the access type encoded in a ``Page`` identifier.

    The access type (``all-access``, ``mobile-web`` or ``desktop``) is the
    second-to-last underscore-separated field. The pattern is anchored there,
    so an article name that happens to contain one of those words (e.g.
    ``Remote_desktop``) is not mistaken for the access type.

    Parameters
    ----------
    page : str
        Raw value from the ``Page`` column.

    Returns
    -------
    str
        The access type, or ``'un'`` when ``page`` is not a string or has no
        recognised access-type field.
    """
    # Explicit checks replace the former bare ``except``.
    if not isinstance(page, str):
        return 'un'
    match = re.search(r'_(all-access|mobile-web|desktop)_[^_]+$', page)
    return match.group(1) if match else 'un'
# Classify every page by access type and show the split as a pie chart.
df['access_type'] = df.Page.apply(extract_accessType)
df['access_type'].value_counts().plot(kind='pie', autopct='%1.1f%%', title='% of pages with diffrent access type')
# Render explicitly, as the other plotting cells do, instead of leaving the
# Axes object as the cell's result.
plt.show()

### Observation
- Nearly half of the pages have all access
- Rest half are either accessible on mobile or desktop with almost equal percentage

### 4.2.4 Extracting access origin

In [None]:
# The request origin is the last field of the identifier ("..._spider" or
# "..._all-agents"). Anchoring the pattern at the end of the string keeps an
# article name containing "spider" or "agents" from being picked up, and a
# page with no match yields NaN instead of raising IndexError.
df['access_origin'] = df['Page'].str.extract(r'(spider|agents)$', expand=False)
df['access_origin'].value_counts().plot(kind='pie', autopct='%1.1f%%', title='% of pages with diffrent access origin')
plt.show()

### Observations
- Most pages(75.9%) have **agents** as access origin

## 5.Aggregating and Pivoting

In [None]:
# Preview the table with all derived columns before aggregating.
df.head(10)

**Aggregating on language by taking average views per language for each date**

In [None]:
# Keep only `language` plus the per-date columns, average the views per
# language, then transpose so each row is a date and each column a language.
df_agg = (
    df.drop(columns=['Page', 'name', 'access_type', 'access_origin'])
    .groupby(['language'])
    .mean()
    .T
    .reset_index()
)
# The former column labels (the dates) land in the 'index' column; parse them
# and use them as a datetime index.
df_agg['index'] = pd.to_datetime(df_agg['index'])
df_agg = df_agg.set_index('index')
df_agg.head(10)

### 5.1 Time Series plot for all languages

In [None]:
# One line per language on a shared time axis.
language_axes = df_agg.plot(figsize=(13, 6), title='Average views per language over time')
language_axes.set_xlabel('Date')
language_axes.set_ylabel('Average Views')
plt.show()

### Observations:
- English pages are the most visited pages
- Followed by Spanish
- English pages have **upward trend**
- There is an **unusual peak** from **mid of July to end of August 2016** for **English** and **Russian** pages

## 6 Stationarity, Detrending, ACF and PACF plots

### 6.1 Stationarity Test

**Using Augmented Dickey-Fuller test to check for stationarity**
- H0: The series is not stationary
- H1: The series is stationary

In [None]:
def adfuller_test(timeseries):
    """Run the Augmented Dickey-Fuller test on ``timeseries`` and print the verdict.

    The series is reported as stationary when the test's p-value is at most
    0.05, and as non-stationary otherwise. Nothing is returned.
    """
    adf_output = sm.tsa.stattools.adfuller(timeseries)
    # The second element of the result is the p-value.
    rejects_null = adf_output[1] <= 0.05
    print("Time series is stationary" if rejects_null else "Time series is non-stationary")

In [None]:
# Run the stationarity check on every language's average-views series.
for language, language_views in df_agg.items():
    print(f'ADF test for {language}:')
    adfuller_test(language_views)
    print('-------------------------')

### Observations:
- Only **Spanish, Russian** page visits are **stationary**
- **Chinese, English, French, German and Japanese** page visits are **not stationary**.

Starting with **English**

In [None]:
# Average daily views of the English pages, indexed by date.
english_ts = df_agg['English']

In [None]:
fig, ax = plt.subplots(figsize=(12,6))
ax.plot(english_ts.index, english_ts)
# Overlay the campaign flags as a dotted line; (flag + 1) * 3000 rescales them
# so they are drawn at the same height as the view counts.
ax.plot(english_ts.index, (exog_en + 1)*3000, ":") ## As english pages min mean is above 3000
plt.show()

### Observation:
- From above plot the ts looks like linear upward trend and linear seasonality
- Unusual spikes in page visits during the special events marked with orange peaks

### 6.2 De-Trending and De-seasoning

In [None]:
# First-order differencing; the leading NaN it produces is dropped.
first_difference = english_ts.diff(1).dropna()
first_difference.plot(figsize=(12, 3))
plt.show()

In [None]:
# Stationarity check on the first-differenced series.
adfuller_test(english_ts.diff(1).dropna())

The series becomes stationary after first-order differencing => **d = 1**

In [None]:
## Deseasoning
## check any small part of series
# Two 80-observation windows, selected by position, to eyeball the repeating pattern.
english_ts.iloc[50:130].plot(figsize=(12, 2))
plt.show()
english_ts.iloc[130:210].plot(figsize=(12, 2))
plt.show()

**Seasonality** is observed for every **7 days** ==> **s=7**

In [None]:
# Lag-1 differencing followed by lag-7 differencing; leading NaNs are dropped.
seasonal_difference = english_ts.diff(1).diff(7).dropna()
seasonal_difference.plot(figsize=(12, 3))
plt.show()

In [None]:
# Stationarity check after both the lag-1 and the lag-7 differencing.
adfuller_test(english_ts.diff(1).diff(7).dropna())

As **Trend** and  **Seasonality** are removed manually, ADF test gives **time series is stationary**

### 6.3. Auto de-composition
Auto decomposition using statsmodel library to decompose time series

In [None]:
# Split the English series into trend, seasonal and residual components.
decom = seasonal_decompose(english_ts)
english_ts_trend = decom.trend
english_ts_seasonal = decom.seasonal
english_ts_res = decom.resid

# One stacked panel per component, drawn in a loop instead of four
# copy-pasted stanzas.
decomposition_panels = [
    (english_ts, 'actual'),
    (english_ts_trend, 'trend'),
    (english_ts_seasonal, 'seasonal'),
    (english_ts_res, 'residual'),
]
plt.figure(figsize=(15, 8))
for position, (component, label) in enumerate(decomposition_panels, start=1):
    plt.subplot(len(decomposition_panels), 1, position)
    plt.plot(component, label=label)
    plt.legend()
# Render explicitly so the cell does not end on the Legend object.
plt.show()

### 6.4 ACF and PACF plots

In [None]:
# ACF (left) and PACF (right) of the first-differenced English series.
differenced = english_ts.diff(1).dropna()
fig, ax = plt.subplots(1, 2, figsize=(12, 3))
acf_axis, pacf_axis = ax
plot_acf(x=differenced, ax=acf_axis)
plot_pacf(x=differenced, ax=pacf_axis)
plt.show()

> - From the PACF plot, we can see that there are 3 significant lags, at 5, 7 and 21. So **P=1,2 or 3**
> - From the ACF plot, we can see that there are 3 significant lags, at 7, 14 and 21. So **Q=1,2 or 3**
> - From the PACF plot, the cut-off is right from lag 0 and same for ACF plot. hence, **p** and **q =  0 or 1**

## 7. Model building and Evaluation

In [None]:
# Creating a function to print values of all these metrics.
def performance(actual, predicted, print_metrics=True):
    """Compute MAE, RMSE and MAPE for a forecast, optionally printing them.

    Parameters
    ----------
    actual : array-like
        Observed values.
    predicted : array-like
        Forecast values, aligned with ``actual``.
    print_metrics : bool, default True
        When truthy, print the three metrics.

    Returns
    -------
    tuple
        ``(MAE, RMSE, MAPE)``, each rounded to three decimals. MAPE is
        whatever the imported ``mape`` function returns; presumably a
        fraction rather than a percentage (confirm against sklearn).
    """
    MAE = round(mae(actual, predicted), 3)
    # RMSE is the square root of the mean squared error.
    RMSE = round(mse(actual, predicted) ** 0.5, 3)
    MAPE = round(mape(actual, predicted), 3)
    if print_metrics:
        print('MAE :', MAE)
        print('RMSE :', RMSE)
        print('MAPE:', MAPE)
    return MAE, RMSE, MAPE

### 7.1 ARIMA model

In [None]:
# Work on an independent copy so the modelling cells leave english_ts untouched.
timeSeries = english_ts.copy(deep=True)

In [None]:
# Hold out the last n_forecast observations, fit on the rest, forecast the hold-out.
n_forecast = 60
model = ARIMA(timeSeries[:-n_forecast], order=(0, 1, 0)).fit()
predicted = model.forecast(steps=n_forecast, alpha=0.05)

# Full series with the forecast drawn over the held-out tail.
plt.figure(figsize=(12, 5))
timeSeries.plot(label='Acutal')
predicted.plot(label='Forecast', linestyle='dashed', marker='.')
plt.legend(loc='upper right')
plt.show()

# Score the forecast against the held-out actuals.
held_out_actuals = timeSeries.values[-n_forecast:]
(_, _, _) = performance(held_out_actuals, predicted.values, print_metrics=True)

The model does not do a good job, even for different combinations of p and q.

### 7.2 SARIMAX model

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [None]:
## let's try to include exogenous model
# Campaign flags as a plain array, aligned by position with the time series.
exog = exog_en['Exog'].to_numpy()
p,d,q,P,D,Q,S = 1,1,1,1,1,1,7
n_forecast = 60
# Fit on everything except the last n_forecast observations.
model = SARIMAX(timeSeries[:-n_forecast], 
                order=(p,d,q), 
                seasonal_order=(P, D, Q, S), 
                exog= exog[:-n_forecast],
                initialization='approximate_diffuse'
                )
model = model.fit()
# Forecast the held-out window, supplying the campaign flags for those days.
moder_forecast = model.forecast(steps=n_forecast, dynamic = True, exog = pd.DataFrame(exog[-n_forecast:]))

plt.figure(figsize=(20,8))
timeSeries[-100:].plot(label='Acutal')
moder_forecast[-100:].plot(label = 'Forecast', color = 'red', linestyle='dashed', marker='o',markerfacecolor='green')
plt.legend(loc='upper right')
plt.show()

# Score the SARIMAX forecast itself. The previous version passed `predicted`,
# which still holds the ARIMA forecast from the earlier cell, so the printed
# metrics were the ARIMA ones.
(_,_,_) = performance(timeSeries.values[-n_forecast:], moder_forecast.values, print_metrics=True)

### Observation
- SARIMAX model results are better, we need to do grid search to find the best params

In [None]:
def SARIMAX_search(timeSeries, forecast, p_list, d_list, q_list, P_list, D_list, Q_list, s_list, exog=None):
    """Grid-search SARIMAX orders and score each fit on a hold-out window.

    Every combination of the supplied order values is fitted on all but the
    last ``forecast`` observations and scored on those last observations.

    Parameters
    ----------
    timeSeries : pandas.Series
        Series to model.
    forecast : int
        Number of trailing observations held out for evaluation.
    p_list, d_list, q_list : iterable of int
        Candidate non-seasonal orders.
    P_list, D_list, Q_list, s_list : iterable of int
        Candidate seasonal orders and seasonal periods.
    exog : array-like, optional
        Exogenous regressor aligned by position with ``timeSeries``. When
        omitted or empty the models are fitted without a regressor.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fitted combination, with columns
        ``serial``, ``pdq``, ``PDQs``, ``mape`` and ``rmse``.
    """
    from itertools import product

    # Split once; the hold-out length comes from the `forecast` argument and
    # not from a notebook-level global.
    has_exog = exog is not None and len(exog) > 0
    train_exog = exog[:-forecast] if has_exog else None
    holdout_exog = pd.DataFrame(exog[-forecast:]) if has_exog else None
    train = timeSeries[:-forecast]
    holdout_actual = timeSeries.values[-forecast:]

    # Same iteration order as nested loops over p, d, q, P, D, Q, s.
    grid = list(product(p_list, d_list, q_list, P_list, D_list, Q_list, s_list))
    total = len(grid)

    rows = []
    for p, d, q, P, D, Q, s in grid:
        try:
            fitted = SARIMAX(train,
                             order=(p, d, q),
                             seasonal_order=(P, D, Q, s),
                             exog=train_exog,
                             initialization='approximate_diffuse'
                             ).fit()
            model_forecast = fitted.forecast(forecast, dynamic=True, exog=holdout_exog)
            MAE, RMSE, MAPE = performance(holdout_actual, model_forecast.values, print_metrics=False)
        except Exception as err:
            # Still best-effort: a failing combination is skipped, but it is
            # reported, and KeyboardInterrupt is no longer swallowed.
            print(f'Skipping order={(p, d, q)}, seasonal_order={(P, D, Q, s)}: {err}')
            continue
        rows.append([len(rows) + 1, (p, d, q), (P, D, Q, s), MAPE, RMSE])
        print(f'Combination {len(rows)} out of {total}')

    return pd.DataFrame(rows, columns=['serial', 'pdq', 'PDQs', 'mape', 'rmse'])

In [None]:
import warnings
# Suppress every warning from here on, which keeps the fitting output readable.
warnings.filterwarnings("ignore")

# Series to model and the length of the hold-out window.
timeSeries = english_ts.copy(deep=True)
n_forecast = 60
# Candidate values for each SARIMAX order; every combination is tried.
p_list = [0,1]
d_list = [1]
q_list = [0,1]
P_list = [2,3]
D_list = [1]
Q_list = [2,3]
s_list = [7]
# Campaign flags used as the exogenous regressor.
exog = exog_en['Exog'].to_numpy()
perf_df = SARIMAX_search(timeSeries, n_forecast, p_list, d_list, q_list, P_list, D_list, Q_list, s_list, exog)
# Best combinations first: lowest MAPE, ties broken by RMSE.
perf_df.sort_values(['mape', 'rmse'])

p,d,q,P,D,Q,s = 1,1,1,2,1,3,7 gives lowest mape

In [None]:
# Refit with the orders chosen from the grid search and score on the hold-out.
exog = exog_en['Exog'].to_numpy()
p, d, q = 1, 1, 1
P, D, Q, S = 2, 1, 3, 7
n_forecast = 60

model = SARIMAX(
    timeSeries[:-n_forecast],
    order=(p, d, q),
    seasonal_order=(P, D, Q, S),
    exog=exog[:-n_forecast],
    initialization='approximate_diffuse',
).fit()
# The campaign flags of the held-out days are supplied for the forecast.
holdout_flags = pd.DataFrame(exog[-n_forecast:])
moder_forecast = model.forecast(steps=n_forecast, dynamic=True, exog=holdout_flags)

# Last 100 actual observations with the forecast drawn on top.
plt.figure(figsize=(20, 8))
timeSeries[-100:].plot(label='Acutal')
moder_forecast[-100:].plot(
    label='Forecast',
    color='red',
    linestyle='dashed',
    marker='o',
    markerfacecolor='green',
)
plt.legend(loc='upper right')
plt.show()

(_, _, _) = performance(timeSeries.values[-n_forecast:], moder_forecast.values, print_metrics=True)

### Observation
- SARIMAX model has shown best results after tuning the parameters

### 7.4 Facebook Prophet

In [None]:
# Install required dependencies for Prophet
# %pip install cython
# %pip install prophet

In [None]:
# Reshape the English series into a frame with `ds` (date) and `y` (value)
# columns, and attach the campaign flag as an extra `exog` column.
timeSeries = english_ts.copy(deep=True).reset_index()[['index', 'English']]
timeSeries.columns = ['ds', 'y']
timeSeries['ds'] = pd.to_datetime(timeSeries['ds'])
exog = exog_en['Exog']
# .values assigns by position, ignoring the two objects' indexes.
timeSeries['exog'] = exog.values
timeSeries.tail()

In [None]:
from prophet import Prophet

# Hold out the last n_forecast days, as the ARIMA/SARIMAX cells do, so the
# metrics printed below are out-of-sample. Previously n_forecast was assigned
# but unused, and the model was scored on the data it had been fitted to.
n_forecast = 60
model = Prophet(interval_width=0.95, weekly_seasonality=True)
model.add_regressor('exog')
model.fit(timeSeries.iloc[:-n_forecast])

# Predict every date of the series (training part plus hold-out), supplying
# the campaign flag for each date.
forecast_dates = timeSeries[['ds', 'exog']].copy()
forecast = model.predict(forecast_dates)

# Attach the point prediction and its interval bounds to the input frame.
for forecast_column in ('yhat', 'yhat_upper', 'yhat_lower'):
    timeSeries[forecast_column] = forecast[forecast_column].values

# Score only the held-out days.
(_,_,_) = performance(timeSeries['y'].iloc[-n_forecast:], timeSeries['yhat'].iloc[-n_forecast:], print_metrics=True)

In [None]:
# Actual vs. predicted visits, with the band between the lower and upper
# predictions shaded.
fit_fig, fit_ax = plt.subplots(figsize=(15, 5))
dates = timeSeries['ds']
fit_ax.plot(dates, timeSeries['y'], label='Actual Visits', color='blue')
fit_ax.plot(dates, timeSeries['yhat'], label='Predicted Visits', color='red', alpha=0.8)
fit_ax.fill_between(dates, timeSeries['yhat_lower'], timeSeries['yhat_upper'], color='pink', alpha=0.3, label='Confidence Interval')

fit_ax.set_xlabel('Date')
fit_ax.set_ylabel('Number of Visits')
fit_ax.set_title('Actual vs Predicted Visits')
fit_ax.legend()
plt.show()

### Observation
- Prophet captures the trend and the unusual peak more effectively
- It also captures the seasonality very well

### **7.5 Other Languages**

### 7.5.1 Chinese

In [None]:
# Average daily views of the Chinese pages, plotted as a first look.
timeSeries = df_agg['Chinese'].copy(deep=True)
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(timeSeries.index, timeSeries)
plt.show()

# Reshape into a frame with `ds` (date) and `y` (value) columns.
timeSeries = timeSeries.reset_index()[['index', 'Chinese']]
timeSeries.columns = ['ds', 'y']
timeSeries['ds'] = pd.to_datetime(timeSeries['ds'])
timeSeries.tail()

# The model is fitted on the whole series and predicts the same dates
# (periods=0), so the metrics printed below are in-sample.
model = Prophet(interval_width=0.95, weekly_seasonality=True)
model.fit(timeSeries)
forecast_dates = model.make_future_dataframe(periods=0)
forecast = model.predict(forecast_dates)

# Attach the point prediction and its interval bounds to the input frame.
for forecast_column in ('yhat', 'yhat_upper', 'yhat_lower'):
    timeSeries[forecast_column] = forecast[forecast_column]

(_, _, _) = performance(timeSeries['y'], timeSeries['yhat'], print_metrics=True)

# Plot actual vs predicted visits
dates = timeSeries['ds']
plt.figure(figsize=(15, 5))
plt.plot(dates, timeSeries['y'], label='Actual Visits', color='blue')
plt.plot(dates, timeSeries['yhat'], label='Predicted Visits', color='red', alpha=0.8)
plt.fill_between(dates, timeSeries['yhat_lower'], timeSeries['yhat_upper'], color='pink', alpha=0.3, label='Confidence Interval')

plt.title('Actual vs Predicted Visits')
plt.xlabel('Date')
plt.ylabel('Number of Visits')
plt.legend()
plt.show()

### 7.5.2 French

In [None]:
# Average daily views of the French pages, plotted as a first look.
timeSeries = df_agg['French'].copy(deep=True)
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(timeSeries.index, timeSeries)
plt.show()

# Reshape into a frame with `ds` (date) and `y` (value) columns.
timeSeries = timeSeries.reset_index()[['index', 'French']]
timeSeries.columns = ['ds', 'y']
timeSeries['ds'] = pd.to_datetime(timeSeries['ds'])
timeSeries.tail()

# The model is fitted on the whole series and predicts the same dates
# (periods=0), so the metrics printed below are in-sample.
model = Prophet(interval_width=0.95, weekly_seasonality=True)
model.fit(timeSeries)
forecast_dates = model.make_future_dataframe(periods=0)
forecast = model.predict(forecast_dates)

# Attach the point prediction and its interval bounds to the input frame.
for forecast_column in ('yhat', 'yhat_upper', 'yhat_lower'):
    timeSeries[forecast_column] = forecast[forecast_column]

(_, _, _) = performance(timeSeries['y'], timeSeries['yhat'], print_metrics=True)

# Plot actual vs predicted visits
dates = timeSeries['ds']
plt.figure(figsize=(15, 5))
plt.plot(dates, timeSeries['y'], label='Actual Visits', color='blue')
plt.plot(dates, timeSeries['yhat'], label='Predicted Visits', color='red', alpha=0.8)
plt.fill_between(dates, timeSeries['yhat_lower'], timeSeries['yhat_upper'], color='pink', alpha=0.3, label='Confidence Interval')

plt.title('Actual vs Predicted Visits')
plt.xlabel('Date')
plt.ylabel('Number of Visits')
plt.legend()
plt.show()

### 7.5.3 German

In [None]:
# Average daily views of the German pages, plotted as a first look.
timeSeries = df_agg['German'].copy(deep=True)
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(timeSeries.index, timeSeries)
plt.show()

# Reshape into a frame with `ds` (date) and `y` (value) columns.
timeSeries = timeSeries.reset_index()[['index', 'German']]
timeSeries.columns = ['ds', 'y']
timeSeries['ds'] = pd.to_datetime(timeSeries['ds'])
timeSeries.tail()

# The model is fitted on the whole series and predicts the same dates
# (periods=0), so the metrics printed below are in-sample.
model = Prophet(interval_width=0.95, weekly_seasonality=True)
model.fit(timeSeries)
forecast_dates = model.make_future_dataframe(periods=0)
forecast = model.predict(forecast_dates)

# Attach the point prediction and its interval bounds to the input frame.
for forecast_column in ('yhat', 'yhat_upper', 'yhat_lower'):
    timeSeries[forecast_column] = forecast[forecast_column]

(_, _, _) = performance(timeSeries['y'], timeSeries['yhat'], print_metrics=True)

# Plot actual vs predicted visits
dates = timeSeries['ds']
plt.figure(figsize=(15, 5))
plt.plot(dates, timeSeries['y'], label='Actual Visits', color='blue')
plt.plot(dates, timeSeries['yhat'], label='Predicted Visits', color='red', alpha=0.8)
plt.fill_between(dates, timeSeries['yhat_lower'], timeSeries['yhat_upper'], color='pink', alpha=0.3, label='Confidence Interval')

plt.title('Actual vs Predicted Visits')
plt.xlabel('Date')
plt.ylabel('Number of Visits')
plt.legend()
plt.show()

### 7.5.4 Japanese

In [None]:
# Average daily views of the Japanese pages, plotted as a first look.
timeSeries = df_agg['Japanese'].copy(deep=True)
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(timeSeries.index, timeSeries)
plt.show()

# Reshape into a frame with `ds` (date) and `y` (value) columns.
timeSeries = timeSeries.reset_index()[['index', 'Japanese']]
timeSeries.columns = ['ds', 'y']
timeSeries['ds'] = pd.to_datetime(timeSeries['ds'])
timeSeries.tail()

# The model is fitted on the whole series and predicts the same dates
# (periods=0), so the metrics printed below are in-sample.
model = Prophet(interval_width=0.95, weekly_seasonality=True)
model.fit(timeSeries)
forecast_dates = model.make_future_dataframe(periods=0)
forecast = model.predict(forecast_dates)

# Attach the point prediction and its interval bounds to the input frame.
for forecast_column in ('yhat', 'yhat_upper', 'yhat_lower'):
    timeSeries[forecast_column] = forecast[forecast_column]

(_, _, _) = performance(timeSeries['y'], timeSeries['yhat'], print_metrics=True)

# Plot actual vs predicted visits
dates = timeSeries['ds']
plt.figure(figsize=(15, 5))
plt.plot(dates, timeSeries['y'], label='Actual Visits', color='blue')
plt.plot(dates, timeSeries['yhat'], label='Predicted Visits', color='red', alpha=0.8)
plt.fill_between(dates, timeSeries['yhat_lower'], timeSeries['yhat_upper'], color='pink', alpha=0.3, label='Confidence Interval')

plt.title('Actual vs Predicted Visits')
plt.xlabel('Date')
plt.ylabel('Number of Visits')
plt.legend()
plt.show()

### 7.5.5 Russian

In [None]:
# Average daily views of the Russian pages, plotted as a first look.
timeSeries = df_agg['Russian'].copy(deep=True)
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(timeSeries.index, timeSeries)
plt.show()

# Reshape into a frame with `ds` (date) and `y` (value) columns.
timeSeries = timeSeries.reset_index()[['index', 'Russian']]
timeSeries.columns = ['ds', 'y']
timeSeries['ds'] = pd.to_datetime(timeSeries['ds'])
timeSeries.tail()

# The model is fitted on the whole series and predicts the same dates
# (periods=0), so the metrics printed below are in-sample.
model = Prophet(interval_width=0.95, weekly_seasonality=True)
model.fit(timeSeries)
forecast_dates = model.make_future_dataframe(periods=0)
forecast = model.predict(forecast_dates)

# Attach the point prediction and its interval bounds to the input frame.
for forecast_column in ('yhat', 'yhat_upper', 'yhat_lower'):
    timeSeries[forecast_column] = forecast[forecast_column]

(_, _, _) = performance(timeSeries['y'], timeSeries['yhat'], print_metrics=True)

# Plot actual vs predicted visits
dates = timeSeries['ds']
plt.figure(figsize=(15, 5))
plt.plot(dates, timeSeries['y'], label='Actual Visits', color='blue')
plt.plot(dates, timeSeries['yhat'], label='Predicted Visits', color='red', alpha=0.8)
plt.fill_between(dates, timeSeries['yhat_lower'], timeSeries['yhat_upper'], color='pink', alpha=0.3, label='Confidence Interval')

plt.title('Actual vs Predicted Visits')
plt.xlabel('Date')
plt.ylabel('Number of Visits')
plt.legend()
plt.show()

### 7.5.6 Spanish

In [None]:
# Average daily views of the Spanish pages, plotted as a first look.
timeSeries = df_agg['Spanish'].copy(deep=True)
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(timeSeries.index, timeSeries)
plt.show()

# Reshape into a frame with `ds` (date) and `y` (value) columns.
timeSeries = timeSeries.reset_index()[['index', 'Spanish']]
timeSeries.columns = ['ds', 'y']
timeSeries['ds'] = pd.to_datetime(timeSeries['ds'])
timeSeries.tail()

# The model is fitted on the whole series and predicts the same dates
# (periods=0), so the metrics printed below are in-sample.
model = Prophet(interval_width=0.95, weekly_seasonality=True)
model.fit(timeSeries)
forecast_dates = model.make_future_dataframe(periods=0)
forecast = model.predict(forecast_dates)

# Attach the point prediction and its interval bounds to the input frame.
for forecast_column in ('yhat', 'yhat_upper', 'yhat_lower'):
    timeSeries[forecast_column] = forecast[forecast_column]

(_, _, _) = performance(timeSeries['y'], timeSeries['yhat'], print_metrics=True)

# Plot actual vs predicted visits
dates = timeSeries['ds']
plt.figure(figsize=(15, 5))
plt.plot(dates, timeSeries['y'], label='Actual Visits', color='blue')
plt.plot(dates, timeSeries['yhat'], label='Predicted Visits', color='red', alpha=0.8)
plt.fill_between(dates, timeSeries['yhat_lower'], timeSeries['yhat_upper'], color='pink', alpha=0.3, label='Confidence Interval')

plt.title('Actual vs Predicted Visits')
plt.xlabel('Date')
plt.ylabel('Number of Visits')
plt.legend()
plt.show()