# What to do next:
## Modeling
### CV the performance
### Add a dummy variable for December ~
### Prevent the forecast from falling below 0 ~
### Train an AR 1 as a baseline
### Past three months' MAE ~
### Confidence Intervals ~

## Layout
### Retext the plot

In [1]:
# Basic operations
import numpy as np
import pandas as pd

# Dashboard
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

# Plotly
import plotly.express as px
import plotly.graph_objects as go

# Forecast
from fbprophet import Prophet

# Other tools
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
import concurrent

In [2]:
# Load Data
data = {}
data['fr'] = pd.read_csv("UMW_FR_reduced.csv", sep=",", parse_dates=['date_value'], index_col=['date_value'])
data['it'] = pd.read_csv("UMW_IT_reduced.csv", sep=",", parse_dates=['date_value'], index_col=['date_value'])
data['es'] = pd.read_csv("UMW_ES_reduced.csv", sep=",", parse_dates=['date_value'], index_col=['date_value'])


Columns (19,20,33,34) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (19,20,33,34,35) have mixed types. Specify dtype option on import or set low_memory=False.



In [26]:
# Options
country = 'fr'
movie = 'Requiem for a Dream' #'Requiem for a Dream'
h = 3 # set the forecast horizen to 6 months

In [32]:
# Extract data
df = data[country]
df = df[df['original_title'] == movie]

In [33]:
# Remove the incomplete data in November
df = df[:'2019-10-31']

# Resample data on a monthly basis
df = df.resample('1M')['royalties_paid_eur'].sum()
df = df.reset_index()
df.columns = ['ds', 'y']

In [63]:
# Fit the model
# Log transformation to avoid negative predictions
df['y'] = np.log(df['y'] + 1) #Plus 1 to avoid running into 0-revenue months
model = Prophet(seasonality_mode='multiplicative')
model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
model.fit(df)
future = model.make_future_dataframe(periods = h, freq = 'M')
forecast = model.predict(future)
df['y'] = np.exp(df['y'])
forecast[['yhat_lower','yhat_upper', 'yhat']] = np.exp(forecast[['yhat_lower','yhat_upper', 'yhat']])

In [64]:
# Extract confidence intervals
conf = forecast[['ds','yhat_upper', 'yhat_lower']]
conf = conf.iloc[-h:]
conf = conf.append({'ds':df.iloc[-1,0], 
                    'yhat_upper':df.iloc[-1,1], 
                    'yhat_lower':df.iloc[-1,1]}, 
                    ignore_index=True)
conf = conf.sort_values('ds')


# Put the forecast and factual data into the same dataframe
df['type'] = 'past'

forecast = forecast.iloc[-h:]
forecast = forecast[['ds', 'yhat']]
forecast.columns = ['ds','y']
forecast['type'] = 'forecast'
df = pd.concat([df, forecast],axis=0)

# Round the number down to 2 digits after the decimal points
df['y'] = np.round(df['y'], 2)

In [78]:
# Plot the result
m_past = df['type'] == 'past'
m_fore = df['type'] == 'forecast'

fig = go.Figure()
# Factual data
fig.add_trace(go.Scatter(x=df['ds'][m_past], y=df['y'][m_past], 
                         mode='lines', name='past', line_color='#1f77b4'))
# Forecasted data
fig.add_trace(go.Scatter(x=df['ds'][m_fore], y=df['y'][m_fore], 
                         mode='lines+markers', name='forecast', line_color='#ff7f0e'))
# The line connecting past and forecasted points
fig.add_trace(go.Scatter(x=[df['ds'][m_past].iloc[-1], df['ds'][m_fore].iloc[0]], 
                         y=[df['y'][m_past].iloc[-1], df['y'][m_fore].iloc[0]], 
                         mode='lines', name='forecast', 
                         hoverinfo='skip', line_color='#ff7f0e', showlegend=False))
# Confidence interval
fig.add_trace(go.Scatter(x=conf['ds'], y=conf['yhat_lower'], 
                         mode='lines', name='possible range', 
                         fill=None, line_color='#ff7f0e', opacity=0.1))
fig.add_trace(go.Scatter(x=conf['ds'], y=conf['yhat_upper'], 
                         mode='lines', name='possible range', 
                         fill='tonexty', line_color='#ff7f0e', opacity=0.1))


fig.update_layout(title='Royalties Forecasting for ' + movie,
                  xaxis_title='Year',
                  yaxis_title='Royalities (in euros)')

fig.show()

In [80]:
# MAE for the past h months of the movie of interest
CV_fp(country, movie, h)

16.799893683170325

# Cross Validation

In [79]:
# A funtion to CV the n-month-ahead forecast a movie with at least 6 months of revenue data
# Input: country, movie, number of steps
# Output: mean-absolute-error of the predictions
def CV_fp(country, movie, h):
    # Extract data
    df = data[country]
    df = df[df['original_title'] == movie]

    # Remove the incomplete data in November
    df = df[:'2019-10-31']

    # Resample data on a monthly basis
    df = df.resample('1M')['royalties_paid_eur'].sum()
    df = df.reset_index()
    df.columns = ['ds', 'y']
    
    #! If a movie has fewer than 6 months of data, output None
    if df.shape[0]<6:
        return None
    
    #! Remove the last h months
    df_real = df.iloc[-h:]
    df = df.iloc[:-h]

    # Fit the model
    # Log transformation to avoid negative predictions
    df['y'] = np.log(df['y'] + 1)
    model = Prophet(seasonality_mode='multiplicative')
    model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    model.fit(df)
    future = model.make_future_dataframe(periods = h, freq = 'M')
    forecast = model.predict(future)
    df['y'] = np.exp(df['y'])
    forecast['yhat'] = np.exp(forecast['yhat'])
    
    #! Extract the forecast
    forecast = forecast.iloc[-h:]
    
    #! Return MAE
    loss = np.mean(np.abs(forecast['yhat'] - df_real['y']))
    
    return loss

In [None]:
# A funtion to go through movies using multi-threads
# Input: a CV funtion
# Output: MAE acrsoss all the movies
def CV_run(f):
    res = pd.DataFrame(columns=['country', 'movie', 'loss']) # Base Table
    
    def helper(country):
        temp = data[country]
        temp = temp['original_title'].unique() # A list of movies inside that country
        res_sub = pd.DataFrame(columns=['country', 'movie', 'loss'])
        for movie in temp[:2]: # TEST
            loss = f(country, movie, h)
            res_sub = res_sub.append({'country':country, 'movie':movie, 'loss':loss}, ignore_index=True)
            
            #Display and save the progress
            if res_sub.shape[0] % 1 == 0: # TEST
                print('Progress', country, res_sub.shape[0], '/', temp.shape[0])
                res_sub.to_csv(country + '.csv')
        return res_sub
    
    with concurrent.futures.ThreadPoolExecutor() as executor:
        res_subs = executor.map(helper, countries)
    
    for res_sub in res_subs:
        res = res.append(res_sub)
    return res

In [None]:
res=CV_run(CV_fp)

In [None]:
# + Past three months' MAE
# + Confidence Intervals

# + Access their database

# Dashboard

In [6]:
# A function to wrap up the forecast
def forecast_wrap(country = 'fr', movie = 'Requiem for a Dream', h = 3):
    # Extract data
    df = data[country]
    df = df[df['original_title'] == movie]
    
    # Remove the incomplete data in November
    df = df[:'2019-10-31']

    # Resample data on a monthly basis
    df = df.resample('1M')['royalties_paid_eur'].sum()
    df = df.reset_index()
    df.columns = ['ds', 'y']
    
    # Fit the model
    # Log transformation to avoid negative predictions
    df['y'] = np.log(df['y'] + 1) #Plus 1 to avoid running into 0-revenue months
    model = Prophet(seasonality_mode='multiplicative')
    model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    model.fit(df)
    future = model.make_future_dataframe(periods = h, freq = 'M')
    forecast = model.predict(future)
    df['y'] = np.exp(df['y'])
    forecast['yhat'] = np.exp(forecast['yhat'])

    # Put the forecast and factual data into the same dataframe
    df['type'] = 'past'

    forecast = forecast.iloc[-h:]
    forecast = forecast[['ds', 'yhat']]
    forecast.columns = ['ds','y']
    forecast['type'] = 'forecast'
    df = pd.concat([df, forecast],axis=0)

    # Round the number down to 2 digits after the decimal points
    df['y'] = np.round(df['y'], 2)

    # Plot the result
    m_past = df['type'] == 'past'
    m_fore = df['type'] == 'forecast'

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df['ds'][m_past], y=df['y'][m_past], mode='lines', name='past'))
    fig.add_trace(go.Scatter(x=df['ds'][m_fore], y=df['y'][m_fore], mode='lines+markers', name='forecast'))
    fig.add_trace(go.Scatter(x=[df['ds'][m_past].iloc[-1], df['ds'][m_fore].iloc[0]], 
                             y=[df['y'][m_past].iloc[-1], df['y'][m_fore].iloc[0]], 
                             mode='lines', name='forecast', 
                             hoverinfo='skip', line_color='#ff7f0e', showlegend=False))
    #The line connecting past and forecasted points

    fig.update_layout(title='Royalties Forecasting for ' + movie,
                      xaxis_title='Year',
                      yaxis_title='Royalities (in euros)')
    
    return fig

In [5]:
countries = ['fr', 'es', 'it']

In [None]:
# Build a dashboard
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

app.layout = html.Div([
    
    html.Div([
    dcc.Dropdown(
                id='country',
                options=[{'label': i, 'value': i} for i in countries],
                value='fr'
            ),
    dcc.Dropdown(
                id='movie'
            ),        
    ]),
    
    dcc.Graph(id='forecast'),

])

@app.callback(
    Output('movie', 'options'),
    [Input('country', 'value')])
def update_movie_list(country):
    temp = data[country]
    temp = temp['original_title'].unique()
    return [{'label': i, 'value': i} for i in temp]

@app.callback(
    Output('forecast', 'figure'),
    [Input('country', 'value'),
     Input('movie', 'value')])
def update_figure(country, movie):
    return forecast_wrap(country, movie)

if __name__ == '__main__':
    app.run_server(debug=False)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


INFO:werkzeug: * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
INFO:werkzeug:127.0.0.1 - - [22/Jan/2020 11:50:37] "[37mGET / HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [22/Jan/2020 11:50:37] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [22/Jan/2020 11:50:37] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [22/Jan/2020 11:50:38] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [22/Jan/2020 11:50:38] "[1m[35mPOST /_dash-update-component HTTP/1.1[0m" 500 -
INFO:werkzeug:127.0.0.1 - - [22/Jan/2020 11:50:50] "[1m[35mPOST /_dash-update-component HTTP/1.1[0m" 500 -
INFO:werkzeug:127.0.0.1 - - [22/Jan/2020 11:50:50] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [22/Jan/2020 11:50:54] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [22/Jan/2020 11:51:43] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
INF