# ViEWS prediction competition example notebook

Hello and welcome to the ViEWS prediction competition material.
The aim of this notebook is to show you a simple interface to how ViEWS approaches making predictions with what we call step shifting.

The idea is to use features at time $t$ to predict events at $t+s$.
For a detailed explanation of how this works in theory, complete with rigorous notation, see the ViEWS publications.
For a detailed explanation of how this works in practice, look at the code in views/apps/model/api.py of this repository. 

Because keeping track of shifting time back and forth can be a bit tricky, we have written a simplified interface so that you can focus on modelling and features and (hopefully) be confident that the time shifting is taken care of.
We encourage you to take a look at the underlying code in this repository to understand what is going on and find any nasty mistakes we might have made. 

In [None]:
import sys
import logging

import pandas as pd

# Import any other sklearn estimator you would like. 
# See https://scikit-learn.org/stable/user_guide.html#user-guide
from sklearn.ensemble import RandomForestRegressor

import views

from views.config import LOGFMT
from views.utils.io import csv_to_df
from views.utils.data import assign_into_df
from views.apps.transforms import lib as translib
from views.apps.model import api


In [None]:
# Configure the root logger: records of level INFO and above are written to
# stdout, formatted with the LOGFMT string imported from views.config.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format=LOGFMT,
)

In [None]:
# Fetch the prediction competition data through the views helper and keep the
# result in `paths`; later cells index it by file name (e.g. paths["cm.csv"]).
# NOTE(review): presumably this downloads the files on first use and returns a
# mapping of file name -> local path — confirm in views.utils.extras.
paths = views.utils.extras.fetch_prediction_competition_data()
# Print the result so the available keys are visible.
print(paths)

In [None]:
# Select a dataset.
# Load cm.csv, index it by (month_id, country_id) and sort on that index.
df = csv_to_df(paths["cm.csv"])
df = df.set_index(["month_id", "country_id"])
df = df.sort_index()
# To work on pgm.csv instead (indexed by month_id and pg_id), use this line:
#df = csv_to_df(paths["pgm.csv"]).set_index(["month_id", "pg_id"]).sort_index()

In [None]:
# See which columns we have to work with: print each column label of the
# dataframe on its own line.
for col in df.columns:
    print(col)

In [None]:
# Define our 2017.01-2019.12 development period.
# The bounds are given as integers; the calendar month each one stands for is
# noted in the trailing comment.
# NOTE(review): presumably these are month_id values matching the dataframe
# index — confirm against api.Period.
# Keeping periods in a list lets us easily expand this as the
# updated data becomes available.
period_develop = api.Period(
    name="develop", 
    train_start=121,   # 1990.01
    train_end=444,     # 2016.12
    predict_start=445, # 2017.01
    predict_end=480,   # 2019.12
)
# Every model below receives this list through its `periods` argument.
periods = [period_develop]

In [None]:
# The steps to train, predict and evaluate for: features at time t are used
# to predict the outcome at t+s, for each step s in this list.
# NOTE(review): step 4 is absent from the sequence — confirm this is
# intentional and not a typo for [1, 2, 3, 4, 5, 6].
steps = [1,2,3,5,6]

In [None]:
# Transformations. See views.apps.transforms.lib for more options.
# Feel free to add any you like.
# Each derived column is named after its source column, prefixed with the
# transform that produced it.
suffixes = ("sb", "ns", "os")

# Moving average (time=12) of each ged_best_* column.
for suffix in suffixes:
    df["ma_12_ged_best_" + suffix] = translib.moving_average(
        df["ged_best_" + suffix], time=12
    )

# time_since_previous_event applied to each ged_dummy_* column.
for suffix in suffixes:
    df["time_since_ged_dummy_" + suffix] = translib.time_since_previous_event(
        df["ged_dummy_" + suffix]
    )

In [None]:
# Specify your wanted feature sets.

# Columns taken straight from the dataset.
cols_features_raw = ["ged_best_sb", "ged_best_ns", "ged_best_os"]

# Columns produced by the transformation cell, grouped by transform.
cols_moving_averages = [
    "ma_12_ged_best_sb",
    "ma_12_ged_best_ns",
    "ma_12_ged_best_os",
]
cols_time_since = [
    "time_since_ged_dummy_sb",
    "time_since_ged_dummy_ns",
    "time_since_ged_dummy_os",
]
cols_features_transforms = cols_moving_averages + cols_time_since

In [None]:
# Specify an optional downsampling level. A model opts in by receiving this
# object through its `downsampling` argument.
# NOTE(review): presumably keeps half of the rows whose outcome is above the
# threshold and half of those at or below it — confirm in
# views/apps/model/api.py.
downsample_half = api.Downsampling(
    share_positive=0.5,
    share_negative=0.5,
    threshold=0,
)

In [None]:
# Define the models.
# Both models predict the same outcome over the same steps and periods. They
# differ in their feature set, and only model_transforms passes a
# downsampling configuration.
#
# The estimators rely on RandomForestRegressor's default split criterion
# (mean squared error). It is deliberately not passed by name: the spelling
# "mse" was deprecated in scikit-learn 1.0 and removed in 1.2 (renamed to
# "squared_error"), so using the default keeps this cell working on both old
# and new scikit-learn versions with the same criterion.

model_raw = api.Model(
    name="raw",                         # A descriptive name
    col_outcome="ln_ged_best_sb",       # The outcome column, log of state-based fatalities
    cols_features=cols_features_raw,    # The list of features
    steps=steps,                        # The list of steps
    outcome_type="real",                # The outcome type, can be "real" or "prob"
    periods=periods,                    # The list of periods to work on
    estimator=RandomForestRegressor(),  # The estimator to use
    delta_outcome=True,                 # Specifies that the model should take the delta
                                        # of the outcome column before training and when evaluating
)

model_transforms = api.Model(
    name="raw_and_transforms",
    col_outcome="ln_ged_best_sb",
    cols_features=cols_features_raw + cols_features_transforms,  # Raw plus transformed features
    steps=steps,
    outcome_type="real",
    periods=periods,
    estimator=RandomForestRegressor(),  # A separate estimator instance per model
    delta_outcome=True,
    downsampling=downsample_half,       # Use the downsampling configuration defined earlier
)

# The list the train / predict / evaluate cells iterate over.
models = [model_raw, model_transforms]

In [None]:
# Train all models: each model fits its estimators on the dataframe.
# NOTE(review): presumably one estimator is fitted per step and period, using
# only the period's training months — confirm in views/apps/model/api.py.
for model in models:
    model.fit_estimators(df)

In [None]:
# Store predictions for all models in our dataframe.
# The result of predict_steps is handed to assign_into_df together with df,
# and df is rebound to what assign_into_df returns, so the predictions of
# every model accumulate in df across iterations.
for model in models:
    df_predictions = model.predict_steps(df)
    df = assign_into_df(df, df_predictions)

In [None]:
# Evaluate all models against the dataframe that now holds their predictions.
# Scores are stored in the model object and read back later via model.scores.
for model in models:
    model.evaluate(df)

In [None]:
# Show our scores, it looks like transforms did some good.
# For each model, print its name followed by its scores rendered as a table.
for model in models:
    print(model.name)
    scores_table = pd.DataFrame(model.scores)
    print(scores_table)