# New model interface example

This notebook aims to show how to use the new ViEWS2 modelling interface.

Todo:
* Map plot at the end

In [1]:
# Logging imports
import json
import logging
import views

# Verbosity of the log output shown in the notebook.
# Swap in logging.INFO here for less yelling in red.
log_level = logging.DEBUG

logging.basicConfig(format=views.config.LOGFMT, level=log_level)

In [None]:
# DATASETS is a dictionary of Dataset objects.
from views import DATASETS
# These are the building blocks of the modelling interface
from views import Ensemble, Model, Downsampling, Period
# These are model specifications from the specfiles
from views.specs.models import cm as model_specs_cm, pgm as model_specs_pgm
from views.specs.periods import get_periods, get_periods_by_name
# Utils
from views.utils import db, io, data as datautils
from views.utils.data import assign_into_df

In [None]:
# These are the core models defined in the ViEWS pipeline.
# They live in views.apps.pipeline.models_cm and views.apps.pipeline.models_pgm.
from views.apps.pipeline.models_cm import all_cm_models_by_name
from views.apps.pipeline.models_pgm import all_pgm_models_by_name

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [None]:
# Show the names of the datasets we have specified.
# Only the keys are needed, so iterate over them directly instead of over
# .items(); this avoids leaving a stray `dataset` loop variable in the
# notebook namespace.
for name in DATASETS:
    print(name)
    

In [None]:
# Do you wish to fetch the latest public data? If so, set this flag to True and run this cell.
# Cells below will fail unless this has been run, or you have imported data yourself.
fetch_latest_public_data = False

if fetch_latest_public_data:
    public_data = views.apps.data.public
    # Download the zip into the scratch directory ...
    path_zip = public_data.fetch_latest_zip_from_website(
        path_dir_destination=views.DIR_SCRATCH
    )
    # ... and import its tables and geometries.
    public_data.import_tables_and_geoms(
        tables=views.TABLES,
        geometries=views.GEOMETRIES,
        path_zip=path_zip,
    )

In [None]:
# Pick a particular Dataset object by name; its dataframe is read from it further down.
# If the data doesn't exist cached on your machine it will be fetched from db
# and transforms computed for you.
# Datasets are defined in views/specs/data/
dataset_name = "cm_africa_imp_0"
dataset = DATASETS[dataset_name]

In [None]:
# Set this flag to True to rebuild this dataset if you have updated tables
rebuild_dataset = False

if rebuild_dataset:
    dataset.refresh()

In [None]:
# Get the actual dataframe from the dataset.
# All later cells read features from, and write predictions into, this `df`.
df = dataset.df
# Preview the first rows as the cell output.
df.head()

In [None]:
# Identifier of the run; used below to look up the periods for that run.
# The leading "d" is for development.
run_id = "d_2020_04_01"

In [None]:
# A Period is a simple object with 4 attributes: train_start, train_end,
# predict_start and predict_end.
# Models take a list of Periods so they know which times to train on
# and which times to make predictions for.
periods = get_periods(run_id)  # list form
periods_by_name = get_periods_by_name(run_id)  # dict form, keyed by period name

# Pull out the three named periods used throughout the notebook
period_a, period_b, period_c = (
    periods_by_name[period_name] for period_name in ("A", "B", "C")
)

In [None]:
# Periods don't have to come from the specs, you can also build one by hand
period_custom = Period(
    name="A",
    train_start=201,
    train_end=396,
    predict_start=397,
    predict_end=432,
)

In [None]:
# Show the documentation for Model.
# help() is used rather than the IPython-only `Model?` so that the cell also runs
# as plain Python and the text ends up in the cell output.
help(Model)

In [None]:
# Steps passed as `steps=` to every Model below; estimators are fit per step and period.
# NOTE(review): presumably the unit is months ahead - confirm.
steps = [1, 12, 24, 36]
# Downsampling spec handed to the downsampled model below.
# NOTE(review): presumably keeps all positive and 10% of negative observations - confirm against Downsampling.
downsampling_10pct = Downsampling(share_positive=1.0, share_negative=0.1)

In [None]:
features_a = [
    "time_since_ged_dummy_sb",
    "time_since_ged_dummy_ns",
    "time_since_ged_dummy_os",
]

# Arguments shared by the three example models: same outcome column,
# features, steps and periods, all with outcome_type "prob".
shared_model_kwargs = dict(
    col_outcome="greq_25_ged_best_sb",
    cols_features=features_a,
    steps=steps,
    periods=periods,
    outcome_type="prob",
)

# Each model gets its own estimator instance.
my_model = Model(
    name="cm_sb_mymodel",
    estimator=RandomForestClassifier(n_jobs=-1, n_estimators=100),
    tags=["sb"],
    **shared_model_kwargs
)

# Same specification, with the 10% downsampling object added
my_downsampled_model = Model(
    name="cm_sb_mymodel_downsampled",
    downsampling=downsampling_10pct,
    estimator=RandomForestClassifier(n_jobs=-1, n_estimators=100),
    tags=["sb", "downsampled"],
    **shared_model_kwargs
)

# Onset model. Note that col_outcome is the same as for the other models:
# the model itself does the onset transformation and subsets the training data.
# It also transforms the outcome col for calibration.
# Evaluation and predicting are not affected by onset yet.
my_onsetmodel = Model(
    name="cm_sb_onset_mymodel",
    estimator=RandomForestClassifier(n_jobs=-1, n_estimators=100),
    onset_outcome=True,  # <-- Onset switch
    onset_window=24,  # <-- Must be accompanied by an onset time window
    tags=["sb", "onset"],
    **shared_model_kwargs
)

# Delta model: real-valued outcome with the delta_outcome switch on.
# An estimator is passed explicitly, as for the other models; a regressor is
# used because outcome_type is "real" rather than "prob".
# This model is defined for illustration and is not part of the `models` list
# that is fit and evaluated further down.
my_delta_model = Model(
    name="delta_force",
    col_outcome="ged_best_sb",
    cols_features=features_a,
    steps=steps,
    periods=periods,
    outcome_type="real",
    estimator=RandomForestRegressor(n_jobs=-1, n_estimators=100),
    delta_outcome=True,
    tags=["delta"]
)

In [None]:
# Models can also be loaded from the definition in the pipeline.
# Print the names of all available cm models ...
print(all_cm_models_by_name.keys())
# ... and pick one of them by name.
model_from_pipeline_spec = all_cm_models_by_name["cm_sb_icgcw"]
# Just printing the object should show everything we care about
model_from_pipeline_spec

In [None]:
# Lists of models are convenient: the cells below loop over this list to
# fit, predict and evaluate.
# To also include the model loaded from the pipeline spec, append
# model_from_pipeline_spec to the list.
models = [my_model, my_downsampled_model, my_onsetmodel]


# Both ensembles combine the same models for the same outcome and periods;
# they differ only in name and combination method.
shared_ensemble_kwargs = dict(
    models=models,
    outcome_type="prob",
    col_outcome="greq_25_ged_best_sb",
    periods=periods,
)

my_avg_ensemble = Ensemble(
    name="my_avg_ensemble", method="average", **shared_ensemble_kwargs
)
my_ebma_ensemble = Ensemble(
    name="my_ebma_ensemble", method="ebma", **shared_ensemble_kwargs
)

# Only the average ensemble is used in the cells below.
# When R installation and EBMA setup are part of the standard installer,
# include the EBMA ensemble here:
# ensembles = [my_avg_ensemble, my_ebma_ensemble]
ensembles = [my_avg_ensemble]

In [None]:
%%time
# Fit estimator for their specified steps and periods 
# Estimators are stored on disk with only a reference in the model object
# This could be omitted after the first run of the notebook
for model in models:
    model.fit_estimators(df , populate_extras = False)

In [None]:
# Predict for each model's specified steps and periods and store the predictions in df.
# Calibrated predictions are made for these (calibration period, test period) pairs.
calibration_pairs = [(period_a, period_b), (period_b, period_c)]

for model in models:

    # Uncalibrated predictions first.
    # assign_into_df only overwrites rows with actual values, so all periods
    # can live in the same df. It is also idempotent (no joining), so this
    # cell can be run as many times as you like.
    df_pred = model.predict(df)
    df = assign_into_df(df_to=df, df_from=df_pred)

    # Then calibrated predictions, one pair at a time
    for period_calib, period_test in calibration_pairs:
        df_pred = model.predict_calibrated(
            df=df,
            period_calib=period_calib,
            period_test=period_test,
        )
        df = assign_into_df(df_to=df, df_from=df_pred)

In [None]:
# Ensemble predictions for the same (calibration period, test period) pairs
# as the calibrated model predictions: A -> B, then B -> C.
ensemble_period_pairs = [(period_a, period_b), (period_b, period_c)]

for ensemble in ensembles:
    for period_calib, period_test in ensemble_period_pairs:
        df_pred = ensemble.predict(
            df=df,
            period_calib=period_calib,
            period_test=period_test,
        )
        # Write the ensemble predictions into the shared df
        df = assign_into_df(df_to=df, df_from=df_pred)
    
    

In [None]:
# Evaluate all models against the predictions now stored in df.
# The resulting scores are read from model.scores in the cells below.
for model in models:
    model.evaluate(df)

In [None]:
# Evaluate all ensembles, limited to periods B and C.
# In future, evaluate will work out for itself where it has predictions
# to evaluate, and this will be just one call.
for ensemble in ensembles:
    for period_name in ("B", "C"):
        ensemble.evaluate(df, period=periods_by_name[period_name])

In [None]:
# Print evaluation scores and feature importances per model,
# with a ruler line between models.
ruler = "#" * 80

for model in models:
    scores_json = json.dumps(model.scores, indent=2)
    print(model.name)
    print("EVAL SCORES:")
    print(scores_json)
    print("FEATURE_IMPORTANCES")
    print(json.dumps(model.extras.feature_importances, indent=2))
    print(ruler)
    

In [None]:
# Print weights and evaluation scores per ensemble.
# Ignore the uncalibrated scores, they are identical to the calibrated ones;
# evaluation needs a bit of a refactor.
ruler = "#" * 80

for ensemble in ensembles:
    print(ensemble.name)
    print("Weights:")
    weights_json = json.dumps(ensemble.weights, indent=2)
    print(weights_json)
    print("Eval scores:")
    print(json.dumps(ensemble.evaluation.scores, indent=2))
    print(ruler)

In [None]:
# Individual eval scores can be accessed like a nested dict: scores[period name][step]
first_model = models[0]
print(first_model.name)
# Period B, step 1
first_model.scores["B"][1]

In [None]:
# All features and predictions live in the same dataframe, no more a/b/c.
# Instead we subset by the periods when needed.

# Calibrated prediction column of every model, then the column of every ensemble
cols_models = [model.col_sc_calibrated for model in models]
cols_ensembles = [ensemble.col_sc for ensemble in ensembles]
cols_predict = cols_models + cols_ensembles

# All calibrated predictions for period C
df.loc[period_c.times_predict, cols_predict]

In [None]:
# Look up the estimator the downsampled model holds for period "A", step 1
estimator_a_1 = my_downsampled_model.estimators.get(period_name="A", step=1)
# Display it as the cell output
estimator_a_1

In [None]:
# Display the feature importances stored in the downsampled model's extras
my_downsampled_model.extras.feature_importances