# Reference Data Creation
Here we will create the reference data for use in our NannyML test notebooks.
First we will create the data for the scoring script test. This will be 'full' validation reference data, meaning it will comprise all five validation cohorts (to try to replicate how it would be implemented in production).
Then we will create reference data for the validation test that will comprise the first three validation cohorts. The reason for this is that we can then use the final two cohorts as analysis data to see how the performance estimation works.
Here we will also create pickled drift artefacts for use in the previously mentioned tests.

In [1]:
import pickle
import numpy as np
import re
import pandas as pd
import datetime as dt
import time
import matplotlib.pyplot as plt;
# Silence every warning for the rest of the session.
import warnings; warnings.filterwarnings('ignore')
# Raise pandas' display limits so wide/long frames print without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import os
import sys
import gcsfs
# Add the project checkout to the import path
# (presumably so its modules can be imported from this notebook - confirm).
sys.path.append(os.path.abspath("/home/jupyter/POM-feature-drift"))

# Scoring Script Data

## TA

In [27]:
# Build the 'full' TA reference file for each treatment: stack every
# validation cohort, derive a timestamp from the 'ind' week label, keep only
# the LightGBM outputs (renamed to pred_ta / pred_proba_ta), sort by time and
# write one CSV per treatment to the working directory.
#
# NOTE(review): the last cohort label starts with '292124' while all the
# others start with '2020'/'2021' - this looks like a typo for '202124'.
# It is left untouched because it is part of the GCS object name; confirm
# against the bucket contents before changing it.
validation_cohorts = ['202045-202052', '202053-202107', '202108-202111', '202116-202119', '292124-202127']
for treatment in ['nt', 'low', 'medium', 'high']:
    # Read all cohorts first and concatenate once; growing a DataFrame inside
    # the loop re-copies every previously loaded row on each iteration.
    cohort_frames = [
        pd.read_csv(
            f'gs://andrew-pom/Revenue/ta_step_up_{treatment}_v2_{cohort}_oot.csv',
            low_memory=False,
        )
        for cohort in validation_cohorts
    ]
    df = pd.concat(cohort_frames)

    # The part of 'ind' before the first '-' is parsed as year + week number;
    # appending '-0' pins the day to Sunday (%w == 0) of that week.
    # NOTE(review): %W is the Monday-based week-of-year, not the ISO week
    # (%G%V-%u). Labels such as '202053' suggest ISO numbering - confirm which
    # one the upstream data uses.
    week_labels = [f"{label.split('-')[0]}-0" for label in df['ind']]
    # Assigned positionally (DatetimeIndex), so the repeated row index left
    # behind by the concat does not matter.
    df['timestamp'] = pd.to_datetime(week_labels, format='%Y%W-%w')

    # Drop the raw week label and the non-LightGBM model outputs.
    df.drop(columns=['ind', 'xgb_preds', 'xgb_proba', 'logr_preds', 'logr_proba'], inplace=True)
    df.rename(columns={'lgbm_preds': 'pred_ta', 'lgbm_proba': 'pred_proba_ta'}, inplace=True)
    df.sort_values(by='timestamp', inplace=True)

    df.to_csv(f'ta_{treatment}_reference_full.csv', index=False)

## ARPU

In [28]:
# Build the 'full' ARPU reference file for each treatment: stack every
# validation cohort, derive a timestamp from the 'ind' week label, keep only
# the LightGBM regressor output (renamed to pred_arpu), sort by time and
# write one CSV per treatment to the working directory.
#
# NOTE(review): the last cohort label starts with '292124' while all the
# others start with '2020'/'2021' - this looks like a typo for '202124'.
# It is left untouched because it is part of the GCS object name; confirm
# against the bucket contents before changing it.
validation_cohorts = ['202045-202052', '202053-202107', '202108-202111', '202116-202119', '292124-202127']
for treatment in ['nt', 'low', 'medium', 'high']:
    # Read all cohorts first and concatenate once; growing a DataFrame inside
    # the loop re-copies every previously loaded row on each iteration.
    cohort_frames = [
        pd.read_csv(
            f'gs://andrew-pom/Revenue/revenue_step_up_{treatment}_6m_v3_{cohort}_oot.csv',
            low_memory=False,
        )
        for cohort in validation_cohorts
    ]
    df = pd.concat(cohort_frames)

    # The part of 'ind' before the first '-' is parsed as year + week number;
    # appending '-0' pins the day to Sunday (%w == 0) of that week.
    # NOTE(review): %W is the Monday-based week-of-year, not the ISO week
    # (%G%V-%u). Labels such as '202053' suggest ISO numbering - confirm which
    # one the upstream data uses.
    week_labels = [f"{label.split('-')[0]}-0" for label in df['ind']]
    # Assigned positionally (DatetimeIndex), so the repeated row index left
    # behind by the concat does not matter.
    df['timestamp'] = pd.to_datetime(week_labels, format='%Y%W-%w')

    # Drop the raw week label and the non-LightGBM model outputs.
    df.drop(columns=['ind', 'xgbr_preds', 'linr_preds'], inplace=True)
    df.rename(columns={'lgbmr_preds': 'pred_arpu'}, inplace=True)
    df.sort_values(by='timestamp', inplace=True)

    df.to_csv(f'arpu_{treatment}_reference_full.csv', index=False)