# Reference Data Creation
Here we will create the reference data for use in our NannyML test notebooks.
First we will create the data for the scoring script test. This will be 'full' validation reference data, meaning it will comprise all five validation cohorts (to try and replicate how it would be implemented in production).
Then we will create reference data for the validation test, which will consist of the first three validation cohorts. The reason for this is that we can use the final two cohorts as analysis data to see how the performance estimation works.
Here we will also create pickled drift artefacts for use in the previously mentioned tests.

In [5]:
import pickle
import numpy as np
import re
import pandas as pd
import datetime as dt
import time
import matplotlib.pyplot as plt;
import warnings; warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import os
import sys
import gcsfs
sys.path.append(os.path.abspath("/home/jupyter/POM-feature-drift"))

# Scoring Script Data

## TA

In [27]:
# Build the 'full' TA reference data: all five validation cohorts per step-up
# treatment, written to ta_<treatment>_reference_full.csv.
# NOTE(review): the last cohort label starts with '2921' while every other label
# starts with '2020'/'2021'. It is kept verbatim because it is part of the GCS
# object name being read -- confirm the file name before "correcting" it.
validation_cohorts = ['202045-202052', '202053-202107', '202108-202111', '202116-202119', '292124-202127']
for treatment in ['nt', 'low', 'medium', 'high']:
    # Read every cohort first and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame on every iteration.
    cohort_frames = []
    for cohort in validation_cohorts:
        gcs_path = f'gs://andrew-pom/Revenue/ta_step_up_{treatment}_v2_{cohort}_oot.csv'
        cohort_frames.append(pd.read_csv(gcs_path, low_memory = False))
    df = pd.concat(cohort_frames)

    # The text before the first '-' in `ind` is parsed as year + week number
    # ('%Y%W'); appending '-0' supplies the weekday ('%w', 0 = Sunday) needed to
    # resolve it to a date. The whole column is parsed in a single call instead
    # of one pd.to_datetime call per row.
    week_labels = [ind.split('-')[0] + '-0' for ind in df['ind']]
    df['timestamp'] = pd.to_datetime(week_labels, format = '%Y%W-%w')

    # Drop the `ind` key and the xgb_*/logr_* prediction columns; the lgbm_*
    # columns are renamed to pred_ta / pred_proba_ta.
    df.drop(columns = ['ind', 'xgb_preds', 'xgb_proba', 'logr_preds', 'logr_proba'], inplace = True)
    df.rename(columns = {'lgbm_preds': 'pred_ta', 'lgbm_proba': 'pred_proba_ta'}, inplace = True)
    df.sort_values(by = 'timestamp', inplace = True)

    df.to_csv(f'ta_{treatment}_reference_full.csv', index = False)

## ARPU

In [28]:
# Build the 'full' ARPU reference data: all five validation cohorts per step-up
# treatment, written to arpu_<treatment>_reference_full.csv.
# NOTE(review): the last cohort label starts with '2921' while every other label
# starts with '2020'/'2021'. It is kept verbatim because it is part of the GCS
# object name being read -- confirm the file name before "correcting" it.
validation_cohorts = ['202045-202052', '202053-202107', '202108-202111', '202116-202119', '292124-202127']
for treatment in ['nt', 'low', 'medium', 'high']:
    # Read every cohort first and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame on every iteration.
    cohort_frames = []
    for cohort in validation_cohorts:
        gcs_path = f'gs://andrew-pom/Revenue/revenue_step_up_{treatment}_6m_v3_{cohort}_oot.csv'
        cohort_frames.append(pd.read_csv(gcs_path, low_memory = False))
    df = pd.concat(cohort_frames)

    # The text before the first '-' in `ind` is parsed as year + week number
    # ('%Y%W'); appending '-0' supplies the weekday ('%w', 0 = Sunday) needed to
    # resolve it to a date. The whole column is parsed in a single call instead
    # of one pd.to_datetime call per row.
    week_labels = [ind.split('-')[0] + '-0' for ind in df['ind']]
    df['timestamp'] = pd.to_datetime(week_labels, format = '%Y%W-%w')

    # Drop the `ind` key and the xgbr/linr prediction columns; the lgbmr
    # predictions are renamed to pred_arpu.
    df.drop(columns = ['ind', 'xgbr_preds', 'linr_preds'], inplace = True)
    df.rename(columns = {'lgbmr_preds': 'pred_arpu'}, inplace = True)
    df.sort_values(by = 'timestamp', inplace = True)

    df.to_csv(f'arpu_{treatment}_reference_full.csv', index = False)

## ARPU Performance Estimation
We can pre-fit and pickle the regression performance estimators and then just open and run them on new data in the scoring script.

In [8]:
import pickle
import nannyml as nml

In [4]:
# Load the pickled ARPU step-up models into a dict keyed by treatment code.
# NOTE: pickle.load executes arbitrary code from the file -- only point this at
# trusted, project-owned pickle files.
stepup_models = ['NT', 'L', 'M', 'H']
arpu_model_dict = {}
for treatment in stepup_models:
    model_path = f'/home/jupyter/POM-feature-drift/pickle_files/DTV_UK_arpu_{treatment}.pkl'
    with open(model_path, 'rb') as pkl_file:
        loaded_model = pickle.load(pkl_file)
    arpu_model_dict[treatment] = loaded_model

In [25]:
# Pre-fit one NannyML DLE performance estimator per ARPU step-up model on the
# full reference data and pickle it to NannyML_results/ for later use.
reference_names = ['nt', 'low', 'medium', 'high']
for ref, treatment in zip(reference_names, stepup_models):
    # Reference data written by the full-reference ARPU cell; timestamps come
    # back from CSV as strings, so re-parse and re-sort them.
    ref_data = pd.read_csv(f'arpu_{ref}_reference_full.csv', low_memory = False)
    ref_data['timestamp'] = pd.to_datetime(ref_data['timestamp'])
    ref_data.sort_values(by = 'timestamp', inplace = True)

    # Feature names are taken from the fitted model for this treatment.
    features = arpu_model_dict[treatment].feature_name_
    fit_columns = features + ['pred_arpu', 'target_arpu', 'timestamp']

    # Estimate RMSE and MAE over 10 chunks (a weekly chunk_period = 'W' is
    # left disabled in favour of the fixed chunk count).
    dle_settings = {
        'feature_column_names': features,
        'y_pred': 'pred_arpu',
        'y_true': 'target_arpu',
        'metrics': ['rmse', 'mae'],
        'timestamp_column_name': 'timestamp',
        'chunk_number': 10,
    }
    estimator = nml.DLE(**dle_settings).fit(ref_data[fit_columns])

    output_path = f'NannyML_results/arpu_{ref}_perf_estimator.pkl'
    with open(output_path, 'wb') as pkl_file:
        pickle.dump(estimator, pkl_file, protocol = pickle.HIGHEST_PROTOCOL)

# Validation Tests Data

## TA

### Performance Estimation Data

In [10]:
# Build the TA reference data for the validation tests: only the first three
# validation cohorts per step-up treatment, written to
# ta_<treatment>_reference_validation.csv.
validation_cohorts = ['202045-202052', '202053-202107', '202108-202111']
for treatment in ['nt', 'low', 'medium', 'high']:
    # Read every cohort first and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame on every iteration.
    cohort_frames = []
    for cohort in validation_cohorts:
        gcs_path = f'gs://andrew-pom/Revenue/ta_step_up_{treatment}_v2_{cohort}_oot.csv'
        cohort_frames.append(pd.read_csv(gcs_path, low_memory = False))
    df = pd.concat(cohort_frames)

    # The text before the first '-' in `ind` is parsed as year + week number
    # ('%Y%W'); appending '-0' supplies the weekday ('%w', 0 = Sunday) needed to
    # resolve it to a date. The whole column is parsed in a single call instead
    # of one pd.to_datetime call per row.
    week_labels = [ind.split('-')[0] + '-0' for ind in df['ind']]
    df['timestamp'] = pd.to_datetime(week_labels, format = '%Y%W-%w')

    # Drop the `ind` key and the xgb_*/logr_* prediction columns; the lgbm_*
    # columns are renamed to pred_ta / pred_proba_ta.
    df.drop(columns = ['ind', 'xgb_preds', 'xgb_proba', 'logr_preds', 'logr_proba'], inplace = True)
    df.rename(columns = {'lgbm_preds': 'pred_ta', 'lgbm_proba': 'pred_proba_ta'}, inplace = True)
    df.sort_values(by = 'timestamp', inplace = True)

    df.to_csv(f'ta_{treatment}_reference_validation.csv', index = False)

### Data Drift Calculators

In [11]:
# Load the pickled TA step-up models into a dict keyed by treatment code.
# NOTE: pickle.load executes arbitrary code from the file -- only point this at
# trusted, project-owned pickle files.
stepup_models = ['NT', 'L', 'M', 'H']
ta_model_dict = {}
for treatment in stepup_models:
    model_path = f'/home/jupyter/POM-feature-drift/pickle_files/DTV_UK_ta_{treatment}.pkl'
    with open(model_path, 'rb') as pkl_file:
        loaded_model = pickle.load(pkl_file)
    ta_model_dict[treatment] = loaded_model

In [17]:
# Fit one NannyML univariate drift calculator per TA step-up model, using the
# scored training data as reference, and pickle it to NannyML_results/.
for treatment, model in zip(stepup_models, ['nt', 'low', 'medium', 'high']):
    # Read in full five cohort validation data for analysis
    # NOTE(review): `analysis` is loaded here but not used anywhere else in this
    # cell; it is kept because it stays available as a notebook global.
    analysis = pd.read_csv(f'ta_{model}_reference_full.csv', low_memory = False)
    analysis.timestamp = pd.to_datetime(analysis.timestamp)
    features = ta_model_dict[treatment].feature_name_

    # Read in scored training data as reference
    df = pd.read_csv(f'gs://andrew-pom/Revenue/ta_step_up_{model}_v2_train_scored.csv', low_memory = False)

    # Setup reference data for use (timestamp, drop/rename columns, sort).
    # The text before the first '-' in `ind` is parsed as year + week number
    # ('%Y%W'); appending '-0' supplies the weekday ('%w', 0 = Sunday). The whole
    # column is parsed in a single call instead of one call per row.
    week_labels = [ind.split('-')[0] + '-0' for ind in df['ind']]
    df['timestamp'] = pd.to_datetime(week_labels, format = '%Y%W-%w')

    df.drop(columns = ['ind', 'xgb_preds', 'xgb_proba', 'logr_preds', 'logr_proba'], inplace = True)
    df.rename(columns = {'lgbm_preds': 'pred_ta', 'lgbm_proba': 'pred_proba_ta'}, inplace = True)
    df.sort_values(by = 'timestamp', inplace = True)

    # Initialise NannyML drift calculator: Jensen-Shannon distance on every model
    # feature plus the predicted probability, in weekly chunks.
    calculator = nml.UnivariateDriftCalculator(column_names = features + ['pred_proba_ta'],
                                               timestamp_column_name = 'timestamp',
                                               chunk_period = 'W',
                                               continuous_methods = ['jensen_shannon'],
                                               categorical_methods = ['jensen_shannon'])

    # Fit calculator to reference data
    calculator = calculator.fit(df[features + ['timestamp', 'pred_proba_ta']])

    # Pickle and save calculator so it can be loaded and used when needed
    with open(f'NannyML_results/ta_{model}_drift_calculator.pkl', 'wb') as pkl_file:
        pickle.dump(calculator, pkl_file, protocol = pickle.HIGHEST_PROTOCOL)

## ARPU

### Performance Estimation Data

In [19]:
# Build the ARPU reference data for the validation tests: only the first three
# validation cohorts per step-up treatment, written to
# arpu_<treatment>_reference_validation.csv.
validation_cohorts = ['202045-202052', '202053-202107', '202108-202111']
for treatment in ['nt', 'low', 'medium', 'high']:
    # Read every cohort first and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame on every iteration.
    cohort_frames = []
    for cohort in validation_cohorts:
        gcs_path = f'gs://andrew-pom/Revenue/revenue_step_up_{treatment}_6m_v3_{cohort}_oot.csv'
        cohort_frames.append(pd.read_csv(gcs_path, low_memory = False))
    df = pd.concat(cohort_frames)

    # The text before the first '-' in `ind` is parsed as year + week number
    # ('%Y%W'); appending '-0' supplies the weekday ('%w', 0 = Sunday) needed to
    # resolve it to a date. The whole column is parsed in a single call instead
    # of one pd.to_datetime call per row.
    week_labels = [ind.split('-')[0] + '-0' for ind in df['ind']]
    df['timestamp'] = pd.to_datetime(week_labels, format = '%Y%W-%w')

    # Drop the `ind` key and the xgbr/linr prediction columns; the lgbmr
    # predictions are renamed to pred_arpu.
    df.drop(columns = ['ind', 'xgbr_preds', 'linr_preds'], inplace = True)
    df.rename(columns = {'lgbmr_preds': 'pred_arpu'}, inplace = True)
    df.sort_values(by = 'timestamp', inplace = True)

    df.to_csv(f'arpu_{treatment}_reference_validation.csv', index = False)

### Performance Estimation Calculators

In [22]:
# Load the pickled ARPU step-up models into a dict keyed by treatment code.
# NOTE: pickle.load executes arbitrary code from the file -- only point this at
# trusted, project-owned pickle files.
stepup_models = ['NT', 'L', 'M', 'H']
arpu_model_dict = {}
for treatment in stepup_models:
    model_path = f'/home/jupyter/POM-feature-drift/pickle_files/DTV_UK_arpu_{treatment}.pkl'
    with open(model_path, 'rb') as pkl_file:
        loaded_model = pickle.load(pkl_file)
    arpu_model_dict[treatment] = loaded_model

In [26]:
# Pre-fit one NannyML DLE performance estimator per ARPU step-up model on the
# three-cohort validation reference data and pickle it to NannyML_results/.
reference_names = ['nt', 'low', 'medium', 'high']
for ref, treatment in zip(reference_names, stepup_models):
    # Timestamps come back from CSV as strings, so re-parse and re-sort them.
    ref_data = pd.read_csv(f'arpu_{ref}_reference_validation.csv', low_memory = False)
    ref_data['timestamp'] = pd.to_datetime(ref_data['timestamp'])
    ref_data.sort_values(by = 'timestamp', inplace = True)

    # Feature names are taken from the fitted model for this treatment.
    features = arpu_model_dict[treatment].feature_name_
    fit_columns = features + ['pred_arpu', 'target_arpu', 'timestamp']

    # Estimate RMSE and MAE over 10 chunks (a weekly chunk_period = 'W' is
    # left disabled in favour of the fixed chunk count).
    dle_settings = {
        'feature_column_names': features,
        'y_pred': 'pred_arpu',
        'y_true': 'target_arpu',
        'metrics': ['rmse', 'mae'],
        'timestamp_column_name': 'timestamp',
        'chunk_number': 10,
    }
    estimator = nml.DLE(**dle_settings).fit(ref_data[fit_columns])

    output_path = f'NannyML_results/arpu_{ref}_perf_estimator_validation.pkl'
    with open(output_path, 'wb') as pkl_file:
        pickle.dump(estimator, pkl_file, protocol = pickle.HIGHEST_PROTOCOL)

### Data Drift Calculators

In [20]:
# Load the pickled ARPU step-up models into a dict keyed by treatment code.
# NOTE: pickle.load executes arbitrary code from the file -- only point this at
# trusted, project-owned pickle files.
stepup_models = ['NT', 'L', 'M', 'H']
arpu_model_dict = {}
for treatment in stepup_models:
    model_path = f'/home/jupyter/POM-feature-drift/pickle_files/DTV_UK_arpu_{treatment}.pkl'
    with open(model_path, 'rb') as pkl_file:
        loaded_model = pickle.load(pkl_file)
    arpu_model_dict[treatment] = loaded_model

In [24]:
# Fit one NannyML univariate drift calculator per ARPU step-up model, using the
# scored training data as reference, and pickle it to NannyML_results/.
for treatment, model in zip(stepup_models, ['nt', 'low', 'medium', 'high']):
    # Read in full five cohort validation data for analysis
    # NOTE(review): `analysis` is loaded here but not used anywhere else in this
    # cell; it is kept because it stays available as a notebook global.
    analysis = pd.read_csv(f'arpu_{model}_reference_full.csv', low_memory = False)
    analysis.timestamp = pd.to_datetime(analysis.timestamp)
    features = arpu_model_dict[treatment].feature_name_

    # Read in scored training data as reference
    df = pd.read_csv(f'gs://andrew-pom/Revenue/revenue_step_up_{model}_6m_v3_train_scored.csv', low_memory = False)

    # Setup reference data for use (timestamp, drop/rename columns, sort).
    # The text before the first '-' in `ind` is parsed as year + week number
    # ('%Y%W'); appending '-0' supplies the weekday ('%w', 0 = Sunday). The whole
    # column is parsed in a single call instead of one call per row.
    week_labels = [ind.split('-')[0] + '-0' for ind in df['ind']]
    df['timestamp'] = pd.to_datetime(week_labels, format = '%Y%W-%w')

    df.drop(columns = ['ind', 'xgbr_preds', 'linr_preds'], inplace = True)
    df.rename(columns = {'lgbmr_preds': 'pred_arpu'}, inplace = True)
    df.sort_values(by = 'timestamp', inplace = True)

    # Initialise NannyML drift calculator: Jensen-Shannon distance on every model
    # feature plus the ARPU prediction, in weekly chunks.
    calculator = nml.UnivariateDriftCalculator(column_names = features + ['pred_arpu'],
                                               timestamp_column_name = 'timestamp',
                                               chunk_period = 'W',
                                               continuous_methods = ['jensen_shannon'],
                                               categorical_methods = ['jensen_shannon'])

    # Fit calculator to reference data
    calculator = calculator.fit(df[features + ['timestamp', 'pred_arpu']])

    # Pickle and save calculator so it can be loaded and used when needed
    with open(f'NannyML_results/arpu_{model}_drift_calculator.pkl', 'wb') as pkl_file:
        pickle.dump(calculator, pkl_file, protocol = pickle.HIGHEST_PROTOCOL)