# Scoring Script Test
This notebook is a proof of concept that integrates NannyML performance estimation into the bridging scoring script. The resulting estimates could then be used to analyse data drift.

In [1]:
import pickle
import numpy as np
import re
import pandas as pd
import datetime as dt
import time
import matplotlib.pyplot as plt;
import warnings; warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from pom_NEW import *
import os
import sys
import gcsfs
sys.path.append(os.path.abspath("/home/jupyter/POM-feature-drift"))
from google.cloud import storage
from google.cloud import bigquery
import project_config as pc
import common_variables as cv

In [2]:
# Run configuration: the BigQuery source table and the GCS location of the
# exported scoring extract.
table_id = 'offer_bridging_eoo_base'
bucket_location = 'EU'
# GCS URIs always use '/' as their separator, so they are assembled with plain
# string formatting instead of os.path.join (which follows the host OS).
bucket_id       = f'gs://{pc.bucket}/pom_scoring'
# The export file name is stamped with the date on which this cell is run.
file_name       = f'eoo_base_{dt.datetime.now().date()}'
file_format     = 'CSV'
# Wildcard pattern covering every CSV shard whose name starts with `file_name`.
gcs_file_path   = f'{bucket_id}/{file_name}_*.csv'
prefix_name = 'pom_scoring'

In [3]:
#Connection to BQ
client = bigquery.Client(project=pc.project_id)

# List the DATE-typed columns of the source table.
# The table name is bound as a query parameter from `table_id` (config cell)
# rather than repeated as a literal, so the two cannot drift apart.
# The dataset name is an identifier and cannot be parameterised, so it is
# still interpolated; it comes from project configuration, not user input.
query = f"""
SELECT column_name, data_type
FROM {pc.target_dataset}.INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @table_name
AND data_type = 'DATE'
"""
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter('table_name', 'STRING', table_id),
    ]
)

# First result column is `column_name`; keep it as a plain Python list.
date_cols = client.query(query, job_config=job_config).to_dataframe().iloc[:, 0].tolist()

In [4]:
# Collect the columns used by the new models - speeds up processing because
# unused columns do not have to be kept.
cols = {'Account_Number', 'base_dt', 'eoo_base_obs_dt', 'Cohort', 'Rack_Rate', 'Ttl_Offer_Discount', 'Customer_Type', 'Country'}

# Every (customer type, country, target, model type) combination to look for.
model_grid = [
    (customer_type, country, target_type, model_type)
    for customer_type in ['DTV']
    for country in ['UK']
    for target_type in ['arpu', 'churn', 'ta']
    for model_type in ['NT', 'L', 'M', 'H']
]

for customer_type, country, target_type, model_type in model_grid:
    pickle_name = f'pickle_files/{customer_type}_{country}_{target_type}_{model_type}.pkl'
    # Combinations without a pickle file on disk are skipped silently.
    if not os.path.isfile(pickle_name):
        continue

    # NOTE(review): pickle.load can execute arbitrary code; these files must
    # come from a trusted source.
    with open(pickle_name, 'rb') as pickle_file:
        model = pickle.load(pickle_file)

    # Exact type match (not isinstance): the feature-name accessor differs
    # between the XGBoost wrappers and the other model classes.
    model_class = type(model)
    if model_class in (XGBRegressor, XGBClassifier):
        model_columns = model.get_booster().feature_names
    elif model_class in (LGBMRegressor, LGBMClassifier):
        model_columns = model.feature_name_
    elif model_class is CombinedModel:
        model_columns = model.feature_name_
    else:
        print(f'MODEL TYPE NOT MATCHED {pickle_name}')
        model_columns = []

    # A model may report no feature names at all; treat that as "no columns".
    if model_columns is None:
        print(f'NONE COLUMNS {pickle_name}')
        model_columns = []

    cols.update(model_columns)

rename_andrew = fix_columns()
# Also keep every key of the rename mapping whose mapped value is already wanted.
cols |= {key for key, value in rename_andrew.items() if value in cols}
# Also keep each name with its final '_<suffix>' removed
# (presumably the pre-encoding base column - TODO confirm).
cols |= {name.rsplit('_', 1)[0] for name in cols}

In [5]:
# Read the scoring data from GCS into a pandas DataFrame, using the project,
# bucket and blob prefix configured earlier in the notebook.
df_main = read_pomdata_to_score(
    project=pc.project_id,
    bucket_name=pc.bucket,
    prefix_name=prefix_name,
)

blobs are ['pom-etl-process/pom_scoring/eoo_base_2022-12-05_000000000000.csv', 'pom-etl-process/pom_scoring/eoo_base_2022-12-05_000000000001.csv', 'pom-etl-process/pom_scoring/eoo_base_2022-12-05_000000000002.csv', 'pom-etl-process/pom_scoring/eoo_base_2022-12-05_000000000003.csv', 'pom-etl-process/pom_scoring/eoo_base_2022-12-05_000000000004.csv', 'pom-etl-process/pom_scoring/eoo_base_2022-12-05_000000000005.csv']
Total # of date variables are 167
Inital dataframe contains 456831 rows and 987 columns
Shape of the dataset is 456831 rows and 987 columns


## Clean EOO data

In [6]:
# Build the preprocessor from the raw frame. 'EOO_Base_Obs_Dt' is renamed to
# lower case so that it matches the 'eoo_base_obs_dt' entry in `cols`.
dpp = DataPreProcess(
    df=df_main.rename(columns={'EOO_Base_Obs_Dt': 'eoo_base_obs_dt'}),
    cols=cols,
    date_cols=date_cols,
)

eoo_base_obs_dt available in the date df


In [7]:
# Drop the notebook's reference to the raw frame now that `dpp` has been built
# from it (presumably to release memory - the raw extract is large).
del df_main

In [8]:
# Run the preprocessor's date step. The return value is bound to `_` and not
# used again; the cell relies on the state `process_dates` leaves on `dpp`
# (assumption - TODO confirm it updates `dpp` in place).
_ = dpp.process_dates()

In [9]:
# Run the preprocessor's missing-value step. `fill_missing` returns four
# values, all of which are unpacked into `_` and discarded here.
_, _, _, _ = dpp.fill_missing()

In [11]:
# Columns handed to `scale_numeric` as exclusions: the account identifier and
# the *_Active columns.
unscaled_cols = [
    'Account_Number',
    'Movies_Active',
    'Sports_Active',
    'SGE_Active',
    'HD_Active',
    'MS_Active',
]
# The return value is not needed in this notebook.
_ = dpp.scale_numeric(excl_cols=unscaled_cols)

In [12]:
# Run the preprocessor's one-hot encoding step; the return value is discarded.
# NOTE(review): `nunique=100` looks like a cardinality limit for the columns
# to encode - confirm against DataPreProcess.one_hot_encode.
_ = dpp.one_hot_encode(nunique=100)

[]


In [13]:
# Build two consolidated frames from the preprocessor: one with the default
# settings and a single column-name fix, and one with `scale_numeric=True`
# using the full rename mapping returned by `fix_columns()`.
data_consolidated = dpp.concat_data(
    column_fix={'HD_Product_Holding_nan': 'HD_Product_Holding_None'},
)
data_consolidated_scaled = dpp.concat_data(
    scale_numeric=True,
    column_fix=rename_andrew,
)

Data contains the total 456831 rows and 265 columns
Data contains the total 456831 rows and 265 columns


In [14]:
# Keep only UK DTV customers. The mask is computed separately for each frame
# so that each selection is aligned to its own frame's index.
uk_dtv_mask = (data_consolidated['Country_UK'] == 1) & (data_consolidated['Customer_Type'] == 'DTV')
data_consolidated_uk_dtv = data_consolidated.loc[uk_dtv_mask]

uk_dtv_mask_scaled = (data_consolidated_scaled['Country_UK'] == 1) & (data_consolidated_scaled['Customer_Type'] == 'DTV')
data_consolidated_uk_dtv_scaled = data_consolidated_scaled.loc[uk_dtv_mask_scaled]

In [15]:
# Model-type codes passed to `score_data` below; the same four codes are used
# as the pickle file name suffixes when the model columns are collected.
stepup_models = ['NT', 'L', 'M', 'H']

## Score TA Models

In [16]:
# Score the scaled UK DTV data with the TA models, one per model type.
# The result is a dict (its values are concatenated in the next cell).
data_dict_uk_dtv_stepup = score_data(
    df=data_consolidated_uk_dtv_scaled,
    customer_type='DTV',
    country='UK',
    target='target_ta',
    model_types=stepup_models,
)

************************Model file DTV_UK_ta_NT exists************************
************************Scored data for model DTV_UK_ta_NT************************
************************Model file DTV_UK_ta_L exists************************
************************Scored data for model DTV_UK_ta_L************************
************************Model file DTV_UK_ta_M exists************************
************************Scored data for model DTV_UK_ta_M************************
************************Model file DTV_UK_ta_H exists************************
************************Scored data for model DTV_UK_ta_H************************


In [17]:
# Place the per-model scoring results side by side (column-wise).
ta_results_by_model = list(data_dict_uk_dtv_stepup.values())
scored_df_ta_uk_dtv_new = pd.concat(ta_results_by_model, axis=1)

# Only one segment (UK DTV) is scored here, so the row-wise concat has a
# single member.
scored_df_ta = pd.concat([scored_df_ta_uk_dtv_new])

In [18]:
# Drop the intermediate name; the scored data stays available as `scored_df_ta`.
del scored_df_ta_uk_dtv_new

## NannyML Performance Estimation Implementation