In [None]:
#!pip install matplotlib

In [None]:
#!pip install scikit-learn

In [None]:
# import global modules
import os
import re
import sys
import time
import json
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from yaml import safe_load
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from scipy.stats import uniform, randint
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Set global vars
# Project root: the part of the current working directory that precedes the first
# occurrence of 'notebooks' (the whole cwd if 'notebooks' does not appear in it).
pth_project = Path(os.getcwd().split('notebooks')[0])
print(pth_project)
pth_data = pth_project / 'data'
pth_utils = pth_project / 'utils'
pth_queries = pth_project / 'queries'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'

# Make the project root importable so the `utils.*` imports below resolve.
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if pth_project.as_posix() not in sys.path:
    sys.path.insert(0, pth_project.as_posix())

# Read the project configuration; the context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with pth_creds.open() as f_config:
    d_config = safe_load(f_config)

# import local modules
from utils.gcp import connect_bq_services, connect_pandas_bq_services
from utils.extract import extract_bq_data
from utils.modeling import process_features, extract_stats

In [None]:
# Create the BigQuery client for the GCP project named in the project config.
gcp_project_name = d_config['gcp-project-name']
bq_client = connect_bq_services(gcp_project_name)

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported modules (e.g. the local `utils` package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

#### Extract data

In [None]:
# extract training data
# Query every column of the v7 training table through the shared BigQuery client.
sql = """
  select *
    from `divg-team-v03-pr-de558a.nba_product_reco_model.nba_training_dataset_v7`
"""
df_train = extract_bq_data(bq_client, sql)
df_train.shape

In [None]:
# extract validation data
# NOTE(review): this query reads `nba_test_dataset_v7`, the same table the test
# extract uses -- confirm whether a dedicated validation table should be used here.
sql = """
  select *
    from `divg-team-v03-pr-de558a.nba_product_reco_model.nba_test_dataset_v7`
"""
df_val = extract_bq_data(bq_client, sql)
df_val.shape

In [None]:
# extract test data
# NOTE(review): same source table as the validation extract, so `df_test` and
# `df_val` hold the same rows and test metrics are not an independent hold-out
# estimate -- confirm the intended table.
sql = """
  select *
    from `divg-team-v03-pr-de558a.nba_product_reco_model.nba_test_dataset_v7`
"""
df_test = extract_bq_data(bq_client, sql)
df_test.shape

#### Process data

In [None]:
#!pip install category-encoders
#import category_encoders as ce

In [None]:
# create target output
# d_target_mapping = {
#     scenario: i
#     for i, scenario in enumerate(df_train['model_scenario'].unique())
# }

d_target_mapping = {
 'sing_acquisition': 0,
 'shs_acquisition': 1,
 'tos_acquisition': 2,
 'wifi_acquisition': 3,
 'ttv_acquisition': 4,
 'sws_acquisition': 5,
 'hsic_acquisition': 6,
 'lwc_acquisition': 7,
 'hpro_acquisition': 8,
 'whsia_acquisition': 9
}

d_target_mapping

In [None]:
# load features metadata
# Parse the feature-definition YAML that is passed to process_features below.
# The file is opened in a `with` block so the handle is closed after parsing.
pth_features_metadata = pth_utils / 'parameters' / 'acquisition_features_v5.yaml'
with pth_features_metadata.open() as f_features:
    d_features_metadata = safe_load(f_features)

In [None]:
# process training, validation and test data
# Each raw extract goes through the same process_features call, with
# 'model_scenario' as the target column and the fixed scenario -> id mapping.
df_train_processed, df_val_processed, df_test_processed = [
    process_features(df_raw, d_features_metadata, 'model_scenario', d_target_mapping)
    for df_raw in (df_train, df_val, df_test)
]

In [None]:
# l_cat_features = [
#     'acct_cr_risk_txt',
#     'acct_ebill_ind',
#     'cust_cr_val_txt',
#     'cust_pref_lang_txt',
#     'cust_prov_state_cd'
# ]

In [None]:
# df_train_all_num = df_train[df_train.select_dtypes(exclude=['object']).columns]
# df_validation_all_num = df_validation[df_train_all_num.columns]

In [None]:
# for col in l_cat_features:
#     df_train_all_num[col] = df_train[col]
#     df_validation_all_num[col] = df_validation[col]

# for col in l_cat_features:
#     df_train_processed[col] = df_train[col]
#     df_validation_processed[col] = df_validation[col]

In [None]:
# df_train_all_num['fsa'] = df_train_processed['fsa']
# df_validation_all_num['fsa'] = df_validation_processed['fsa']

In [None]:
# df_train_all_num = df_train_all_num.fillna(0)
# df_validation_all_num = df_validation_all_num.fillna(0)

#### Split data

In [None]:
# Separate identifiers, features and target for each dataset.
# ban_*  : identifier columns taken from the raw extract
# X_*    : processed frame without the 'target' column
# y_*    : the 'target' column of the processed frame
l_id_cols = ['ban', 'lpds_id']

ban_train, ban_val, ban_test = df_train[l_id_cols], df_val[l_id_cols], df_test[l_id_cols]

X_train, y_train = df_train_processed.drop(columns='target'), df_train_processed['target']
X_val, y_val = df_val_processed.drop(columns='target'), df_val_processed['target']
X_test, y_test = df_test_processed.drop(columns='target'), df_test_processed['target']

In [None]:
#X = X.fillna(0)
#X_val = X_val.fillna(0)

In [None]:
# No random split is done here: the training, validation and test sets come from
# separate BigQuery extracts and X_* / y_* are already built in the previous cell.
# (The former train_test_split(X, y, ...) call raised a NameError -- `X` and `y`
# are never defined in this notebook -- and would have overwritten
# X_train / X_test / y_train / y_test.)
X_train.shape, X_val.shape, X_test.shape

In [None]:
# use target encoding to encode two categorical features
#enc = ce.TargetEncoder(cols=['fsa'])
#enc = ce.TargetEncoder(cols=l_cat_features + ['fsa'])

In [None]:
# transform the datasets
# X_train = enc.fit_transform(X_train, y_train)
# X_test = enc.transform(X_test)
# X_val = enc.transform(X_val)

#### Sampling

In [None]:
#!pip install imblearn

In [None]:
#from imblearn.over_sampling import RandomOverSampler, SMOTE

In [None]:
# Apply random oversampling to balance the dataset
#sampling = RandomOverSampler(random_state=42, sampling_strategy=sampling_strategy)
# sampling = SMOTE(random_state=42)

# X_train_resampled, y_train_resampled = sampling.fit_resample(X_train.astype('float'), y_train)

In [None]:
#y_train_resampled.value_counts()

#### XGBoost

In [None]:
#!pip install xgboost
import xgboost as xgb

##### Tuning

In [None]:
#!pip install optuna
#import optuna

In [None]:
# Define the objective function for Optuna
# def objective(trial):
#     # Define the hyperparameters to tune
#     params = {
#         'objective': 'multi:softmax',
#         'eval_metric': 'mlogloss',
#         'num_class': 9,
#         'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
#         'subsample': trial.suggest_uniform('subsample', 0.5, 1),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 10),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 10),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
#     }

#     # Train the XGBoost model with the current set of hyperparameters
#     dtrain = xgb.DMatrix(X_train, label=y_train)
#     dval = xgb.DMatrix(X_val, label=y_val)
#     model = xgb.train(params, dtrain, num_boost_round=100)

#     # Make predictions on the validation set
#     y_pred = model.predict(dval)

#     # Calculate the accuracy score
#     accuracy = accuracy_score(y_val, y_pred)

#     return 1 - accuracy

In [None]:
# Define the Optuna study
#study = optuna.create_study(direction='minimize')

In [None]:
# Start the optimization process
#study.optimize(objective, n_trials=100)

In [None]:
# Get the best hyperparameters and the best score
# best_params = study.best_params
# best_score = study.best_value

# print("Best Hyperparameters:", best_params)
# print("Best Score:", best_score)

In [None]:
# best_params.update({
#     'objective': 'multi:softmax',
#     'eval_metric': 'mlogloss',
#     'num_class': 9
# })

In [None]:
#best_params

In [None]:
# xgb_model = xgb.XGBClassifier(
#     best_params
# )

In [None]:
# Train the best model with the best hyperparameters
#best_model = xgb.train(best_params, xgb.DMatrix(X_train, label=y_train), num_boost_round=100)

In [None]:
# # hyperparameter search space
# params = {
#     "colsample_bytree": np.arange(0.5, 1.0, 0.1),
#     "gamma": uniform(0, 0.5),
#     "learning_rate": [0.01, 0.1, 0.2, 0.3], # default 0.1 
#     "max_depth": randint(3, 7), # default 3
#     "n_estimators": randint(100, 200), # default 100
#     "subsample": np.arange(0.5, 1.0, 0.1),
#     "min_child_weight" : range(1,6,2),
#     "objective": 'multi:softproba'
# }

In [None]:
# search = RandomizedSearchCV(
#     xgb_model, 
#     param_distributions=params, 
#     random_state=42, 
#     n_iter=20, 
#     cv=5, 
#     verbose=3, 
#     n_jobs=1,
#     scoring='accuracy', 
#     return_train_score=True
# )

##### Training

In [None]:
#search_result = search.fit(X_train, y_train)

In [None]:
# print("Best parameters:", search_result.best_params_)
# print("Best score: ", search.best_score_)

# xgb_best_model = search_result.best_estimator_

In [None]:
# xgb_model = xgb.XGBClassifier()

In [None]:
# #xgb_model.fit(X_train_resampled, y_train_resampled)
# xgb_model.fit(X_train, y_train)

#### XGBoost - Josh

In [None]:

# build model and fit in training data
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Multi-class gradient-boosted classifier with one output probability per scenario.
# - objective: 'multi:softprob' is the XGBoost name for per-class probabilities
#   ('multi:softproba' is not a valid objective string).
# - num_class: taken from the target mapping so it always matches the label set
#   (it was hard-coded to 3 while ten classes are predicted downstream).
# - n_jobs / random_state: the scikit-learn API names for thread count and seed.
# - early_stopping_rounds: stop once the last eval_set entry (validation) has not
#   improved its mlogloss for 5 consecutive rounds.
xgb_model = xgb.XGBClassifier(
    learning_rate=0.05,
    n_estimators=1000,
    max_depth=7,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    num_class=len(d_target_mapping),
    eval_metric='mlogloss',
    n_jobs=4,
    scale_pos_weight=1,
    random_state=27,
    early_stopping_rounds=5,
)

# Evaluation logging is controlled by fit(verbose=...), not by the constructor.
# NOTE(review): the validation set drives early stopping here, so validation
# metrics are optimistic; use the test set for the final estimate.
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=True,
)

print('xgb training done')


In [None]:
# Class probabilities for the validation set, using the boosting rounds up to and
# including the best early-stopping iteration. `best_iteration` is 0-based, so the
# exclusive upper bound is best_iteration + 1. `iteration_range` replaces the
# `ntree_limit` argument, which recent XGBoost releases no longer accept.
y_pred = xgb_model.predict_proba(
    X_val, iteration_range=(0, xgb_model.best_iteration + 1)
)

# One probability vector per class id (column i of y_pred); these names are read
# by the export cell below.
(
    y_pred_0, y_pred_1, y_pred_2, y_pred_3, y_pred_4,
    y_pred_5, y_pred_6, y_pred_7, y_pred_8, y_pred_9,
) = y_pred.T

print(y_pred)


In [None]:
# Assemble the validation export: identifiers + features + true label + one
# probability column per class, then write it to the GCS bucket as CSV.
# NOTE(review): the join is index-based while the probabilities are assigned by
# position -- assumes process_features keeps the row order/index of the raw extract.
df_ban_val = ban_val
df_val_exp = df_ban_val.join(X_val)
df_val_exp['y_test'] = y_val

# y_pred holds the validation probabilities computed in the previous cell;
# column i is the probability of class id i.
for class_id in range(len(d_target_mapping)):
    df_val_exp[f'y_pred_proba_{class_id}'] = y_pred[:, class_id]

df_val_exp.to_csv("gs://divg-groovyhoon-pr-d2eab4-default/downloads/df_val_exp.csv")

In [None]:
# Class probabilities for the test set, using the boosting rounds up to and
# including the best early-stopping iteration. `best_iteration` is 0-based, so the
# exclusive upper bound is best_iteration + 1. `iteration_range` replaces the
# `ntree_limit` argument, which recent XGBoost releases no longer accept.
y_pred = xgb_model.predict_proba(
    X_test, iteration_range=(0, xgb_model.best_iteration + 1)
)

# One probability vector per class id (column i of y_pred); these names are read
# by the export cell below.
(
    y_pred_0, y_pred_1, y_pred_2, y_pred_3, y_pred_4,
    y_pred_5, y_pred_6, y_pred_7, y_pred_8, y_pred_9,
) = y_pred.T

print(y_pred)

In [None]:
# Assemble the test export: identifiers + features + true label + one
# probability column per class, then write it to the GCS bucket as CSV.
# NOTE(review): the join is index-based while the probabilities are assigned by
# position -- assumes process_features keeps the row order/index of the raw extract.
df_ban_test = ban_test
df_test_exp = df_ban_test.join(X_test)
df_test_exp['y_test'] = y_test

# y_pred holds the test probabilities computed in the previous cell;
# column i is the probability of class id i.
for class_id in range(len(d_target_mapping)):
    df_test_exp[f'y_pred_proba_{class_id}'] = y_pred[:, class_id]

df_test_exp.to_csv("gs://divg-groovyhoon-pr-d2eab4-default/downloads/df_test_exp.csv")

#### CatBoost

In [None]:
#!pip install catboost

In [None]:
#from catboost import CatBoostClassifier

In [None]:
#cat_model = CatBoostClassifier()

In [None]:
#cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=10) 

#### LightGBM

In [None]:
#!pip install lightgbm

In [None]:
#from lightgbm import LGBMClassifier

In [None]:
#lgbm = LGBMClassifier(learning_rate=0.1, n_estimators=100, num_leaves=31)

In [None]:
#lgbm.fit(X_train_resampled, y_train_resampled)

In [None]:
#lgbm.fit(X_train, y_train)

#### Results

In [None]:
# d_models_mapping = {
#     'XGBClassifier': xgb_model,
#     #'LGBMClassifier': lgbm,
#     'CatBoost': cat_model,
#     # 'OneVsRestClassifier': xgb_best_model,
#     # 'OneVsOneClassifier': xgb_best_model,
#     #'RandomForest': rf_model,
#     #'SVM': svm,
#     #'Logistic Regression': lr,
#     # 'XGBClassifier': xgb_best_model,
#     # 'DeepLearning': xgb_best_model,
# }

In [None]:
# Rank the classes for every validation row from most to least probable
# (argsort of the negated probabilities) and display the statistics that
# extract_stats computes for n = 3.
n = 3
probabilities = xgb_model.predict_proba(X_val)
results_ranked = (-probabilities).argsort(axis=1)
stats_val = extract_stats(n, results_ranked, y_val, d_target_mapping)
display(stats_val)

#### Feature Importance

In [None]:
# Importance score of every training feature, as reported by the fitted model
importance_scores = xgb_model.feature_importances_
feature_names = X_train.columns

# argsort is ascending, so the last 30 positions are the 30 most important
# features, ordered from least to most important
sorted_indices = np.argsort(importance_scores)
top_indices = sorted_indices[-30:]
sorted_scores = importance_scores[top_indices]
sorted_names = feature_names[top_indices]

# Horizontal bar chart; the most important feature ends up at the top
bar_positions = range(len(sorted_scores))
fig, ax = plt.subplots(figsize=(8, 8))
ax.barh(bar_positions, sorted_scores)
ax.set_yticks(bar_positions)
ax.set_yticklabels(sorted_names)
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
ax.set_title('Feature Importance')
plt.show()

#### Correlation

In [None]:
!pip install seaborn
import seaborn as sns

In [None]:
# seaborn heatmap around the correlation matrix.
# Pairwise correlations of the numeric training features. `X_train` is used
# because no variable named `X` is defined in this notebook.
plt.figure(figsize=[40,40])
sns.heatmap(
    X_train.corr(numeric_only=True),
    annot=True,
    fmt='.2f',
    square=True,
    cmap='vlag',
    center=0,
)
plt.title('Feature correlation (training set)')
plt.show()