In [2]:
import pandas as pd
import numpy as np
from numpy import percentile
import matplotlib.pyplot as plt
from src.features import preprocess,raw
from src.utils.config import TRAIN_PATH_CLICK,TRAIN_PATH_SMS,TEST_PATH
from src.train import pipeline
%matplotlib inline

In [3]:
datap = 's3://data-lake-v2/cdp_clients_data/fsi/processed_data/stanbic_credit_scoring_updated/part-00000-28facaeb-3c6b-4b4b-a539-b569a6c978a8-c000.csv'
data = pd.read_csv(datap)

## VALIDATION DATA

In [None]:
preprocess.preprocess(train=False,validation_path='./data/validation.csv')

In [None]:
from utils.config import PROCESSED_TEST_PATH
data = pd.read_pickle(PROCESSED_TEST_PATH)

## TRAINING DATA FROM S3

In [None]:
import s3fs
fs = s3fs.S3FileSystem()

In [None]:
raw_data = [ pd.read_csv('s3://'+file) for file in fs.ls('s3://datateam-ml/Adrenaline-November-CTR/data/train') ]
data = pd.concat(raw_data)
data.reset_index(drop=True, inplace=True)
#preprocess the input file and save the transformed data as a pickle file
# preprocess.preprocess(dataframe=data,train=True)

In [None]:
data = preprocess.handle_nan(data,fillna='missing',drop_outliers=True)

In [None]:
data = data.reindex(columns = input_columns)

In [None]:
import yaml
def load_attributes(data):
    num_attributes = data.select_dtypes(include=[np.number]).columns.tolist()
    cat_attributes = data.select_dtypes(exclude=[np.number]).columns.tolist()
    try:
        num_attributes.remove("event_type")
        num_attributes.remove('customer_class')
        cat_attributes.append('customer_class')
        data['customer_class'] = self.data['customer_class'].astype(str)
    except:
        pass
    return num_attributes, cat_attributes
def identify_columns(data, high_dim=100, verbose=True, save_output=True):
    
    """
        
        This funtion takes in the data, identify the numerical and categorical
        attributes and stores them in a list
        
    """
    num_attributes, cat_attributes = load_attributes(data)
        
    low_cat = []
    hash_features = []
    dict_file = {}
    input_columns = [cols for cols in data.columns]
    input_columns.remove('event_type')
    input_columns.remove('msisdn.1')
    for item in cat_attributes:
        if data[item].nunique() > high_dim:
            if verbose:
                print('\n {} has a high cardinality. It has {} unique attributes'.format(item, data[item].nunique()))
            hash_features.append(item)
        else:
            low_cat.append(item)
    if save_output:
        dict_file['num_feat'] = num_attributes
        dict_file['cat_feat'] = cat_attributes
        dict_file['hash_feat'] = hash_features
        dict_file['lower_cat'] = low_cat
        dict_file['input_columns'] = input_columns
        store_attribute(dict_file)
        print('\nDone!. Data columns successfully identified and attributes are stored in /data/')
def store_attribute(dict_file):
    with open(r'./data/store_file.yaml', 'w') as file:
        documents = yaml.dump(dict_file, file)

In [None]:
from src.utils.utils import input_columns

In [None]:
identify_columns(data)

In [4]:
dataframe = data

In [5]:
dataframe = dataframe.rename(columns={"bad_behaviour": "event_type"})
dataframe = dataframe[~dataframe['customer_class'].isnull()]
# dataframe = dataframe.dropna(thresh=data.shape[1]*0.6)
dataframe.loc[:,'event_type'] = dataframe['event_type'].map({False:1, True:0})
dataframe = preprocess.drop_cols(dataframe,columns=['msisdn','max_disbursement', 'max_extensions', 'max_dpd', 'telco', 'profile_identity', 'updated', 
                                                    'religion','occupation','social_media_presence', 'is_mother', 'phone_on_status', 
                                                    'roaming_status', 'sim_reg_status', 'sim_dnd_status', 'inbound_daily_count', 
                                                    'inbound_monthly_count', 'outbound_daily_count', 'outbound_monthly_count',
                                                    'interactions_sms', 'roam_revenue', 'investment_score', 'ctr_score','interactions_click',
                                                    'interaction_conversion'])
dataframe = preprocess.handle_nan(dataframe,fillna='missing',drop_outliers=True)


 Features       Count of missing value
Series([], dtype: int64)


In [None]:
import sagemaker


sagemaker.Session().upload_data(bucket='datateam-ml', 
                              path='/home/ec2-user/SageMaker/Accessbank_CTR/Catboost-local/data/train', 
                              key_prefix='Adrenaline-November-CTR/data/train')

In [None]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 15000)

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
params = {
      'model': {
          'iterations':500, 
          "depth" :6, 
          "learning_rate":0.3, 
          "l2_leaf_reg": 10, 
          "loss_function":'Logloss',
          "eval_metric":'AUC'
      },

      'fit': {
        'early_stopping_rounds': 10,
        'verbose': 10
      },

      'fold': {
        'n_splits': 5,
        'shuffle': True,
        'random_state': 0
      }
    }

In [None]:
fold_params = params['fold']
model_params = params['model']
fit_params = params['fit']

In [None]:
skf = StratifiedKFold(**fold_params)

In [None]:
from sklearn.model_selection import train_test_split
y = dataframe['event_type'].astype(int)
X = dataframe.drop(['event_type'], axis=1).sample(n=9000, replace=True, random_state=1)

In [None]:
positive_indices = dataframe[dataframe.event_type == 1].index
# sample_size = len(dataframe[dataframe.event_type == 0])
random_indices = np.random.choice(positive_indices, 7500, replace=False)
good = dataframe.loc[random_indices]
bad = dataframe[dataframe['event_type']==0]

In [None]:
bad.reset_index(drop=True, inplace=True)
good.reset_index(drop=True, inplace=True)

In [None]:
new_data = pd.concat([good,bad])

In [None]:
from sklearn.model_selection import train_test_split
y = new_data['event_type'].astype(int)
X = new_data.drop(['event_type'], axis=1)

## MODEL BUILDING FOR DIFFERENT ALG

In [6]:
X_train, X_valid, y_train, y_valid, lgb_pipeline = pipeline.fit_transform(dataframe,hash_size=200,test_size=0.20)


 location_lga has a high cardinality. It has 658 unique attributes

 device_manufacturer has a high cardinality. It has 338 unique attributes

 device_model has a high cardinality. It has 2638 unique attributes

Done!. Data columns successfully identified and attributes are stored in /data/
['customer_value', 'gender', 'vas_subscriber', 'location_region', 'location_state', 'device_type', 'os_vendor', 'os_name', 'os_version', 'customer_class']
(23378, 665)


In [10]:
rfc = RandomForestClassifier(random_state=42,n_estimators=200)

In [11]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [12]:
X_valid = X_test
y_valid = y_test

NameError: name 'X_test' is not defined

In [14]:
train_predictions = rfc.predict(X_valid)
# print(rfc)
# print("model score: %.3f" % classifier.score(X_test, y_test))
print('confusion matrix')
print(metrics.confusion_matrix(y_valid.astype(int), train_predictions.astype(int)))
print('classification report')
print(metrics.classification_report(y_valid.astype(int), train_predictions.astype(int)))
print('Accuracy : %f' % (metrics.accuracy_score(y_valid.astype(int), train_predictions.astype(int))))
print('f1 score : %f' % (metrics.fbeta_score(y_valid.astype(int), train_predictions.astype(int), beta=1)))

confusion matrix
[[  25 1240]
 [  20 4560]]
classification report
              precision    recall  f1-score   support

           0       0.56      0.02      0.04      1265
           1       0.79      1.00      0.88      4580

    accuracy                           0.78      5845
   macro avg       0.67      0.51      0.46      5845
weighted avg       0.74      0.78      0.70      5845

Accuracy : 0.784431
f1 score : 0.878613


In [None]:
from utils.utils import store_model,load_pickle

In [None]:
store_model('./model/lgb-model.pkl','./model/lgb-pipeline.pkl',(lgb,lgb_pipeline))

## CATBOOST MODELLING

In [None]:
import numpy as np
cate_features_index = np.where(X.dtypes != float)[0]

In [8]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [9]:
from catboost import Pool, CatBoostClassifier, cv
from utils.utils import load_pickle, print_devider
from train.train import get_scores, log_plot
from utils import plot_funcs as pf

from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score,
                             f1_score,
                             confusion_matrix,
                             roc_curve,
                             roc_auc_score,
                             precision_recall_curve,
                             average_precision_score)

In [None]:
def get_scores(y_true, y_pred):
    return {
      'accuracy': accuracy_score(y_true, y_pred),
      'precision': precision_score(y_true, y_pred),
      'recall': recall_score(y_true, y_pred),
      'f1': f1_score(y_true, y_pred),
    }

def devide_by_sum(x):
    return x / x.sum()

def log_plot(args, plot_func, fp):
    if not isinstance(args, (tuple)):
        args = (args,)

    plot_func(*args, fp)
    mlflow.log_artifact(fp)
    os.remove(fp)
    print(f'Logged {fp}')

In [None]:
feature_importances = np.zeros(X.shape[1])
for fold_no, (idx_train, idx_valid) in enumerate(skf.split(X, y)):
    print_devider(f'Fold: {fold_no}')

    X_train, X_valid = X.iloc[idx_train, :], X.iloc[idx_valid, :]
    y_train, y_valid = y.iloc[idx_train], y.iloc[idx_valid]
    model = CatBoostClassifier(**model_params)
    model.fit(X_train, y_train,cat_features=cate_features_index,**fit_params,eval_set=(X_valid, y_valid))
    feature_importances += devide_by_sum(model.feature_importances_) / skf.n_splits
    train_predictions = model.predict(X_valid)
    # evaluate
    scores_valid = get_scores(y_valid, train_predictions)
    print()
    # print("model score: %.3f" % classifier.score(X_test, y_test))
    print('confusion matrix')
    print(metrics.confusion_matrix(y_valid.astype(int), train_predictions.astype(int)))
    print('classification report')
    print(metrics.classification_report(y_valid.astype(int), train_predictions.astype(int)))
    print('Accuracy : %f' % (metrics.accuracy_score(y_valid.astype(int), train_predictions.astype(int))))
    print('f1 score : %f' % (metrics.fbeta_score(y_valid.astype(int), train_predictions.astype(int), beta=1)))
    
    # After you train the model using fit(), save like this - 
    model.save_model('model_name')    # extension not required.

In [None]:
feature_importances

In [None]:
np.array(model.feature_names_)

In [None]:
4536+15+32+1224

In [None]:
y.shape[0]/5

In [None]:
print(model)
# print("model score: %.3f" % classifier.score(X_test, y_test))
print('confusion matrix')
print(metrics.confusion_matrix(y_test.astype(int), train_predictions.astype(int)))
print('classification report')
print(metrics.classification_report(y_test.astype(int), train_predictions.astype(int)))
print('Accuracy : %f' % (metrics.accuracy_score(y_test.astype(int), train_predictions.astype(int))))
print('f1 score : %f' % (metrics.fbeta_score(y_test.astype(int), train_predictions.astype(int), beta=1)))

In [None]:
# cd_test = preprocess.clean_data(keep_data)
# preprocess.map_target(keep_data,'event_type')
y = keep_data['event_type']
# keep_data = preprocess.handle_nan(keep_data,fillna='missing',drop_outliers=True)
preprocess.drop_cols(keep_data,['event_type'])

In [None]:
# And then, later load - 
from catboost import CatBoostClassifier
model = CatBoostClassifier()      # parameters not required.
model.load_model('mlruns/1/fa5d90cfe4574d64b7ebf5325b09810f/artifacts/model_name')

In [None]:
dt = full_pipeline.transform(keep_data)

In [None]:
results = rf.predict(dt)

In [None]:
model_list = [rfc,model]

In [None]:
full_pipeline = load_pickle('./model/pipeline.pkl')

In [None]:
rf = load_pickle('./model/rfc-model.pkl')

In [None]:
model_list = [model,rf,lgb]

In [None]:
!rm -rf mlruns/2/68681dbccaea43268478bdd2b1e6b12f/

In [None]:
import mlflow,os
from collections import defaultdict
mlflow.set_experiment('Access bank CTR exp')

with mlflow.start_run(run_name='Access-bank-model-exp') as run:
    data = raw.read_data('./data/validation.csv')
    preprocess.map_target(data,'event_type')
    y_valid = data['event_type']
    data = preprocess.handle_nan(data,fillna='missing',drop_outliers=True)
    
    roc_list = {}
    for model in model_list:
        print(f'\n{model.__class__.__name__}')
        if model.__class__.__name__== 'CatBoostClassifier':
            preprocess.drop_cols(data,['msisdn.1','event_type'])
            new_data = data
        elif model.__class__.__name__== 'RandomForestClassifier':
            preprocess.drop_cols(data,['msisdn'])#'msisdn.1','event_type',
            new_data = full_pipeline.transform(data)
        else:
            new_data = lgb_pipeline.transform(data)
        
        # predict
        y_valid_proba = model.predict_proba(new_data)[:, 1]
        y_valid_pred = model.predict(new_data)

        scores = defaultdict(int)
        # evaluate
        scores_valid = get_scores(y_valid, y_valid_pred)

        # record scores
        for k, v in scores_valid.items():
            scores[k] += v 

        # scores
        log_plot(scores, pf.scores, f'{model.__class__.__name__}-scores.png')

        # confusion matrix
        cm = metrics.confusion_matrix(y_valid, y_valid_pred)
        log_plot(cm, pf.confusion_matrix, f'{model.__class__.__name__}-confusion_matrix.png')
        
        # roc curve
        fpr, tpr, _ = roc_curve(y_valid, y_valid_proba)
        roc_auc = roc_auc_score(y_valid, y_valid_pred)
        roc_list[f'{model.__class__.__name__}'] = [fpr,tpr,roc_auc]
        
    log_plot((model_list,roc_list), pf.multiple_roc_curve, 'roc_curve.png')

In [None]:
data = raw.read_data(path='./data/validation.csv')
preprocess.map_target(data,'event_type')
data = preprocess.handle_nan(data,fillna='missing',drop_outliers=True)

In [None]:
# cd_test = preprocess.clean_data(keep_data)
# preprocess.map_target(keep_data,'event_type')
y = data['event_type']
# keep_data = preprocess.handle_nan(keep_data,fillna='missing',drop_outliers=True)
test = preprocess.drop_cols(data,['msisdn.1','event_type'])

In [None]:
results = model.predict(test)

In [None]:
# print(model)
# print("model score: %.3f" % classifier.score(X_test, y_test))
print('confusion matrix')
print(metrics.confusion_matrix(y.astype(int), results.astype(int)))
print('classification report')
print(metrics.classification_report(y.astype(int), results.astype(int)))
print('Accuracy : %f' % (metrics.accuracy_score(y.astype(int), results.astype(int))))
print('f1 score : %f' % (metrics.fbeta_score(y.astype(int), results.astype(int), beta=1)))

In [None]:
!rm -rf /home/ec2-user/SageMaker/Accessbank_CTR/Catboost-local/foo

In [None]:
import shutil


shutil.copy('/home/ec2-user/SageMaker/Accessbank_CTR/Catboost-local/test.csv', '/home/ec2-user/SageMaker/Accessbank_CTR/Catboost-sagemaker/container/local_test/test_dir/input/data/training/')

In [None]:
model = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.3, l2_leaf_reg = 10, loss_function='Logloss',eval_metric='AUC')
model.fit(X_train, y_train,cat_features=cate_features_index,eval_set=(X_test, y_test),early_stopping_rounds=10,plot=True)

In [None]:
y_valid_pred = model.predict(X_test)

In [None]:
np.round(y_valid_pred,3)

In [None]:
pd.DataFrame({'msisdn':y_valid.index,'results':y_valid_pred.flatten()}).to_csv('hey.csv',index=False,
                                                                       sep='|',header=['msisdn','ctr_access_bank'])

In [None]:
from train.train import get_scores
import utils.plot_funcs as pf

In [None]:
scores_valid = get_scores(y_valid, y_valid_pred)

In [None]:
def devide_by_sum(x):
    return x / x.sum()

In [None]:
devide_by_sum(model.feature_importances_)

In [None]:
mis = defaultdict(int)
for i in range(0,5):
    for k, v in scores_valid.items():
        mis[k]+= v/5

In [None]:
skf.n_splits

In [None]:
mis

In [None]:
pf.scores(mis,'scores.png')

In [None]:
features = np.array(model.feature_names_)
pf.feature_importance(features, feature_importances, 'Feature Importance','feature_importance.png')

In [None]:
metrics = []
metrics.append({
              'name': ['AUC'],
              'values': model.evals_result_['learn']['Precision'],
              'best_iteration': model.best_iteration_})

In [None]:
pf.metric(metrics, 'metric_history.png')

In [None]:
cm = confusion_matrix(y_valid, y_valid_pred)
pf.confusion_matrix(cm, 'confusion_matrix.png')

In [None]:
skf.__class__.__name__

In [None]:
import os
os.remove('confusion_matrix.png')

In [None]:
# model.evals_result_['learn'][]