In [1]:
import sys
sys.path.append('../..')

import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd, numpy as np, gc
from tqdm import tqdm
import joblib
import dask.multiprocessing
import cudf, cupy, time
from numba import jit, njit, prange
from sklearn.metrics import precision_recall_curve, auc, log_loss
# from sklearn.model_selection import train_test_split
from dask_ml.model_selection import train_test_split

dask.config.set(schedular='process')
pd.set_option('display.max_columns', 500)

from utils.cuda_cluster import *
from utils.preprocessing import read_data, factorize_small_cardinality
from utils.util import *
from utils.evaluate import calculate_ctr, compute_rce, average_precision_score


import core.config as conf

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46165 instead
  http_address["port"], self.http_server.port


In [4]:
path = f'{conf.preproc_path}/train/part-00250.parquet'
train = read_data(path)
gc.collect()
save_memory( train )

In [5]:
label_names = ['reply', 'retweet', 'retweet_comment', 'like']
DONT_USE = ['timestamp','creator_account_creation','engager_account_creation','engage_time',
            'creator_account_creation', 'engager_account_creation',
            'fold','tweet_id', 
            'tr','dt_day','','',
            'engager_id','creator_id','engager_is_verified',
            'elapsed_time',
            'links','domains','hashtags0','hashtags1',
            'hashtags','tweet_hash','dt_second','id',
            'tw_hash0',
            'tw_hash1',
            'tw_rt_uhash',
            'same_language', 'nan_language','language',
            'tw_hash', 'tw_freq_hash','tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word',
            'ypred','creator_count_combined','creator_user_fer_count_delta_time','creator_user_fing_count_delta_time','creator_user_fering_count_delta_time','creator_user_fing_count_mode','creator_user_fer_count_mode','creator_user_fering_count_mode'
            
           ]
DONT_USE += label_names
features = [c for c in train.columns if c not in DONT_USE]

print('Using %i features:'%(len(features)))
np.asarray(features)

Using 11 features:


array(['creator_follower_count', 'creator_following_count',
       'tweet_timestamp', 'dt_dow', 'dt_hour', 'tweet_type', 'media',
       'TE_engager_id_like', 'TE_engager_id_tweet_type_language_like',
       'TE_creator_id_like', 'TE_domains_media_tweet_type_language_like'],
      dtype='<U41')

In [6]:
train0, test = train_test_split(train, test_size=0.2, random_state=777, shuffle=False)
train, valid = train_test_split(train0, test_size=0.2, random_state=777, shuffle=False)
test0, test1 = train_test_split(test, test_size=0.5, random_state=777, shuffle=False)

In [7]:
len(train), len(valid), len(test0), len(test1)

(1176943, 293669, 183803, 183952)

In [8]:
xgb_parms = { 
    'max_depth':8, 
    'learning_rate':0.025, 
    'subsample':0.85,
    'colsample_bytree':0.35, 
    'eval_metric':'logloss',
    'objective':'binary:logistic',
    'tree_method':'gpu_hist',
    #'predictor': 'gpu_predictor',
    'seed': 1,
}

import xgboost as xgb
print('XGB Version',xgb.__version__)

XGB Version 1.3.3


In [9]:
# CREATE TRAIN AND VALIDATION SETS
RMV = [c for c in DONT_USE if c in train.columns]

In [10]:
#LEarning rates for 'reply', 'retweet', 'retweet_comment', 'like'
LR = [0.05,0.03,0.07,0.01]

#Like
xgb_parms['learning_rate'] = LR[3]
TARGET = 'like'
print('#'*25);print('###',TARGET);print('#'*25)

dtrain = xgb.DMatrix(data=train.drop(RMV, axis=1).compute().to_pandas() ,label=train[TARGET].compute().values)
gc.collect()

model = xgb.train(xgb_parms, 
                  dtrain=dtrain,
                  num_boost_round=500,
                 ) 

del dtrain
gc.collect()  

#save model
path = f'/hdd/models'
joblib.dump(model, f'{path}/model-'+TARGET+'-1.xgb' ) 
del model
gc.collect()  

#########################
### like
#########################


9

In [11]:
model = joblib.load( f'{path}/model-'+TARGET+'-1.xgb' )
dtrain = xgb.DMatrix(data=train.drop(RMV, axis=1).compute().to_pandas() ,label=train[TARGET].compute().values)
train = train.compute()
train['ypred'] = model.predict(dtrain)
del dtrain, model
_=gc.collect()

In [12]:
model = joblib.load( f'{path}/model-'+TARGET+'-1.xgb' )
dvalid = xgb.DMatrix(data=valid.drop(RMV, axis=1).compute().to_pandas() ,label=valid[TARGET].compute().values)
valid = valid.compute()
valid['ypred'] = model.predict(dvalid)
del dvalid, model
_=gc.collect()

In [13]:
train.head()

Unnamed: 0_level_0,creator_id,engager_id,tweet_id,creator_follower_count,creator_following_count,tweet_timestamp,reply,retweet,like,dt_day,dt_dow,dt_hour,language,tweet_type,media,domains,TE_engager_id_like,TE_engager_id_tweet_type_language_like,TE_creator_id_like,TE_domains_media_tweet_type_language_like,ypred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,F573BB89C6E4BF9CD0884AFBB45003EE,1D990BC200E4A9C1F1F46F03D3A16EE6,B19367F2D0A6F4177A4FE70EEF37F9C1,208,16,1614031094,0,0,0,22,0,21,58,1,0,7600,0.375623673,,,0.264406294,0.245653
2,558D549648F2219DE58AF2A6F5945130,E2CEA3C60D1C401F13BF74A4BC5CF289,6349ED17CBEA67D49D93BD06DF0ED103,18896,999,1612415131,0,0,0,4,3,5,19,2,0,0,0.0,0.0,0.0,0.0,0.425895
3,7F060D677FA53211C903016305F3BF77,FD7D8B0CBEC9EED3F4BD14E8292A1456,277C83C44D341B1F0AD548E8516F3320,293813,14,1613560362,0,0,1,17,2,11,44,2,4,5790,,,0.404004425,,0.296927
4,F2BE120C25500085356B91706F332AC5,469B4475BB89CF461DCDAFA37EAA4E3A,4E4186C715710C1E8C195896AB014480,938,123,1612582926,0,0,0,6,5,3,19,1,0,0,,,,0.246721998,0.283123
5,7263249D4A8E1A4E17ECFB95CD6DD3E3,EA920AD7AD41B98498968BC6F8CEB0B9,CD2C55F0420A5FE4CA8A07E8F7CFD205,10637,3122,1613852360,0,0,0,20,5,20,19,1,0,0,,,0.449458957,0.24616152,0.404068


In [14]:
valid.head()

Unnamed: 0_level_0,creator_id,engager_id,tweet_id,creator_follower_count,creator_following_count,tweet_timestamp,reply,retweet,like,dt_day,dt_dow,dt_hour,language,tweet_type,media,domains,TE_engager_id_like,TE_engager_id_tweet_type_language_like,TE_creator_id_like,TE_domains_media_tweet_type_language_like,ypred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,25438C5B9A2CCEACC6CCD142133F4347,DC89E189F6C7D7D2E46CB13A5B8B7F9E,B6F105BB4528F7E2C9A7F2CF4D39E413,1947,2443,1612476238,0,0,1,4,3,22,19,2,0,0,0.0,0.0,0.0,0.0,0.429066
15,58848672FBEE358AB36F8BA118C89AA4,24BF9F89EAADAE215160F0893A36F670,B339DD6677AE2099647564663383A457,1054,331,1612525170,0,0,0,5,4,11,19,1,0,0,0.0,0.0,0.0,0.0,0.276474
16,B7B85001F0DD6D7214AC79BCEEF15880,53D7CD0D42B6DC648F88CCE7A74BEF1F,0BB214960CC86D6AAFD264337A5108A8,217,312,1613312389,0,0,0,14,6,14,10,1,0,0,,,,0.138013,0.185827
23,7ED06258B2170020067DAA1AE3BF71CB,364247E6EDDEED0CCD92D17A41D75F3A,2B456087C228A69A6066E093499BE0DB,4173,4879,1612466342,0,0,0,4,3,19,10,2,0,15581,0.0,0.0,0.0,0.0,0.432372
34,BB793C5816CC0750EBE254FADA02C997,D541F2C5300D58F7E8B69C0DBD20CCE7,F4EB5A5CFF92053074A43C0CD42FCE94,212971,270,1613527910,0,0,0,17,2,2,19,2,0,0,,,0.45367071,0.524736,0.574264


In [15]:
valid[['like', 'ypred']]

Unnamed: 0_level_0,like,ypred
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10,1,0.429066
15,0,0.276474
16,0,0.185827
23,0,0.432372
34,0,0.574264
...,...,...
1837933,0,0.522132
1837949,0,0.493512
1837965,1,0.631236
1837970,1,0.284582


In [16]:
rce_like = compute_rce(valid['ypred'].to_array(), valid['like'].to_array())
rce_like

10.061644930151159

In [17]:
pred = valid['ypred'].to_array()
pred = list(map(lambda x: 1 if x > 0.5 else 0,  pred))
valid['ypred'] = pred
ap_like = average_precision_score(valid['ypred'].to_array(), valid['like'].to_array())
ap_like

0.3466263819755379