In [1]:
import sys
sys.path.append('..')

import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd, numpy as np, gc
from tqdm import tqdm
import joblib
import dask.multiprocessing
import cudf, cupy, time
from numba import jit, njit, prange
from sklearn.metrics import precision_recall_curve, auc, log_loss
# from sklearn.model_selection import train_test_split
from dask_ml.model_selection import train_test_split

dask.config.set(schedular='process')
pd.set_option('display.max_columns', 500)

from utils.cuda_cluster import *
from utils.preprocessing import read_data, factorize_small_cardinality
from utils.util import *


import core.config as conf

Perhaps you already have a cluster running?
Hosting the HTTP server on port 45975 instead
  http_address["port"], self.http_server.port


In [2]:
path = f'{conf.preproc_path}/train/part-00001.parquet'
train = read_data(path)
gc.collect()
save_memory( train )

In [3]:
label_names = ['reply', 'retweet', 'retweet_comment', 'like']
DONT_USE = ['timestamp','creator_account_creation','engager_account_creation','engage_time',
            'creator_account_creation', 'engager_account_creation',
            'fold','tweet_id', 
            'tr','dt_day','','',
            'engager_id','creator_id','engager_is_verified',
            'elapsed_time',
            'links','domains','hashtags0','hashtags1',
            'hashtags','tweet_hash','dt_second','id',
            'tw_hash0',
            'tw_hash1',
            'tw_rt_uhash',
            'same_language', 'nan_language','language',
            'tw_hash', 'tw_freq_hash','tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word',
            'ypred','creator_count_combined','creator_user_fer_count_delta_time','creator_user_fing_count_delta_time','creator_user_fering_count_delta_time','creator_user_fing_count_mode','creator_user_fer_count_mode','creator_user_fering_count_mode'
            
           ]
DONT_USE += label_names
features = [c for c in train.columns if c not in DONT_USE]

print('Using %i features:'%(len(features)))
np.asarray(features)

Using 11 features:


array(['creator_follower_count', 'creator_following_count',
       'tweet_timestamp', 'dt_dow', 'dt_hour', 'tweet_type', 'media',
       'TE_engager_id_like', 'TE_engager_id_tweet_type_language_like',
       'TE_creator_id_like', 'TE_domains_media_tweet_type_language_like'],
      dtype='<U41')

In [4]:
train0, test = train_test_split(train, test_size=0.2, random_state=777, shuffle=False)
train, valid = train_test_split(train0, test_size=0.2, random_state=777, shuffle=False)
test0, test1 = train_test_split(test, test_size=0.5, random_state=777, shuffle=False)

In [5]:
len(train), len(valid), len(test0), len(test1)

(1940351, 485929, 303895, 304028)

In [6]:
xgb_parms = { 
    'max_depth':8, 
    'learning_rate':0.025, 
    'subsample':0.85,
    'colsample_bytree':0.35, 
    'eval_metric':'logloss',
    'objective':'binary:logistic',
    'tree_method':'gpu_hist',
    #'predictor': 'gpu_predictor',
    'seed': 1,
}

import xgboost as xgb
print('XGB Version',xgb.__version__)

XGB Version 1.3.3


In [7]:
# CREATE TRAIN AND VALIDATION SETS
RMV = [c for c in DONT_USE if c in train.columns]

In [8]:
#LEarning rates for 'reply', 'retweet', 'retweet_comment', 'like'
LR = [0.05,0.03,0.07,0.01]

#Like
xgb_parms['learning_rate'] = LR[3]
TARGET = 'like'
print('#'*25);print('###',TARGET);print('#'*25)

dtrain = xgb.DMatrix(data=train.drop(RMV, axis=1).compute().to_pandas() ,label=train[TARGET].compute().values)
gc.collect()

model = xgb.train(xgb_parms, 
                  dtrain=dtrain,
                  num_boost_round=500,
                 ) 

del dtrain
gc.collect()  

#save model
path = f'/hdd/models'
joblib.dump(model, f'{path}/model-'+TARGET+'-1.xgb' ) 
del model
gc.collect()  

#########################
### like
#########################


9

In [29]:
model = joblib.load( f'{path}/model-'+TARGET+'-1.xgb' )
dtrain = xgb.DMatrix(data=train.drop(RMV, axis=1).compute().to_pandas() ,label=train[TARGET].compute().values)
train = train.compute()
train['ypred'] = model.predict(dtrain)
del dtrain, model
_=gc.collect()

In [30]:
model = joblib.load( f'{path}/model-'+TARGET+'-1.xgb' )
dvalid = xgb.DMatrix(data=valid.drop(RMV, axis=1).compute().to_pandas() ,label=valid[TARGET].compute().values)
valid = valid.compute()
valid['ypred'] = model.predict(dvalid)
del dvalid, model
_=gc.collect()

In [31]:
train.head()

Unnamed: 0_level_0,creator_id,engager_id,tweet_id,creator_follower_count,creator_following_count,tweet_timestamp,reply,retweet,like,dt_day,dt_dow,dt_hour,language,tweet_type,media,domains,TE_engager_id_like,TE_engager_id_tweet_type_language_like,TE_creator_id_like,TE_domains_media_tweet_type_language_like,ypred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,593795B66A046F366FA0812684BD7797,C4201D76A9C681C58837CC0D9B93DFF8,343B08D03026C52C85F5E12C7F68F1EE,465,589,1613359840,0,0,0,15,0,3,48,1,0,0,,,,0.250149,0.297264
2,59151EC83753ED66049FD5EBA0597731,5623390A5651756A88F4E2681D0E589D,7D823366B3EE108A110A8ED2192E5822,78985,818,1612532416,0,0,0,5,4,13,48,2,0,0,0.0,0.0,0.0,0.0,0.405553
3,FD2A37001BC20B84288116E892EC593B,E2EE3768D846F7045F045A5346722548,4E19116D1DF976844E4A97EF817C007F,5222,4422,1613930163,0,0,0,21,6,17,48,2,0,0,,,,0.458571,0.380323
4,C191DA726786A709A48B6EB9ED71A8EA,CA88FAA145122A84ECBDE8DEEE9D7ECC,36247CA39A0FD3F0C21961F58BA76B5E,4229,652,1613482520,0,0,0,16,1,13,60,2,0,41351,,,,0.328404,0.307032
5,849A4D7C233104080FDDE88F820B525B,4BB0CE94D9D0FCA8F899C776CEF15E75,5FDCCAB4E74417FAA4F9D59778557665,2268,2162,1614086539,0,0,0,23,1,13,19,1,3,0,,,0.375318944,0.344966,0.294962


In [34]:
valid.head()

Unnamed: 0_level_0,creator_id,engager_id,tweet_id,creator_follower_count,creator_following_count,tweet_timestamp,reply,retweet,like,dt_day,dt_dow,dt_hour,language,tweet_type,media,domains,TE_engager_id_like,TE_engager_id_tweet_type_language_like,TE_creator_id_like,TE_domains_media_tweet_type_language_like,ypred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,AF83173A15D2CB7A968708B89EE89ED3,6A47A33F79CE79D962D908217FF33878,D060B363AD934A8B1B7C63DE3C628B73,96302,397,1613893756,0,0,0,21,6,7,19,0,0,0,0.358258992,0.375318944,0.375319,0.371144,0.305395
15,43013DD2B145D0EBEC2B9990D05FEDF6,E206C2C68241177AABA2A1F757FB4F76,A997714FA24C892AACBCCEFA9E68A0E5,528754,498,1613842251,0,0,1,20,5,17,19,2,0,0,,,0.227736,0.522969,0.307298
16,CC7614BB76345F063453D794FC5BD969,041120B3ACF628C9F85E455C224A8EB8,F0C2C4472D1AE3E912EC184B69E79395,2157,451,1612516261,0,0,0,5,4,9,62,0,0,0,0.0,0.0,0.0,0.0,0.36204
23,B3242AE2DC6B4D3F14A1117CF9F05F60,97B5B9279A004524C5B183FB91969F36,9E73D94E0157F4DB17FEF8676E427268,864351,260,1613080942,0,0,0,11,3,22,19,1,0,0,,,0.282695,0.250346,0.128103
34,8B56E208B2DC7DDA9017DCF943154239,51B01A412B9AD25CAFA58972C0851032,DBAF8BCD9697F4CDA46AA6538B8329D8,3377649,1,1613133609,0,0,1,12,4,12,61,2,7,11446,,,0.435421,0.410217,0.433228


In [35]:
valid[['like', 'ypred']]

Unnamed: 0_level_0,like,ypred
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10,0,0.305395
15,1,0.307298
16,0,0.362040
23,0,0.128103
34,1,0.433228
...,...,...
3033640,1,0.354002
3033643,0,0.511311
3033647,0,0.446225
3033652,1,0.342373


In [38]:
rce_like = compute_rce(valid['ypred'].to_array(), valid['like'].to_array())
rce_like

10.787183698752855

In [48]:
pred = valid['ypred'].to_array()
pred = list(map(lambda x: 1 if x > 0.5 else 0,  pred))
valid['ypred'] = pred
ap_like = average_precision_score(valid['ypred'].to_array(), valid['like'].to_array())
ap_like

0.5964026708238265