In [3]:
import sys
sys.path.append('..')

from utils.cuda_cluster import *
from utils.dataset import *
from utils.util import save_memory
import dask as dask, dask_cudf

from tqdm import tqdm

import core.config as conf

In [4]:

data_path = conf.raw_data_path + 'part-00175'
df = read_data(data_path, n_partitions=conf.n_partitions)
df.columns = conf.raw_features + conf.labels

df = df.drop('text_tokens', axis=1)

df, = dask.persist(df)
_ = wait(df)

In [5]:
df.columns

Index(['hashtags', 'tweet_id', 'media', 'links', 'domains', 'tweet_type',
       'language', 'tweet_timestamp', 'creator_id', 'creator_follower_count',
       'creator_following_count', 'creator_is_verified',
       'creator_account_creation', 'engager_id', 'enaging_user_follower_count',
       'engager_following_count', 'engager_is_verified',
       'engager_account_creation', 'creator_follows_engager',
       'reply_timestamp', 'retweet_timestamp',
       'retweet_with_comment_timestamp', 'like_timestamp'],
      dtype='object')

In [6]:
features = ['creator_id', 'engager_id', 'tweet_id', 'tweet_type', 'language', 'creator_follower_count', 'creator_following_count', 'domains', 'media', 'tweet_timestamp']
df = feature_extraction(df, features=features, labels=conf.labels)

100%|██████████| 4/4 [00:05<00:00,  1.31s/it]
 47%|████▋     | 7/15 [00:00<00:00, 69739.02it/s]
100%|██████████| 3/3 [00:11<00:00,  3.78s/it]
100%|██████████| 3/3 [00:00<00:00,  4.02it/s]


In [7]:
df.head()

Unnamed: 0,creator_id,engager_id,creator_follower_count,creator_following_count,tweet_timestamp,reply,retweet,like,dt_day,dt_dow,dt_hour,language,tweet_type,media,id,domains
0,51AA232EC3465EF97AD10F86F4D0FDAF,E080FA54ADDDE2AD60D16732B6A21873,1175,927,1613997245,0,0,0,22,0,12,61,1,5,6177,0
1,B827BA8BEE7FC11C2947C32DB34B0166,F9D10F33314377A87D26ADD2F6C9BDE6,372,585,1612589286,0,0,1,6,5,5,19,1,0,6193,0
2,C8C9007AF0441782FCFD7B3F1AA3A0B9,0FDAE3795B5418C9FE09B7B7E007D2AF,306,210,1612630944,0,0,1,6,5,17,19,1,7,6185,0
3,6C0AC44474BD6DF4ACC29F651D2DC85A,660FD10143BBE8D295B5621AE275746B,72,122,1613042890,0,0,1,11,3,11,19,1,10,6201,0
4,3725049251C78314DCBC2DA6A5030A1D,0F07542C9C78CA86DCCC4EF0BA16A791,3444,982,1612891741,0,0,0,9,1,17,19,1,0,6178,0


In [8]:
t = 'like'
df = df.compute().to_pandas() # to pandas
for c in tqdm([
    ['engager_id'],
    ['engager_id','tweet_type','language'],
    ['creator_id'],
    ['domains','media','tweet_type','language']
    ]):
    fname = 'TE_'+'_'.join(c)+'_'+t
    print( fname )
    df[fname] = tartget_encoding( df, c, t, 20, 0 )
df = cudf.from_pandas(df)
df = dask_cudf.from_cudf(df,  npartitions=conf.n_partitions).reset_index().drop('index', axis=1)

  0%|          | 0/4 [00:00<?, ?it/s]TE_engager_id_like
 25%|██▌       | 1/4 [00:27<01:21, 27.27s/it]TE_engager_id_tweet_type_language_like
 50%|█████     | 2/4 [01:21<01:25, 43.00s/it]TE_creator_id_like
 75%|███████▌  | 3/4 [01:40<00:32, 32.28s/it]TE_domains_media_tweet_type_language_like
100%|██████████| 4/4 [01:45<00:00, 26.29s/it]


In [35]:
file_path = f'{conf.preproc_path}/train/' + 'part-00175.parquet'
save_parquet(df, file_path)

In [9]:
# del df 

# client.close()
# cluster.close()

In [36]:
df.head()

Unnamed: 0,creator_id,engager_id,creator_follower_count,creator_following_count,tweet_timestamp,reply,retweet,like,dt_day,dt_dow,dt_hour,language,tweet_type,media,id,domains,TE_engager_id_like,TE_engager_id_tweet_type_language_like,TE_creator_id_like,TE_domains_media_tweet_type_language_like
0,51AA232EC3465EF97AD10F86F4D0FDAF,E080FA54ADDDE2AD60D16732B6A21873,1175,927,1613997245,0,0,0,22,0,12,61,1,5,6177,0,,,,0.349328
1,58DC3AFB7649EDED0EB2A710CE85924C,0C83D6251B3ED28945F1194E6E359DBC,10692,8394,1614082017,0,0,0,23,1,12,61,2,0,287221,0,,,,0.454765
2,E1386586D16DC85D18BBBBE2D966F014,DC03F3C543AC389D50C4254BA2CB10DF,67489,155,1612988313,0,0,0,10,2,20,7,2,4,550281,0,,,,0.589558
3,275F79705BFB784B7D626677F1966EED,2D41203A7D3A80F093B2EAA879B2E773,11654,136,1613404808,0,0,1,15,0,16,19,2,4,830432,0,,,,0.546565
4,A80C76361A379BC3CCF9E81CCEC21A08,0AAE6B93BB827DBC2352E7A1A0600A12,1109,1206,1612684449,1,0,0,7,6,7,46,2,1,1098136,0,,,,0.493198
