In [None]:
import gc
import os
import sys
sys.path.append('..')

import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse as sps
import pandas as pd

from utils.preprocessing import *
from utils.util import chainer
from utils.target_encode import MTE_one_shot

import pickle
from collections import defaultdict

from tqdm import tqdm
import core.config as conf

## Load data

In [None]:
# Read the first raw part file and keep only the columns needed for the
# hashtag analysis, restricted to tweets that actually carry hashtags.
data_path = conf.raw_lzo_path + 'part-00000'
ori_df = read_data(data_path)

hashtag_columns = [
    'hashtags', 'tweet_type', 'language', 'tweet_id',
    'reply_timestamp', 'retweet_timestamp', 'comment_timestamp', 'like_timestamp',
]
df = ori_df[hashtag_columns].copy().dropna(subset=['hashtags'])

# Every remaining row has a non-null 'hashtags' value, so the row count is the answer.
print('total exampls with hastags: ', len(df))

In [None]:
# Preview the first rows of the hashtag-bearing subset.
df.head()

In [None]:
# Hashtags are tab-separated inside a single string: keep the split list and
# the number of hashtags per tweet (number of tabs + 1).
raw_hashtags = df['hashtags']
df['hashtags_list'] = raw_hashtags.str.split('\t')
df['hashtags_cnt'] = raw_hashtags.str.count('\t').astype(int) + 1

hashtags_per_tweet = df['hashtags_cnt']
print('max hashtag count: ', hashtags_per_tweet.max())
print('min hashtag count: ', hashtags_per_tweet.min())

In [None]:
# Expand the frame to one row per (tweet, hashtag): every kept column is
# repeated once per hashtag of its tweet, and 'hashtags' becomes the flattened
# sequence returned by chainer.
repeats = df['hashtags_cnt']
exploded_columns = {'id': np.repeat(df.index.to_series(), repeats)}
for column in ['tweet_type', 'language', 'retweet_timestamp',
               'comment_timestamp', 'like_timestamp', 'reply_timestamp']:
    exploded_columns[column] = np.repeat(df[column], repeats)
exploded_columns['hashtags'] = chainer(df['hashtags'])  # split into rows

df = pd.DataFrame(exploded_columns)


In [None]:
# Preview the exploded frame (one row per tweet/hashtag pair).
df.head()

In [None]:
# Number of rows after expanding to one row per hashtag.
len(df)

## Encode hashtags & language

In [None]:
# Language encoding: load the language lookup table and map each language
# value to its integer id.
langauge_df = read_data(
    '../data/language.csv',
    sep=',',
    features=['language_id', 'language', 'language_type'],
).reset_index(drop=True)

language_to_idx = {
    name: idx for name, idx in zip(langauge_df['language'], langauge_df['language_id'])
}
idx_to_language = {
    idx: name for idx, name in zip(langauge_df['language_id'], langauge_df['language'])
}

# Direct dict indexing on purpose: a language missing from the table raises KeyError.
df['language_encode'] = df['language'].apply(lambda name: language_to_idx[name])

In [None]:
# Labeling: replace every '<name>_timestamp' column by a '<name>' column that
# is 1 where the timestamp is positive and 0 where it is <= 0.
# Rows whose timestamp is NaN match neither mask, so their label stays NaN.
for label in conf.labels:
    label_name = label.split('_')[0]
    timestamps = df[label]
    df.loc[timestamps <= 0, label_name] = 0
    df.loc[timestamps > 0, label_name] = 1
    df = df.drop(columns=[label])

In [None]:
# Hashtag encoding: give every distinct hashtag an integer id in order of first
# appearance (the first value gets 0, the next new value 1, ...).
#
# `vari` holds one list per row: [np.int32(id)] for a real hashtag and [] for
# the empty string produced by fillna('').
#
# The previous version also tracked a per-hashtag frequency and argsorted a
# single-element list with it; that never affected the result, so it is gone.
# It also indexed var[0] unconditionally, which failed on an empty frame.
var = df['hashtags'].fillna('').values.copy()
gc.collect()

hashtag_ids = {}
for v in var:
    # setdefault keeps the id of the first occurrence and ignores later ones
    hashtag_ids.setdefault(v, len(hashtag_ids))

vari = [[np.int32(hashtag_ids[v])] if v != '' else [] for v in var]

del hashtag_ids
gc.collect()

len(vari), vari[:10]

In [None]:
# Take the first id of each per-row list in `vari` and store it as an int32 column.
# NOTE(review): v[0] raises IndexError for any empty list in `vari` (built for '' hashtags) — confirm none occur.
df['hashtags_encode'] = np.array( [v[0] for v in vari ] ).astype( np.int32 )

In [None]:
# Display the frame after language/hashtag encoding and labeling.
df

In [None]:
# Report how many (tweet, hashtag) rows exist and how many distinct hashtag
# values they contain (NaN, if present, counts as one value).
n_rows = len(df)
n_unique_hashtags = df['hashtags'].nunique(dropna=False)
print('total rows: ', n_rows)
print('total unique hashtags: ', n_unique_hashtags)

In [None]:
# Rows per encoded language, most frequent first (ascending sort, then reversed).
language_cnt = df['language_encode'].value_counts().sort_values().iloc[::-1]

In [None]:
# Display the per-language row counts.
language_cnt

In [None]:
# Rows per hashtag, most frequent first (ascending sort, then reversed).
hashtag_cnt = df['hashtags'].value_counts().sort_values().iloc[::-1]

In [None]:
# Display the per-hashtag row counts.
hashtag_cnt

In [None]:
# All language ids from the lookup table, and how many there are.
language_types = langauge_df['language_id'].to_list()
n_languages = len(language_types)

In [None]:
# Declaring some auxiliary structures: one int32 counter per language.
# np.zeros is used for all of them; np.empty returns uninitialized memory, so
# the counters would otherwise start from arbitrary values, and passing dtype
# directly avoids the extra copy made by .astype().
n_engagements_arr = np.zeros(n_languages, dtype='int32')
n_positive_engagements_arr = np.zeros(n_languages, dtype='int32')
n_negative_engagements_arr = np.zeros(n_languages, dtype='int32')
n_like_engagements_arr = np.zeros(n_languages, dtype='int32')
n_retweet_engagements_arr = np.zeros(n_languages, dtype='int32')
n_reply_engagements_arr = np.zeros(n_languages, dtype='int32')
n_comment_engagements_arr = np.zeros(n_languages, dtype='int32')

In [None]:
# Positive engagements per (language, hashtag).
# Only the label columns created by the labeling cell are summed: summing every
# column would also add up row ids / encoded ids and try to "sum" the string
# columns. NaN labels are skipped by sum().
engagement_columns = [label.split('_')[0] for label in conf.labels]
aggregate_result = df.groupby(['language_encode', 'hashtags'])[engagement_columns].sum()
aggregate_result.head()

In [None]:
# Occurrences of every hashtag within every language, as a flat frame with
# columns language_encode, hashtags, cnt (cnt = non-null 'id' values per group).
aggregate_result = (
    df.groupby(['language_encode', 'hashtags'])['id']
    .count()
    .rename('cnt')
    .reset_index()
)


In [None]:
# Display the per-(language, hashtag) counts.
aggregate_result

In [None]:
# Preview the first rows of the per-(language, hashtag) counts.
aggregate_result.head()

In [None]:
# Number of distinct (language, hashtag) pairs.
len(aggregate_result)

In [None]:
# For every language index keep its `top_n` most frequent hashtags.
# language_hashtags[i] is always a DataFrame (empty when language i has no
# rows). The previous bare `except: pass` could leave a plain [] in the list
# and hide the real error until a later cell indexed it with ['cnt'].
top_n = 100
language_hashtags = [
    aggregate_result.loc[aggregate_result['language_encode'] == i]
    .sort_values('cnt', ascending=False)
    .head(top_n)
    for i in range(n_languages)
]

In [None]:
# Preview the top hashtags kept for language index 0.
language_hashtags[0].head()

In [None]:
# Counts of the top hashtags kept for language index 0.
language_hashtags[0]['cnt']

In [None]:
# One bar chart per language for the first five language indices:
# bar height is the count of each kept hashtag, in rank order.
n_preview = min(5, n_languages)
for lang_idx in range(n_preview):
    top_tags = language_hashtags[lang_idx]
    ranks = range(len(top_tags))
    plt.bar(ranks, top_tags['cnt'])
    plt.show()

In [None]:
# Print the largest hashtag count of every language (0 when it has no hashtags).
for i in range(n_languages):
    counts = language_hashtags[i]['cnt']
    if len(counts):
        print(f"Language {i} - max tag count: {int(counts.max()):n} ")
    else:
        print(f"Language {i} - max tag count: 0 ")

## Analyze hashtag counts for all data

In [None]:
# Language encoder: load the lookup table and build both directions of the
# language <-> id mapping, plus the list and number of language ids.
language_df = read_data(
    '../data/language.csv',
    sep=',',
    features=['language_id', 'language', 'language_type'],
).reset_index(drop=True)

language_to_idx = {
    name: idx for name, idx in zip(language_df['language'], language_df['language_id'])
}
idx_to_language = {
    idx: name for idx, name in zip(language_df['language_id'], language_df['language'])
}

language_types = language_df['language_id'].to_list()
n_languages = len(language_types)

In [None]:
# Show the label column names configured in core.config.
conf.labels

In [49]:
# Build a hashtag histogram per language from every raw part file.
#
# For each part file: keep the tweets that have hashtags, expand them to one
# row per (tweet, hashtag), aggregate per (language, hashtag), keep the `top_n`
# most frequent hashtags of each language in that file and add them to running
# totals. The totals are pickled once, after the last file, to
# ../data/hashtag/_hashtag_dict{i}.pickle as
#     {hashtag: [cnt, reply, retweet, comment, like]}
#
# Changes compared with the previous version of this cell:
#   * engagement columns are explicit 0/1 flags that get summed, instead of
#     relying on count() skipping NaN labels (which also counted timestamps <= 0);
#   * a missing or empty pickle is treated as "no totals yet" instead of
#     crashing on open(); any other load error is no longer swallowed;
#   * totals are kept in memory and written once, so an interrupted run leaves
#     the files on disk untouched instead of partially updated.
#   * the save_memory(df) call before `del df` is gone: the exploded frame is now
#     local to _aggregate_part and is released when that function returns.
#
# NOTE(review): a hashtag only contributes from the files where it reached that
# file's top_n, so totals of borderline hashtags are lower bounds.
# NOTE(review): with merge_with_existing=True (the original behaviour) running
# this cell twice adds every file twice; set it to False to rebuild from scratch.
# NOTE(review): the loader cell further down reads 'hashtag_dict{i}.pickle'
# (no leading underscore) — confirm which name is intended.

top_n = 100                  # hashtags kept per language *per part file*
merge_with_existing = True   # True: start from the totals already pickled on disk
hashtag_dir = '../data/hashtag'
stat_columns = ['cnt', 'reply', 'retweet', 'comment', 'like']  # order inside each pickled list
raw_columns = ['hashtags', 'tweet_type', 'language', 'tweet_id',
               'reply_timestamp', 'retweet_timestamp', 'comment_timestamp', 'like_timestamp']


def _hashtag_dict_path(language_idx):
    """Return the pickle path that stores the totals of one language index."""
    return f'{hashtag_dir}/_hashtag_dict{language_idx}.pickle'


def _load_totals(path):
    """Return the totals pickled at `path`, or {} when the file is missing or empty."""
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return {}
    # pickle.load can execute arbitrary code: only read files written by this notebook.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _aggregate_part(tagged):
    """Aggregate one part file.

    `tagged` holds the raw columns of the tweets that have hashtags. Returns a
    frame with one row per (language_encode, hashtags) and the stat_columns:
    cnt = number of rows, the others = number of rows with a positive timestamp.
    """
    repeats = tagged['hashtags'].str.count('\t').astype(int) + 1

    # one row per (tweet, hashtag)
    columns = {'language': np.repeat(tagged['language'], repeats)}
    for label in conf.labels:
        columns[label] = np.repeat(tagged[label], repeats)
    columns['hashtags'] = chainer(tagged['hashtags'])  # split into rows
    exploded = pd.DataFrame(columns)

    # direct dict indexing on purpose: an unknown language raises KeyError
    exploded['language_encode'] = exploded['language'].apply(lambda x: language_to_idx[x])

    # NaN > 0 is False, so a missing timestamp counts as "no engagement"
    for label in conf.labels:
        exploded[label.split('_')[0]] = (exploded[label] > 0).astype('int64')

    grouped = exploded.groupby(['language_encode', 'hashtags'])
    aggregated = grouped[stat_columns[1:]].sum()
    aggregated.insert(0, 'cnt', grouped.size())
    return aggregated.reset_index()


os.makedirs(hashtag_dir, exist_ok=True)
file_list = sorted(os.listdir(conf.raw_lzo_path))

totals = [
    _load_totals(_hashtag_dict_path(i)) if merge_with_existing else {}
    for i in range(n_languages)
]

for file_name in tqdm(file_list):
    data_path = os.path.join(conf.raw_lzo_path, file_name)
    ori_df = read_data(data_path)
    tagged = ori_df[raw_columns].dropna(subset=['hashtags'])
    print('total exampls with hastags: ', file_name, ' : ', len(tagged))

    if not tagged.empty:
        aggregate_result = _aggregate_part(tagged)

        for i in range(n_languages):
            top = (
                aggregate_result.loc[aggregate_result['language_encode'] == i]
                .sort_values('cnt', ascending=False)
                .head(top_n)
            )
            language_totals = totals[i]
            for tag, *stats in zip(top['hashtags'], *(top[c] for c in stat_columns)):
                if tag in language_totals:
                    running = language_totals[tag]
                    for k, value in enumerate(stats):
                        running[k] += value
                else:
                    language_totals[tag] = list(stats)

        del aggregate_result

    # release the per-file frames before the next file is read
    del tagged

# persist once, after every part file has been merged
for i in range(n_languages):
    with open(_hashtag_dict_path(i), 'wb') as f:
        pickle.dump(totals[i], f, pickle.HIGHEST_PROTOCOL)
    


/it]total exampls with hastags:  part-00094  :  596527
 33%|███▎      | 95/291 [33:12<1:10:34, 21.61s/it]total exampls with hastags:  part-00095  :  596073
 33%|███▎      | 96/291 [33:34<1:10:10, 21.59s/it]total exampls with hastags:  part-00096  :  598176
 33%|███▎      | 97/291 [33:56<1:09:49, 21.60s/it]total exampls with hastags:  part-00097  :  597368
 34%|███▎      | 98/291 [34:17<1:09:25, 21.58s/it]total exampls with hastags:  part-00098  :  596011
 34%|███▍      | 99/291 [34:38<1:08:34, 21.43s/it]total exampls with hastags:  part-00099  :  596563
 34%|███▍      | 100/291 [34:59<1:08:04, 21.38s/it]total exampls with hastags:  part-00100  :  596537
 35%|███▍      | 101/291 [35:21<1:07:24, 21.29s/it]total exampls with hastags:  part-00101  :  595904
 35%|███▌      | 102/291 [35:42<1:06:50, 21.22s/it]total exampls with hastags:  part-00102  :  597013
 35%|███▌      | 103/291 [36:03<1:06:23, 21.19s/it]total exampls with hastags:  part-00103  :  598051
 36%|███▌      | 104/291 [36:24<

In [50]:
# load dictionary
# Read the pickled {hashtag: stats} dictionary of every language index.
# A missing or empty file leaves an empty dict and prints 'no_data'; any other
# error (for example a corrupt pickle) is raised instead of being swallowed.
# NOTE(review): this reads 'hashtag_dict{i}.pickle' while the histogram cell
# writes '_hashtag_dict{i}.pickle' (leading underscore) — confirm the intended name.
# pickle.load can execute arbitrary code: only read files written by this project.
hashtag_dict = [dict() for _ in range(n_languages)]
for i in range(n_languages):
    try:
        with open(f'../data/hashtag/hashtag_dict{i}.pickle', 'rb') as f:
            hashtag_dict[i] = pickle.load(f)
    except (FileNotFoundError, EOFError):
        print('no_data')
    else:
        print(i, len(hashtag_dict[i]))

0 147
1 150
2 216
3 225
4 230
5 181
6 183
7 246
8 154
9 351
10 533
11 227
12 1712
13 340
14 306
15 380
16 499
17 1177
18 552
19 1233
20 1857
21 668
22 638
23 1308
24 2706
25 1218
26 4736
27 5037
28 3710
29 2004
30 2844
31 2923
32 2090
33 5056
34 5169
35 5248
36 5117
37 6274
38 2256
39 6144
40 3346
41 1296
42 5849
43 6131
44 2923
45 6307
46 6125
47 3927
48 5372
49 3743
50 5005
51 3032
52 2375
53 3408
54 2352
55 1715
56 4253
57 1572
58 1695
59 456
60 245
61 107
62 312
63 127
64 41
65 27


In [52]:
# Stats stored for one hashtag of language index 28.
# Presumably [cnt, reply, retweet, comment, like], the order used by the histogram cell — confirm.
hashtag_dict[28]['66C64D2A3D611967328A190606C8DCEF']

[1114, 70, 122, 25, 893]

In [71]:
# First ten stats lists of language index 28, then column 1 of those lists.
first_ten_stats = list(hashtag_dict[28].values())[:10]
print(first_ten_stats)
np.array(first_ten_stats)[:, 1]

[[321, 16, 16, 1, 203], [490, 18, 12, 3, 126], [1114, 70, 122, 25, 893], [350, 37, 21, 9, 69], [145, 3, 3, 0, 33], [24, 0, 0, 0, 24], [13, 0, 0, 0, 1], [56, 1, 0, 0, 27], [28, 2, 0, 0, 11], [11, 0, 0, 0, 1]]


array([16, 18, 70, 37,  3,  0,  0,  1,  2,  0])

In [None]:
# Display the per-language top-hashtag frames.
# NOTE(review): the full-data histogram cell ends each iteration with
# `del language_hashtags`, so this raises NameError when run after it.
language_hashtags

In [72]:
# target encoding for top_n hashtags with language
#
# Flatten the per-language dictionaries into one frame with columns
# language, hashtags, cnt, reply, retweet, comment, like.
# Frames are collected in a list and concatenated once (concatenating inside
# the loop copies the accumulated frame on every iteration), and languages
# without any hashtag are skipped (indexing their empty array with [:, 0]
# raised IndexError).
stat_names = ['cnt', 'reply', 'retweet', 'comment', 'like']
language_frames = []
for i in range(n_languages):
    language_stats = hashtag_dict[i]
    if not language_stats:
        continue
    frame = pd.DataFrame(list(language_stats.values()), columns=stat_names)
    frame.insert(0, 'hashtags', list(language_stats.keys()))
    frame.insert(0, 'language', idx_to_language[i])
    language_frames.append(frame)

if language_frames:
    target_encoder = pd.concat(language_frames, ignore_index=True)
else:
    # nothing loaded: keep the expected schema so later cells still work
    target_encoder = pd.DataFrame(columns=['language', 'hashtags'] + stat_names)

In [73]:
# Display the flattened per-(language, hashtag) stats table.
target_encoder

Unnamed: 0,language,hashtags,cnt,reply,retweet,comment,like
0,488B32D24BD4BB44172EB981C1BCA6FA,43B37225C841C6DB6E7D340EAFBA569C,671805,4086,66190,5211,210659
1,488B32D24BD4BB44172EB981C1BCA6FA,A4C4EE3FDED70EFFF280A3C748368825,544271,9093,37163,4015,101807
2,488B32D24BD4BB44172EB981C1BCA6FA,F41EAA37F8AEF033C6664B393B1060B1,452777,2980,43205,4015,129950
3,488B32D24BD4BB44172EB981C1BCA6FA,D9C7E10458181B70421C50A99AB87B78,404188,11214,21640,2987,135618
4,488B32D24BD4BB44172EB981C1BCA6FA,F47F8AD749C074B81AC14ABBE458EAA6,322776,4909,16528,2315,178016
...,...,...,...,...,...,...,...
147781,8C64085F46CD49FA5C80E72A35845185,3CE892BA90B28C0BA6D7BEBC934C26CD,1,0,0,0,1
147782,8C64085F46CD49FA5C80E72A35845185,CBB1D5E37E5E8F4F3EA7A840484C5C64,1,0,0,0,1
147783,8C64085F46CD49FA5C80E72A35845185,603B6445385DB6098E70886DEBC9FFB1,1,0,0,0,0
147784,8C64085F46CD49FA5C80E72A35845185,6668685BAC46105378AC3B272C4E6977,1,0,0,0,0


In [None]:
# Preview the first ten rows of the stats table.
target_encoder.head(10)

In [None]:
# Create the target encoder (MTE_one_shot from utils.target_encode) with 5 folds and smoothing 20.
encoder = MTE_one_shot(folds=5,smooth=20)

In [None]:
# target encoding
# Fit the target encoder on the first part file only: the (language, hashtags)
# pair is encoded against the 'like' target into a float32 column.
for file_name in tqdm(file_list[:1]):
    data_path = conf.raw_lzo_path + file_name
    df = read_data(data_path).drop(columns='text_tokens')
    df = feature_extraction(df, features=conf.used_features + ['hashtags'], train=True)

    c = ['language', 'hashtags']
    target = 'like'
    out_col = f"TE_{'_'.join(c)}_{target}"

    df = encoder.fit_transform(df, c, target, out_col=out_col, out_dtype='float32')



In [None]:
# Preview the first 30 rows of the target-encoded frame.
df.head(30)

In [None]:
# Reload the first part file and count, for every (language, hashtags) group,
# the non-null values of each remaining column.
data_path = conf.raw_lzo_path + 'part-00000'

df = read_data(data_path)
df = df.groupby(['language', 'hashtags']).count()

In [None]:
# Move the current index levels back into ordinary columns.
df = df.reset_index()