# Analysis - Hashtag

In [1]:
import sys
sys.path.append('..')

import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse as sps
import pandas as pd


from utils.cuda_cluster import *
from utils.dataset import read_data, factorize_small_cardinality

import core.config as conf


In [2]:
# Display the Dask client summary (scheduler address, dashboard URL, workers).
# NOTE(review): `client` is not defined in this notebook; presumably it is
# provided by the `from utils.cuda_cluster import *` star import — confirm.
client

0,1
Client  Scheduler: tcp://127.0.0.1:41421  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 33.47 GB


## 1. Load data

In [3]:
# Load a single raw partition of the dataset.
# To load every partition instead, use: data_path = conf.raw_data_path + '*'
data_path = conf.raw_data_path + 'part-00175'
ori_df = read_data(data_path)

number of rows: 3033347


In [4]:
# List the raw column names. Some are spelled unusually in the loaded frame
# ('text_ tokens' contains a space, and the engaging-user columns use the
# prefix 'enaging_'), so later cells must use these exact spellings.
ori_df.columns

Index(['text_ tokens', 'hashtags', 'tweet_id', 'present_media',
       'present_links', 'present_domains', 'tweet_type', 'language',
       'tweet_timestamp', 'engaged_with_user_id',
       'engaged_with_user_follower_count', 'engaged_with_user_following_count',
       'engaged_with_user_is_verified', 'engaged_with_user_account_creation',
       'enaging_user_id', 'enaging_user_follower_count',
       'enaging_user_following_count', 'enaging_user_is_verified',
       'enaging_user_account_creation', 'engagee_follows_engager',
       'reply_timestamp', 'retweet_timestamp',
       'retweet_with_comment_timestamp', 'like_timestamp'],
      dtype='object')

In [5]:
# Keep only the user ids, the follower/following counts of both users, and the
# four engagement timestamps that the labels are derived from.
df = ori_df[[
    'engaged_with_user_id',
    'engaged_with_user_follower_count',
    'engaged_with_user_following_count',
    'enaging_user_id',
    'enaging_user_follower_count',
    'enaging_user_following_count',
    'reply_timestamp',
    'retweet_timestamp',
    'retweet_with_comment_timestamp',
    'like_timestamp',
]]

In [6]:
# Build binary engagement labels from the engagement timestamps: a flag is 1
# when the matching timestamp is greater than 0, and 0 otherwise.
#
# The comparison is vectorized and stays lazy on the Dask graph. The earlier
# version called `.compute()` on every timestamp column and ran a Python UDF
# through `applymap`, which materialised each full column on one GPU and then
# relied on index alignment to assign the result back into the Dask frame.
#
# NOTE(review): assumes missing timestamps are represented so that `> 0` is
# false (e.g. filled with 0 by `read_data`) — confirm against `read_data`.
timestamp_by_flag = {
    'is_reply': 'reply_timestamp',
    'is_retweet': 'retweet_timestamp',
    'is_comment': 'retweet_with_comment_timestamp',
    'is_like': 'like_timestamp',
}
for flag_col, timestamp_col in timestamp_by_flag.items():
    df[flag_col] = (df[timestamp_col] > 0).astype(np.int32)

# A row is positive when at least one engagement happened, negative otherwise.
df['is_positive'] = df['is_reply'] | df['is_retweet'] | df['is_comment'] | df['is_like']
df['is_negative'] = (df['is_positive'] == 0).astype(np.int32)

# The raw timestamps are no longer needed once the flags exist.
df = df.drop(list(timestamp_by_flag.values()), axis=1)

In [13]:
# Collect every distinct user id that appears as either the engaged-with user
# or the engaging user, then encode both id columns against that shared list.
# NOTE(review): `factorize_small_cardinality` is defined in utils.dataset; the
# next cell renames '<column>_encode' columns, so it presumably adds those —
# the meaning of the trailing `True` argument is not visible here, confirm.
user_id_columns = ['engaged_with_user_id', 'enaging_user_id']

total_users = (
    dask_cudf.concat([ori_df[id_col] for id_col in user_id_columns])
    .unique()
    .compute()
)

for id_col in user_id_columns:
    df, _ = factorize_small_cardinality(df, id_col, total_users, True)

total_users

0          000000B946F21610D8169031348D6748
1          0000030E0DCCFDF9DBF2DDC031E6DA58
2          0000059852AB4CDFB4C417C550780C7D
3          000005BCF00DCCEABCF7F82BDCFB3543
4          0000079F3828E08D0A604E3E0D78C91F
                         ...               
3511085    FFFFF3D18C31109B40C431CE050BEE2E
3511086    FFFFF50BA9D4FF3225576C8996BE8BAC
3511087    FFFFF7DE6B3CB775180049CB3CD337BA
3511088    FFFFF8D0A1E9C84BD356965A0C74ADB8
3511089    FFFFFA8F28C6657BA57B88D608FA2298
Length: 3511090, dtype: object

In [16]:
# Swap the raw user-id columns for their encoded counterparts and give every
# remaining column a creator/engager-oriented name.
# NOTE(review): 'careator_id' looks like a misspelling of 'creator_id'; it is
# kept unchanged because other cells may reference this exact column name.
column_renames = {
    'engaged_with_user_id_encode': 'careator_id',
    'enaging_user_id_encode': 'engager_id',
    'engaged_with_user_follower_count': 'creator_follower_count',
    'engaged_with_user_following_count': 'creator_following_count',
    'enaging_user_follower_count': 'engager_follower_count',
    'enaging_user_following_count': 'engager_following_count',
}
df = df.rename(columns=column_renames)

# The original (un-encoded) id columns are dropped.
df = df.drop(['engaged_with_user_id', 'enaging_user_id'], axis=1)

In [19]:
# Preview the first rows of the prepared frame: follower/following counts,
# the binary engagement flags, and the encoded creator/engager ids.
df.head()

Unnamed: 0_level_0,creator_follower_count,creator_following_count,engager_follower_count,engager_following_count,is_reply,is_retweet,is_comment,is_like,is_positive,is_negative,careator_id,engager_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,4753,1060,76,189,0,0,0,1,1,0,2586120,893844
2,110643,180,260,379,0,0,0,0,0,1,3355961,3173376
3,4480,2953,1259,868,1,0,0,0,1,0,1233947,951213
4,461,157,437,597,0,0,0,1,1,0,3063535,2013995
5,1308,1478,247,404,0,0,0,0,0,1,2280964,3018294
