In [2]:
import sys
sys.path.append('..')

import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

from utils.cuda_cluster import *
from utils.dataset import read_data, factorize_small_cardinality
from utils.util import get_day_phase, get_hours_from_midnight_of_current_day

import core.config as conf

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36763 instead
  http_address["port"], self.http_server.port


In [3]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:41465  Dashboard: http://127.0.0.1:36763/status,Cluster  Workers: 1  Cores: 1  Memory: 33.47 GB


## 1. Load data

In [4]:
# data_path = conf.raw_data_path + '*' # for all dadtaset
data_path = conf.raw_data_path + 'part-00175'
ori_df = read_data(data_path)

number of rows: 3033347


In [5]:
total_users = dask_cudf.concat([ori_df["engaged_with_user_id"], ori_df["enaging_user_id"]]).unique().compute()
ori_df, _ = factorize_small_cardinality(ori_df, 'tweet_id')
ori_df, _ = factorize_small_cardinality(ori_df, 'engaged_with_user_id', total_users, True)
ori_df, _ = factorize_small_cardinality(ori_df, 'enaging_user_id', total_users, True)
    

In [6]:
df = ori_df[["tweet_id_encode", "engaged_with_user_id_encode", "enaging_user_id_encode"]]
df['tweet_id_encode'] = df['tweet_id_encode'].astype(np.int32)
df['engaged_with_user_id_encode'] = df['engaged_with_user_id_encode'].astype(np.int32)
df['enaging_user_id_encode'] = df['enaging_user_id_encode'].astype(np.int32)

In [7]:
df.head()

Unnamed: 0_level_0,tweet_id_encode,engaged_with_user_id_encode,enaging_user_id_encode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2214622,2586120,893844
2,2136815,3355961,3173376
3,1986297,1233947,951213
4,35685,3063535,2013995
5,1465500,2280964,3018294


In [8]:
df

Unnamed: 0_level_0,tweet_id_encode,engaged_with_user_id_encode,enaging_user_id_encode
npartitions=16,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,int32,int32,int32
273141,...,...,...
...,...,...,...
2983130,...,...,...
3033347,...,...,...


## 2. Number of unique users, creators and engagers


In [9]:
unique_creators = df['engaged_with_user_id_encode'].unique().compute().to_array()
unique_engagers = df['enaging_user_id_encode'].unique().compute().to_array()
print(unique_creators)
print(unique_engagers)

[      4      10      14 ... 3511081 3511084 3511089]
[      0       1       2 ... 3511086 3511087 3511088]


In [10]:
unique_users = np.unique(np.append(unique_creators, unique_engagers))
print(unique_users)

[      0       1       2 ... 3511087 3511088 3511089]


## 3. Check the number of users that are both creators and engagers in our train set


In [11]:
unique_creator_engager = np.array(list(set(unique_creators) & set(unique_engagers)))
print(unique_creator_engager)

[2621442 1572869 2621449 ... 1572853  524279 2097148]


In [12]:
print(f"Unique creators are: {len(unique_creators):n}")
print(f"Unique engagers are: {len(unique_engagers):n}")
print(f"Unique users are: {len(unique_users):n}")
print(f"Unique users that are both engager and creator: {len(unique_creator_engager):n}")

Unique creators are: 1558866
Unique engagers are: 2175376
Unique users are: 3511090
Unique users that are both engager and creator: 223152


## 4. Count the number of engagements per engager


In [24]:
cols = [
    'reply_timestamp', 
    'retweet_timestamp',
    'retweet_with_comment_timestamp', 
    'like_timestamp'
]
df[cols] = ori_df[cols]

In [25]:
df['is_reply'] = df['reply_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_retweet'] = df['retweet_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_comment'] = df['retweet_with_comment_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)
df['is_like'] = df['like_timestamp'].compute().applymap(lambda x: 1 if x > 0 else 0).astype(np.int32)

df['is_positive'] = df[['is_like', 'is_retweet', 'is_reply', 'is_comment']].sum(axis=1).astype(np.uint8)
df['is_negative'] =  df['is_positive'].compute().applymap(lambda x: 1 if x == 0 else 0).astype(np.int32)

df = df.drop('reply_timestamp', axis=1)
df = df.drop('retweet_timestamp', axis=1)
df = df.drop('retweet_with_comment_timestamp', axis=1)
df = df.drop('like_timestamp', axis=1)


In [29]:
aggregate_result = df.groupby('enaging_user_id_encode').sum().compute()

## 5. How many likes, reply, comment and retweet does an engager have?


In [30]:
aggregate_result

Unnamed: 0_level_0,tweet_id_encode,engaged_with_user_id_encode,is_reply,is_retweet,is_comment,is_like,is_positive,is_negative
enaging_user_id_encode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
698016,198979,1153486,0,0,0,0,0,1
1989231,1353721,2574002,0,0,0,0,0,1
45523,511863,2491617,0,0,0,0,0,1
851357,638951,2296516,0,0,0,1,1,0
3038852,6478313,6880702,0,0,0,2,2,1
...,...,...,...,...,...,...,...,...
895030,1727997,2876389,0,0,0,1,1,0
3308801,2422492,3101224,0,2,0,0,2,0
607002,2802821,1911860,0,0,0,2,2,0
1628293,2650369,7236833,0,0,0,1,1,2
