In [10]:
import os
import sys
import collections
import pandas as pd
import pmdarima as pm 
import matplotlib.pyplot as plt

In [11]:
# Extend the import search path so the `utils` package imported below can be found.
# NOTE(review): this is an absolute, machine-specific Windows path; on any other
# machine the `utils` imports will fail unless this line is adjusted.
sys.path.append(r'D:\OneDrive\Programming\documents_python\NUS Courses\DBA5106\Course-DBA5106\group_project\gp3')
from utils.logger import logger
from utils.config import RAW_DATA_DIR, PROCESSED_DATA_DIR, FIGURE_DIR
from utils.data_porter import save_to_csv, read_from_csv

In [12]:
def get_file_path(dataset):
    """Return the users CSV name and the tweets CSV name(s) for one dataset.

    Args:
        dataset: Dataset identifier, e.g. ``'cuba_082020'``.

    Returns:
        A ``(users_path, tweets_path_lst)`` tuple: the users file name and a
        list of tweets file names. Most datasets have a single tweets file,
        ``russia_052020`` has two numbered parts, and ``serbia_022020`` has 13
        zero-padded parts located inside a ``<dataset>_tweets_csv_hashed``
        sub-directory.

    Raises:
        ValueError: If ``dataset`` is not one of the known datasets.
    """
    import os  # local import keeps this cell runnable on its own

    single_file_datasets = (
        'cuba_082020', 'china_052020', 'egypt_022020',
        'indonesia_022020', 'thailand_092020',
    )

    print(f'\n\nRead from dataset: {dataset}...')
    users_path = f'{dataset}_users_csv_hashed.csv'
    tweets_stem = f'{dataset}_tweets_csv_hashed'

    # Branch on the argument itself, not on a notebook-level global, so the
    # result depends only on what the caller passed in.
    if dataset in single_file_datasets:
        tweets_path_lst = [f'{tweets_stem}.csv']
    elif dataset == 'russia_052020':
        tweets_path_lst = [f'{tweets_stem}_{part}.csv' for part in (1, 2)]
    elif dataset == 'serbia_022020':
        # Parts are numbered 01..13 and live in a folder named after the stem.
        tweets_path_lst = [
            os.path.join(tweets_stem, f'{tweets_stem}_{part:02d}.csv')
            for part in range(1, 14)
        ]
    else:
        raise ValueError(f'Unknown dataset: {dataset!r}')
    return users_path, tweets_path_lst

In [13]:
def get_data(users_path, tweets_path):
    """Load one users file and one tweets file, keeping only English tweets.

    Both files are read through ``read_from_csv`` with ``RAW_DATA_DIR``; the
    tweets file is read with ``low_memory=False``. Row counts are printed
    along the way.

    Args:
        users_path: File name of the users CSV.
        tweets_path: File name of the tweets CSV.

    Returns:
        ``(users, english_tweets)`` where ``english_tweets`` holds the rows
        whose ``tweet_language`` equals ``'en'``.
    """
    print(f'\n\nUsers: {users_path}')
    print(f'Tweets: {tweets_path}')

    users_df = read_from_csv(users_path, RAW_DATA_DIR)
    all_tweets = read_from_csv(tweets_path, RAW_DATA_DIR, low_memory=False)
    print(f'Number of Users: {len(users_df)}')
    print(f'Number of Tweets: {len(all_tweets)}')

    # Row mask selecting tweets tagged as English.
    is_english = all_tweets['tweet_language'] == 'en'
    english_tweets = all_tweets.loc[is_english]
    print(f'Number of English Tweets: {len(english_tweets)}')

    return users_df, english_tweets

In [14]:
def prepare_dataset(users_path, tweets_path_lst):
    """Build one row per user with account fields and aggregated tweet data.

    The English tweets returned by ``get_data`` are grouped by ``userid``:
    tweet texts are joined with single spaces into ``tweet_text`` and the
    non-null ``tweetid`` values are counted into ``tweet_num``. The result is
    inner-joined with the users table on ``userid``.

    Args:
        users_path: Users CSV file name, forwarded to ``get_data``.
        tweets_path_lst: Tweets CSV reference, forwarded unchanged to
            ``get_data``. Despite the name, ``get_data`` treats it as one
            file reference.

    Returns:
        A DataFrame with the columns ``follower_count``, ``following_count``,
        ``account_creation_date``, ``tweet_num``, ``tweet_text`` and a
        constant ``state_back`` column set to 1.
    """
    tmp_users, tmp_tweets = get_data(users_path, tweets_path_lst)

    # One pass over the groups yields both the joined text and the tweet count.
    per_user = (
        tmp_tweets.groupby('userid')
        .agg(
            tweet_text=('tweet_text', ' '.join),
            tweet_num=('tweetid', 'count'),
        )
        .reset_index()
    )

    user_tweets = per_user.merge(tmp_users, on='userid')

    useful_lst = ['follower_count', 'following_count', 'account_creation_date', 'tweet_num', 'tweet_text']
    # assign() returns a new frame, so no column is written onto a
    # column-subset of another frame (avoids SettingWithCopyWarning).
    return user_tweets[useful_lst].assign(state_back=1)

In [15]:
# Build the combined per-user table across every dataset listed below.
dataset_lst = ['cuba_082020', 'china_052020', 'egypt_022020', 'indonesia_022020', 'thailand_092020', 'russia_052020', 'serbia_022020']
# dataset_lst = ['cuba_082020', 'china_052020']
user_tweets_lst = []
# NOTE: the loop variables below are module-level names and stay defined
# (holding their last values) after this cell has run.
for dataset_name in dataset_lst:
    users_path, tweets_path_lst = get_file_path(dataset_name)
    # Each tweets file is processed on its own, so a dataset split over several
    # files contributes one frame per file.
    # NOTE(review): a user whose tweets span several files therefore gets one
    # row per file, each with a partial tweet_num - confirm this is intended.
    for tweets_path in tweets_path_lst:
        user_tweets = prepare_dataset(users_path, tweets_path)
        user_tweets_lst.append(user_tweets)
# concat keeps each frame's own index, so index labels repeat in the result.
user_tweets_final = pd.concat(user_tweets_lst)



Read from dataset: cuba_082020...


Users: cuba_082020_users_csv_hashed.csv
Tweets: cuba_082020_tweets_csv_hashed.csv
Number of Users: 526
Number of Tweets: 4802243
Number of English Tweets: 157831


Read from dataset: china_052020...


Users: china_052020_users_csv_hashed.csv
Tweets: china_052020_tweets_csv_hashed.csv
Number of Users: 23750
Number of Tweets: 348608
Number of English Tweets: 32926


Read from dataset: egypt_022020...


Users: egypt_022020_users_csv_hashed.csv
Tweets: egypt_022020_tweets_csv_hashed.csv
Number of Users: 2541
Number of Tweets: 7935329
Number of English Tweets: 246502


Read from dataset: indonesia_022020...


Users: indonesia_022020_users_csv_hashed.csv
Tweets: indonesia_022020_tweets_csv_hashed.csv
Number of Users: 795
Number of Tweets: 2700296
Number of English Tweets: 555997


Read from dataset: thailand_092020...


Users: thailand_092020_users_csv_hashed.csv
Tweets: thailand_092020_tweets_csv_hashed.csv
Number of Users: 926
Number of Tweets: 21385
N

In [16]:
# Hand the combined frame to the project's save helper with the target file
# name and PROCESSED_DATA_DIR.
# NOTE(review): presumably writes <PROCESSED_DATA_DIR>/user_tweets_final.csv -
# confirm against utils.data_porter.save_to_csv.
save_to_csv(user_tweets_final, 'user_tweets_final.csv', PROCESSED_DATA_DIR)

In [20]:
# Keep only the rows whose tweet_num is at least 10.
user_tweets_final_filter = user_tweets_final[user_tweets_final['tweet_num'].ge(10)]

In [21]:
# Bare expression: the notebook renders the filtered frame as this cell's output.
user_tweets_final_filter

Unnamed: 0,follower_count,following_count,account_creation_date,tweet_num,tweet_text,state_back
0,379,383,2020-01-31,45,"RT @Amelia12610461: #February5, The Cuban cycl...",1
2,535,1144,2019-01-28,9048,@soniagupta504 Don`t worry everithing will be ...,1
3,827,740,2020-01-25,165,RT @DiazCanelB: State terrorism openly declare...,1
5,1434,1307,2019-04-11,374,RT @RafelitoRojo: #USA Among the world's most ...,1
8,9788,3835,2019-02-22,711,EEUU IN 16 YEARS HAS STEALED #VENEZUELA MORE T...,1
...,...,...,...,...,...,...
813,1145,1958,2016-02-03,44,Whitney Houston - I will Always Love You (Laur...,1
814,4274,3998,2015-09-15,845,RT @avucic: Deeply saddened by the terrible ne...,1
815,2169,2362,2016-02-22,1372,RT @MarionSpekker: @huskorkut \nGood morning \...,1
816,970,1097,2015-08-04,96,"RT @avucic: With @PHammondMP, PM Djukanovic, @...",1
