In [44]:
import io
import json
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
from dotenv import load_dotenv

from ReadTweetsFromJson import get_tweets_from_json_file

### Tweets from "Analysis" tab

In [29]:
# Let python-dotenv populate the process environment before it is read below.
load_dotenv()

# Name of the spreadsheet tab to load.
SHEET_NAME = 'Analysis'
# Spreadsheet key taken from the environment; None when the variable is unset.
SPREADSHEET_KEY = os.environ.get('SPREAD_SHEET_KEY')

In [7]:
def get_data_from_spreadsheet(key: str, sheet_name: str) -> pd.DataFrame:
    '''Download one tab of a Google spreadsheet as a DataFrame.

    The tab is requested through the spreadsheet's `gviz` CSV export URL and
    parsed with `pd.read_csv`.

    Args:
        key: Spreadsheet key that is placed after `/d/` in the URL.
        sheet_name: Name of the tab to export.

    Returns:
        The tab's contents as parsed by `pd.read_csv`.

    Raises:
        ValueError: If `key` is empty or None, e.g. because the environment
            variable that should hold it is not set.
    '''
    if not key:
        # Without this check a None key would silently become '/d/None/...'.
        raise ValueError('Spreadsheet key is missing or empty.')
    # Percent-encode the tab name so spaces, '&', '#' and similar characters
    # cannot break or truncate the query string.
    encoded_sheet_name = quote(str(sheet_name), safe='')
    csv_url = (f'https://docs.google.com/spreadsheets/d/{key}'
               f'/gviz/tq?tqx=out:csv&sheet={encoded_sheet_name}')
    return pd.read_csv(csv_url)

In [8]:
# Load the configured tab of the spreadsheet and preview its first rows.
tweets_df = get_data_from_spreadsheet(key=SPREADSHEET_KEY, sheet_name=SHEET_NAME)
tweets_df.head()

Unnamed: 0,Tweet ID,Time,Tweet,Use of Masks,Implementation,Political concerns,Notes,Unnamed: 7,Unnamed: 8,Key,...,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,1263463831554195456,Thu May 21 13:37:06 +0000 2020,Minister Aceng: You will be denied entry in a ...,0,0,0.0,,,,Use of Masks,...,,,,,,,,,,
1,1263178258700546048,Wed May 20 18:42:20 +0000 2020,@JaneRuth_Aceng Who supplied you with that Dee...,0,0,0.0,,,,Implementation,...,,,,,,,,,,
2,1263091068406546432,Wed May 20 12:55:52 +0000 2020,"Uganda will manufacture 800,000 masks in a day...",0,0,0.0,,,,Political concerns,...,,,,,,,,,,
3,1262815097447157762,Tue May 19 18:39:15 +0000 2020,M7 removed his mask to sip his chai in peace 😂...,0,0,0.0,,,,,...,,,,,,,,,,
4,1262812714860249091,Tue May 19 18:29:47 +0000 2020,How we need a week to organise to drive in pri...,1,1,0.0,,,,,...,,,,,,,,,,


In [27]:
# Columns to keep from the sheet. 'Implementation ' is spelled with a trailing
# space here and where new rows are built later -- presumably matching the
# sheet header; confirm before "fixing" it.
cols = ['Tweet ID', 'Time', 'Tweet', 'Use of Masks', 'Implementation ', 'Political concerns', 'Notes']
# .copy() makes the selection an independent frame, so the column assignment
# below does not write into a slice of the original (SettingWithCopyWarning).
tweets_df = tweets_df[cols].copy()
# Missing labels are read as NaN; store them as 0 and keep the column integer-typed.
tweets_df['Political concerns'] = tweets_df['Political concerns'].fillna(0).astype(np.int64)
tweets_df.head()

Unnamed: 0,Tweet ID,Time,Tweet,Use of Masks,Implementation,Political concerns,Notes
0,1263463831554195456,Thu May 21 13:37:06 +0000 2020,Minister Aceng: You will be denied entry in a ...,0,0,0,
1,1263178258700546048,Wed May 20 18:42:20 +0000 2020,@JaneRuth_Aceng Who supplied you with that Dee...,0,0,0,
2,1263091068406546432,Wed May 20 12:55:52 +0000 2020,"Uganda will manufacture 800,000 masks in a day...",0,0,0,
3,1262815097447157762,Tue May 19 18:39:15 +0000 2020,M7 removed his mask to sip his chai in peace 😂...,0,0,0,
4,1262812714860249091,Tue May 19 18:29:47 +0000 2020,How we need a week to organise to drive in pri...,1,1,0,


### Tweets from json files

In [33]:
def is_interesting(tweet: dict) -> bool:
    '''Applies simple filtering criteria to a tweet.

    A tweet is interesting when it has non-empty text ('full_text', falling
    back to 'text'), is not a retweet (no 'retweeted_status' key) and lists
    no URLs under tweet['entities']['urls'].

    Args:
        tweet: Tweet dict. 'entities' and 'urls' may be missing or None, in
            which case the tweet is treated as containing no URLs.

    Returns:
        True if the tweet passes all criteria, False otherwise.
    '''
    text = tweet.get('full_text') or tweet.get('text')
    if not text:
        return False
    if 'retweeted_status' in tweet:
        return False
    # Fall back to empty containers so a tweet without entity data does not
    # raise KeyError/TypeError.
    entities = tweet.get('entities') or {}
    urls = entities.get('urls') or []
    return len(urls) == 0

def get_interesting_tweets(tweets):
    '''Return a list, in input order, of the tweets accepted by `is_interesting`.'''
    return [tweet for tweet in tweets if is_interesting(tweet)]

Fetch data from the JSON files and find out how many unique tweets there are (both by ID and by text).

In [63]:
# Read both JSON dumps into a single list.
json_tweets = get_tweets_from_json_file('mask_tweetsv3.json')
json_tweets += get_tweets_from_json_file('mask_tweets_v4.json')
print("Total number of tweets: ", len(json_tweets))

# Keep only the interesting tweets.
json_tweets = get_interesting_tweets(json_tweets)
print("Number of interesting tweets: ", len(json_tweets))

# Count distinct ids and distinct texts; a gap between the two reveals
# tweets that share a text under different ids.
unique_ids = {tweet['id_str'] for tweet in json_tweets}
unique_text = {tweet.get('full_text') or tweet.get('text') for tweet in json_tweets}

print("Unique tweets by id: ", len(unique_ids))
print("Unique tweets by text: ", len(unique_text))

Total number of tweets:  4047
Number of interesting tweets:  1371
Unique tweets by id:  1371
Unique tweets by text:  1361


Which tweets have the same text but different IDs?

In [93]:
# Group a short summary of every tweet under its text; a text shared by
# several tweets ends up with more than one entry in its list.
text_dict = {}
for tweet in json_tweets:
    text = tweet.get('text') or tweet.get('full_text')
    same_text_entries = text_dict.setdefault(text, [])
    same_text_entries.append({
        'id': tweet['id_str'],
        'text': text,
        'time': tweet['created_at'],
        'user': tweet['user']['screen_name'],
    })

# Uncomment the following lines to see the types of tweets that have similar text.

# for text in text_dict.keys():
#     if len(text_dict[text]) > 1:
#         print("\n{}\n".format(json.dumps(text_dict[text], indent=3)))

From the above, it looks like the tweets with the same text were posted at different times and sometimes by different users.

### Combining data from 'Analysis' tab with that from the json files

First, filter out tweets with the same text, and those already analyzed.

In [67]:
# Report the size of json_tweets before the two filters defined below are applied.
print("Before filtering: ", len(json_tweets))

# Filter out tweets with same text.
def remove_duplicate_text_tweets(tweets):
    '''Return the tweets with repeated texts dropped, keeping the first of each.

    A tweet's text is its 'text' value, or its 'full_text' value when 'text'
    is missing or falsy. The input order is preserved.
    '''
    seen_texts = set()
    deduplicated = []
    for candidate in tweets:
        body = candidate.get('text') or candidate.get('full_text')
        if body in seen_texts:
            # A tweet with this text was already kept.
            continue
        seen_texts.add(body)
        deduplicated.append(candidate)
    return deduplicated

# Remove tweets already in the tweets_df
def remove_already_analyzed_tweets(tweets, analyzed_df=None):
    '''Drop tweets whose text or id already appears in the analysed frame.

    Args:
        tweets: Iterable of tweet dicts. Each needs an 'id_str' key; its text
            is read from 'text', falling back to 'full_text'.
        analyzed_df: DataFrame with 'Tweet' and 'Tweet ID' columns. Defaults
            to the notebook-level `tweets_df`, so existing one-argument calls
            keep working.

    Returns:
        A list, in input order, of the tweets whose text is not in the
        'Tweet' column and whose id is not in the 'Tweet ID' column.
    '''
    if analyzed_df is None:
        analyzed_df = tweets_df
    # Build both lookup sets straight from the columns instead of iterating
    # over the frame row by row.
    already_text = set(analyzed_df['Tweet'])
    # Ids are compared as strings because tweets carry them in 'id_str'.
    # NOTE(review): if 'Tweet ID' was parsed as float, these strings will not
    # match 'id_str' and only the text comparison is effective -- confirm the
    # column dtype.
    already_id = set(analyzed_df['Tweet ID'].map(str))
    return [
        tweet for tweet in tweets
        if (tweet.get('text') or tweet.get('full_text')) not in already_text
        and tweet['id_str'] not in already_id
    ]

# Drop repeated texts first, then anything already present in tweets_df.
# NOTE: json_tweets is rebound here, so running this cell again starts from
# the already-filtered list and the "Before filtering" count changes.
json_tweets = remove_already_analyzed_tweets(
    remove_duplicate_text_tweets(json_tweets)
)
print("After filtering: ", len(json_tweets))

Before filtering:  1361
After filtering:  994


Add the remaining tweets to the data frame.

In [86]:
# Turn every remaining JSON tweet into a row with empty label columns. The
# keys repeat the column names selected from the sheet, including
# 'Implementation ' with its trailing space.
new_rows = [
    {
        'Tweet ID': str(tweet['id_str']),
        'Time': tweet['created_at'],
        'Tweet': tweet.get('text') or tweet.get('full_text'),
        'Use of Masks': "",
        'Implementation ': "",
        'Political concerns': "",
        'Notes': "",
    }
    for tweet in json_tweets
]

# Concatenate once. DataFrame.append was deprecated in pandas 1.4 and removed
# in 2.0, and calling it per row copies the whole frame on every iteration.
frames = [tweets_df]
if new_rows:
    frames.append(pd.DataFrame(new_rows))
combined_tweets_df = pd.concat(frames, ignore_index=True)
# NOTE: 'Tweet ID' keeps the sheet's values as loaded, while the new rows
# store ids as strings, so the column can hold mixed types.

In [89]:
# Display the combined frame: rows from tweets_df first, then the rows built from json_tweets.
combined_tweets_df

Unnamed: 0,Tweet ID,Time,Tweet,Use of Masks,Implementation,Political concerns,Notes
0,1263463831554195456,Thu May 21 13:37:06 +0000 2020,Minister Aceng: You will be denied entry in a ...,0,0,0,
1,1263178258700546048,Wed May 20 18:42:20 +0000 2020,@JaneRuth_Aceng Who supplied you with that Dee...,0,0,0,
2,1263091068406546432,Wed May 20 12:55:52 +0000 2020,"Uganda will manufacture 800,000 masks in a day...",0,0,0,
3,1262815097447157762,Tue May 19 18:39:15 +0000 2020,M7 removed his mask to sip his chai in peace 😂...,0,0,0,
4,1262812714860249091,Tue May 19 18:29:47 +0000 2020,How we need a week to organise to drive in pri...,1,1,0,
...,...,...,...,...,...,...,...
1383,1265266482797215746,Tue May 26 13:00:11 +0000 2020,"“No mask, No Entry” at Kabale Regional Referra...",,,,
1384,1265263818898313221,Tue May 26 12:49:36 +0000 2020,Face Masks - The Don’ts \n\n-Do NOT pick masks...,,,,
1385,1265262140149153793,Tue May 26 12:42:56 +0000 2020,"Stay healthy, stay positive and keep going but...",,,,
1386,1265244250880557056,Tue May 26 11:31:51 +0000 2020,@MinofHealthUG issued guidelines for the use o...,,,,


In [90]:
# Per-column summary statistics of the combined frame.
combined_tweets_df.describe()

Unnamed: 0,Tweet ID,Time,Tweet,Use of Masks,Implementation,Political concerns,Notes
count,1387,1387,1388,1388.0,1388.0,1388.0,1043.0
unique,1387,1282,1384,4.0,4.0,4.0,4.0
top,1262327690486579200,Mon May 18 19:14:21 +0000 2020,"Guys,they said the masks will be READY in 14 d...",,,,
freq,1,4,2,994.0,994.0,994.0,994.0


Create the CSV file

In [91]:
# Write the combined frame to 'combined.csv' without the index column.
combined_tweets_df.to_csv('combined.csv', index=False)