## Filter Twitter data to add new tweets.

This notebook refers to a Google sheet with a list of tweets, and compares with a dump of tweets in json format to find which should be added.



In [1]:
import io
import json
import pandas as pd
from typing import List
from ReadTweetsFromJson import get_tweets_from_json_file

Details of the sheet used for annotating tweets.

In [2]:
KEY = '1qKYUxq_UrUFJy45r5KGuXp6gq-2ZF_VDb6-e7tWagwA'
SHEET_NAME = 'Analysis'

Helper functions for reading the existing tweets in the sheet, and filtering new ones that have not yet been included.

In [3]:
def get_already_processed(key: str, sheet_name: str) -> set:
    '''Returns the IDs of tweets that are already in the analysis.'''
    csv_url = (f'https://docs.google.com/spreadsheets/d/{key}'
               f'/gviz/tq?tqx=out:csv&sheet={sheet_name}')
    df = pd.read_csv(csv_url)
    already_processed = set(df['Tweet ID'])
    return already_processed

def is_interesting(tweet: dict) -> bool:
    '''Applies simple filtering criteria to a tweet.'''
    text = tweet.get('full_text') or tweet.get('text')
    if not text:
        return False
    is_retweet = 'retweeted_status' in tweet
    contains_url = len(tweet['entities']['urls']) > 0
    return not (is_retweet or contains_url)
    
def get_new_tweets(tweets, already_processed) -> pd.DataFrame:
    '''Returns the tweets which are interesting and not yet included.'''
    new_tweets = pd.DataFrame()
    for tweet in tweets:
        text = tweet.get('full_text') or tweet.get('text')
        if is_interesting(tweet) and tweet['id'] not in already_processed:
            new_tweets = new_tweets.append(
                {'text': text,
                 'id': tweet['id'],
                 'time': tweet['created_at']},
                ignore_index=True)
            already_processed.add(tweet['id'])
    new_tweets.id = new_tweets.id.astype(int)
    return new_tweets
            
def get_excluded_tweets(tweets: List[dict]) -> pd.DataFrame:
    '''Returns all tweets that were excluded.'''
    excluded_tweets = pd.DataFrame()
    for tweet in tweets:
        if not is_interesting(tweet):
            excluded_tweets = excluded_tweets.append(
                {'text': tweet.get('full_text') or tweet.get('text'),
                 'id': int(tweet['id']),
                 'time': tweet['created_at']},
                ignore_index=True)  
    excluded_tweets.id = excluded_tweets.id.astype(int)
    return excluded_tweets

Process a dump of tweets and save files with the new tweets to be added, and those which were discarded as not interesting.

In [4]:
JSON_PATH = 'mask_tweets_v3.json'

tweets = get_tweets_from_json_file(JSON_PATH)
already_processed = get_already_processed(KEY, SHEET_NAME)
new_tweets = get_new_tweets(tweets, already_processed)
excluded_tweets = get_excluded_tweets(tweets)
new_tweets.to_csv('new.csv')
excluded_tweets.to_csv('excluded.csv')

In [5]:
new_tweets

Unnamed: 0,id,text,time
0,1264604378926854144,If we continue wearing masks for the next 5 y...,Sun May 24 17:09:13 +0000 2020
1,1264596330594406400,So Am Not Waiting For The Government Masks.\n\...,Sun May 24 16:37:14 +0000 2020
2,1264496902122414080,Proverbs 4:9 She shall give to your head a wre...,Sun May 24 10:02:09 +0000 2020
3,1264481519847997440,Mask by @PichieK84. #COVID19UG #StaySafeUg htt...,Sun May 24 09:01:01 +0000 2020
4,1264342526565265408,"@maureennantume1 says stay safe, keep your dis...",Sat May 23 23:48:43 +0000 2020
...,...,...,...
1259,1264503666075226112,@amiyo90 Finally you are on the bird app,Sun May 24 10:29:01 +0000 2020
1260,1263926887497314304,@Innocentt_T Govt pays its suppliers,Fri May 22 20:17:07 +0000 2020
1261,1263776889275125760,"@SherryKyama Is this necessary, really? https:...",Fri May 22 10:21:04 +0000 2020
1262,1263754332606009344,@rarrigz Man we fail to understand. Masks help...,Fri May 22 08:51:26 +0000 2020
