# Extract tweets for several tweets accounts:

In [1]:
import config
consumer_key= config.consumer_key
consumer_secret= config.consumer_secret
access_token = config.access_token
access_token_secret= config.access_token_secret

In [2]:
# importing libraries:
import tweepy as tw
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
%matplotlib inline

In [3]:
# connecting to twitter API and Creating API object
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

In [161]:
# Define Supporting fucntions to extract url and hashtags from data:
def url_tweet(url_arg):
    for url in url_arg:
        return url['expanded_url']

def find_urls(text):
    return re.findall(r'https?://.*\b',text)


def hashtag_tweet(entity_arg):
    tags = []
    for tag in entity_arg:
        if tag.get('text') is not None: 
            tags.append(tag.get('text'))
    return tags

def has_media(entity_arg):
    if entity_arg != []:
        return True
    else:
        return False


def is_reply(reply_id):
    if reply_id is not None:
        return True
    else: 
        return False

def is_poll(poll):
    if poll != []:
        return True
    else:
        return False

# Define a fucntion to extract tweets from a twitter account timeline and save them in a Pandas Dataframe:
def tweet(user):
    print(f'retrieving tweets from {user}.... ')
    print('This function may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.')
    tweets = tw.Cursor(api.user_timeline,
                   screen_name=user,
                   lang="en",
                   since="2010-01-20",
                   until='2021-06-01').items(10000)

  
    users_locs = [[tweet.user.id,tweet.created_at,tweet.user.screen_name, tweet.text, find_urls(tweet.text),
               tweet.user.location,tweet.favorite_count,tweet.retweet_count,
               tweet.user.followers_count,tweet.user.friends_count,
               url_tweet(tweet.entities['urls']),hashtag_tweet(tweet.entities['hashtags']),
               has_media(tweet.entities.get('media', [])), tweet.is_quote_status,
               is_reply(tweet.in_reply_to_status_id), is_poll(tweet.entities.get("polls", [])) ] for tweet in tweets]

    tweet_text = pd.DataFrame(data=users_locs, 
                    columns=['ID','created_at', 'screen_name','text','urls', 'location',
                             'favorite_count', 'retweet_count','followers_count','friends_count',
                             'URL','hashtags', 'has_media', 'is_quote', 'is_reply', 'has_poll'])

    print('retrieving tweets has been finished.')
    print(f'{len(tweet_text)} tweets has been retrieved.')
     
    return tweet_text


In [1]:
# list of organizations that the tweets exracted from their timeline:
organizations = organizations_df['twitter_handle'].to_list()
organizations
len(organizations)

46

In [43]:
# Extracting tweets from organizations
print('Runing this code may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.')
print(f'Extracting tweets from {len(organizations)}')
first= True
for organization in organizations:
    if first:
        tweets_df = tweet(organization)
        first = False
    else:
        organization_tweets = tweet(organization)
        tweets_df = pd.concat([tweets_df,organization_tweets])
    

Runing this code may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
Extracting tweets from 46
retrieving tweets from @volunteerWR.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
3215 tweets has been retrieved.
retrieving tweets from @RegionWaterloo.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
3239 tweets has been retrieved.
retrieving tweets from @SHORECentreWR.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
3193 tweets has been retrieved.
retrieving tweets from @kwartgallery.... 
This fucntion may take several minutes to run.

retrieving tweets has been finished.
3243 tweets has been retrieved.
retrieving tweets from @WorkforceWWD.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
1729 tweets has been retrieved.
retrieving tweets from @IdeaXchng.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
3233 tweets has been retrieved.
retrieving tweets from @KidSportKW.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
1160 tweets has been retrieved.
retrieving tweets from @Kinbridge.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweet

In [44]:
# add category column:
tweets_df.merge(organizations_df[['Organization Name','screen_name','agency_catergory']], how='left', on = 'screen_name' )

(104638, 11)

In [47]:
tweets_df.reset_index(drop=True, inplace=True)
tweets_df.head(10)

In [49]:
# importing libraries

import nltk
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')  
from nltk.corpus import stopwords  
import re

## Removing retweet indicator

In [None]:
# Removing retweet indicator such as  "RT @WaterlooLibrary":

#Extract retweet from the text and save it in 'retween' column:
tweets_df['retweet']=tweets_df['text'].str.findall(r'RT (@\w+):')

# Removing retweet from the text:
def remove_RT(txt):
    return re.sub(r'RT @\w+:',r'', txt) 
tweets_df['text_clean']=  [remove_RT(tweet) for tweet in  tweets_df['text']]
#tweets_df['text']
tweets_df.loc[0:10]

In [None]:
# Removing all whitesapces from the beginning and the end of the tweets.
# Whitespaces are the characters related to new line, tab, and space.
tweets_df['text_clean'] = [m.strip() for m in tweets_df['text_clean']]
tweets_df.head()

## Removing urls

In [None]:
# Defining a function for removing urls from the text
#def remove_url(txt):
#    return " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())

def remove_url(text):
    return re.sub(r'https?://.*\b','',text)

tweets_df['text_clean']=tweets_df['text_clean'].apply(lambda x: remove_url(x))

## Removing Emojis

In [None]:
# Removing emojis from the text
# The following function revomes the emojis from text. 'apply' function should be used for each text.
# Apply function: Objects passed to the function are Series objects whose index is either the DataFrame’s index (axis=0) or the DataFrame’s columns (axis=1). 

def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
tweets_df['text_clean']=tweets_df['text_clean'].apply(lambda x: remove_emoji(x))

#Full list of emojies: https://unicode.org/emoji/charts/full-emoji-list.html

Removing Punctuations
string is a pre-built module in python for workingwith strings

In [None]:
# We use the list of punctuations in the string module for removing the punctuations
import string
string.punctuation

In [None]:
# Defining a supporting function for removing Punctuations
def remove_punc(text):
    return text.translate(text.maketrans('','',string.punctuation))
tweets_df['text_clean']= [remove_punc(x) for x in tweets_df['text_clean']]

In [None]:
# To export the tweets to json:
tweets_df.to_json('tweets_all.json')
