# Extract tweets for several tweets accounts:

In [1]:
import config
consumer_key= config.consumer_key
consumer_secret= config.consumer_secret
access_token = config.access_token
access_token_secret= config.access_token_secret

In [2]:
# importing libraries:
import tweepy as tw
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
%matplotlib inline

In [3]:
# connecting to twitter API and Creating API object
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

In [161]:
# Define Supporting fucntions to extract url and hashtags from data:
def url_tweet(url_arg):
    for url in url_arg:
        return url['expanded_url']

def find_urls(text):
    return re.findall(r'https?://.*\b',text)


def hashtag_tweet(entity_arg):
    tags = []
    for tag in entity_arg:
        if tag.get('text') is not None: 
            tags.append(tag.get('text'))
    return tags

# Define a fucntion to extract tweets from a twitter account timeline and save them in a Pandas Dataframe:
def tweet(user):
    print(f'retrieving tweets from {user}.... ')
    print('This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.')
    tweets = tw.Cursor(api.user_timeline,
                   screen_name=user,
                   lang="en",
                   since="2010-01-20",
                   until='2021-06-01').items(10000)

  
    users_locs = [[tweet.user.id,tweet.created_at,tweet.user.screen_name, tweet.text, find_urls(tweet.text),
               tweet.user.location,tweet.favorite_count,tweet.retweet_count,
               tweet.user.followers_count,tweet.user.friends_count,
               url_tweet(tweet.entities['urls']),hashtag_tweet(tweet.entities['hashtags']) ] for tweet in tweets]

    tweet_text = pd.DataFrame(data=users_locs, 
                    columns=['ID','created_at', 'screen_name','text','urls', 'location',
                             'favorite_count', 'retweet_count','followers_count','friends_count',
                             'URL','hashtags'])

    print('retrieving tweets has been finished.')
    print(f'{len(tweet_text)} tweets has been retrieved.')
     
    return tweet_text


In [162]:
organizations = [
'@volunteerWR']
# Extracting tweets from organizations
print('Runing this code may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.')
print(f'Extracting tweets from {len(organizations)}')
first= True
for organization in organizations:
    if first:
        tweets_df = tweet(organization)
        first = False
    else:
        organization_tweets = tweet(organization)
        tweets_df = pd.concat([tweets_df,organization_tweets])


Runing this code may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
Extracting tweets from 1
retrieving tweets from @volunteerWR.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
100 tweets has been retrieved.


In [157]:

def find_url(text):
    return re.findall(r'https?://.*\b',text)

tweets_df['urls']=tweets_df['text'].apply(lambda x: find_url(x))

In [163]:
tweets_df['urls'].iloc[5:25]

5                             [https://t.co/OymJZTJo9V]
6                             [https://t.co/WWl6jR26Id]
7                                                    []
8                             [https://t.co/kAPCrispWw]
9                                                    []
10                                                   []
11                                                   []
12    [https://t.co/1vwvAo0t2z https://t.co/EAHihVqf2T]
13                                                   []
14                            [https://t.co/pNOEYBojy9]
15    [https://t.co/ynFZsg8aKN https://t.co/9chWLPm2Ml]
16                                                   []
17                                                   []
18                                                   []
19                                                   []
20                            [https://t.co/bAkZoA1h0J]
21                            [https://t.co/jux443BbaR]
22                            [https://t.co/ayzB

In [153]:
for text in tweets_df.text:
    print(text)

RT @Ian_Mosby: .@erinmillions &amp; I didn't write this headline but I suspect that it reflected the gut reaction of the American editors after…
RT @cityofcambridge: Tomorrow night at 7 pm we kick off Summer Nights Live for the kids with Sonshine &amp; Broccoli. Enjoy live music every We…
RT @Kinbridge: We are excited to add a second location for our summer camps. Sign up for our outdoor adventures camp at Cedar Hill United C…
RT @FdnBlkComm: This #EmancipationDay, we remember and honour our ancestors whose labour, struggle, and tenacity resulted in the abolition…
RT @TREEducation: 💥 NEW opportunity this fall for youth!

We're looking for youth passionate about social justice, conflict resolution, and…
RT @LeadingELLs: Wrote staff an email and then decided I should have made it a graphic. Names matter. Share away. https://t.co/OymJZTJo9V
Happy Tuesday All!
We are coming at you with this new research opportunity. Check it out! https://t.co/WWl6jR26Id
RT @WilmotFamilyRC: ADDED DATES: FRI

In [128]:
user = '@volunteerWR'
tweets = tw.Cursor(api.user_timeline,
                   screen_name=user,
                   lang="en",
                   since="2010-01-20",
                   until='2021-06-01').items(100)

In [129]:
df= [hashtag_tweet(tweet.entities['hashtags']) for tweet in tweets]
df

[[],
 [],
 [],
 ['EmancipationDay'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['EidAlAdha', 'ldnont', 'EidMubarak'],
 ['KitCenNDP'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['MondayMotivation',
  'JAWR',
  'kitchenerwaterloo',
  'niagararegion',
  'hamont',
  'guelphontario',
  'stcatharines',
  'haldimandcounty'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['DYK'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['NVW', 'volunteer', 'kwawesome', 'volunteer'],
 [],
 ['NVW2021'],
 ['inclusive', 'recruitment', 'immigrant', 'volunteering'],
 [],
 [],
 [],
 [],
 ['RacialEquityWR'],
 [],
 [],
 ['InternationalWomensDay'],
 [],
 [],
 ['ChangeMakers'],
 [],
 [],
 [],
 ['BlackHistoryMonth'],
 [],
 ['BlackHistoryMonth']]

In [116]:
df1= [tweet.entities['hashtags'] for tweet in tweets]
df1

[[],
 [],
 [],
 [{'text': 'EmancipationDay', 'indices': [21, 37]}],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [{'text': 'EidAlAdha', 'indices': [22, 32]},
  {'text': 'ldnont', 'indices': [33, 40]},
  {'text': 'EidMubarak', 'indices': [120, 131]}],
 [{'text': 'KitCenNDP', 'indices': [16, 26]}],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [{'text': 'MondayMotivation', 'indices': [17, 34]},
  {'text': 'JAWR', 'indices': [38, 43]},
  {'text': 'kitchenerwaterloo', 'indices': [44, 62]},
  {'text': 'niagararegion', 'indices': [63, 77]},
  {'text': 'hamont', 'indices': [78, 85]},
  {'text': 'guelphontario', 'indices': [86, 100]},
  {'text': 'stcatharines', 'indices': [101, 114]},
  {'text': 'haldimandcounty', 'indices': [115, 131]}],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [{'text': 'DYK', 'indices': [12, 16]}],
 [],
 [],
 [],
 [],
 [],
 [

In [117]:
hashtags = []
for tweet in tweets:
    tags=[]
    for tag in tweet.entities['hashtags']:
        if tag.get('text') is not None:
            tags.append(tag.get('text'))
    hashtags.append(tags)
    

In [127]:
def hashtag_tweet(entity_arg):
    tags = []
    for tag in entity_arg:
        if tag.get('text') is not None: 
            tags.append(tag.get('text'))
    return tags


In [None]:
[entry for tag in tags for entry in entries if tag in entry]

In [1]:
# list of organizations that the tweets exracted from their timeline:
organizations = [
'@volunteerWR', 
'@RegionWaterloo',
'@SHORECentreWR',
'@kwartgallery',
'@ChildWitnessCtr',
'@PrideStables',
'@hofwatreg',
'@SASCWR',
'@our_SPECTRUM',
'@KWMulticultural',
'@LangsCommunity',
'@vswaterloo',
'@YMCAsofCandKW',
'@cycling_future',
'@ywcakw',
'@kwsphumane',
'@THEMUSEUM',
'@kw_symphony',
'@Reception_House',
'@CambFoodBank',
'@Hope_Spring',
'@ROH_1967',
'@Travind111',
'@SASCWR',
'@vswaterloo',
'@HTsupportWR',
'@oneROOFYouth',
'@thekwcf',
'@Workingcentre',
'@KWMulticultural',
'@WomensCrisisSWR',
'@BBBSWR',
'@BEP_WR',
'@WorkforceWWD',
'@IdeaXchng',
'@KidSportKW',
'@Kinbridge',
'@KMHA_All',
'@KitchLibrary',
'@KitchenerSA',
'@KWDivingClub',
'@KWMLA',
'@Strong_Start',
'@TLGwr',
'@RMFaire',
'@theMTspace']
len(organizations)

46

In [43]:
# Extracting tweets from organizations
print('Runing this code may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.')
print(f'Extracting tweets from {len(organizations)}')
first= True
for organization in organizations:
    if first:
        tweets_df = tweet(organization)
        first = False
    else:
        organization_tweets = tweet(organization)
        tweets_df = pd.concat([tweets_df,organization_tweets])
    

Runing this code may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
Extracting tweets from 46
retrieving tweets from @volunteerWR.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
3215 tweets has been retrieved.
retrieving tweets from @RegionWaterloo.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
3239 tweets has been retrieved.
retrieving tweets from @SHORECentreWR.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
3193 tweets has been retrieved.
retrieving tweets from @kwartgallery.... 
This fucntion may take several minutes to run.

retrieving tweets has been finished.
3243 tweets has been retrieved.
retrieving tweets from @WorkforceWWD.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
1729 tweets has been retrieved.
retrieving tweets from @IdeaXchng.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
3233 tweets has been retrieved.
retrieving tweets from @KidSportKW.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweets has been finished.
1160 tweets has been retrieved.
retrieving tweets from @Kinbridge.... 
This fucntion may take several minutes to run. When the retrieving is finished, you will see the message showing the end of the process.
retrieving tweet

In [44]:
tweets_df.shape

(104638, 11)

In [2]:
tweets_df.head(10)

NameError: name 'tweets_df' is not defined

In [47]:
tweets_df.reset_index(drop=True, inplace=True)

In [51]:
tweets_df.to_json('tweets_all.json')

In [49]:
tweets