In [1]:
from requests_oauthlib import OAuth1Session

import jsonlines
import pandas as pd
import secrets

In [2]:
# Build the OAuth 1 session used for every request to the endpoint.
# NOTE(review): `secrets` is expected to resolve to a project-local module that
# defines these four constants; the standard-library `secrets` module does not.
twitter = OAuth1Session(
    client_key=secrets.CLIENT_KEY,
    client_secret=secrets.CLIENT_SECRET,
    resource_owner_key=secrets.RESOURCE_TOKEN,
    resource_owner_secret=secrets.RESOURCE_SECRET,
)

### URL Configuration

The URL has been configured with the following in context:
* `screen_name = midasIIITD`: Specifies the username whose tweets are to be fetched.
* `count = 200`: Specifies the number of tweets to be fetched in one request.
* `trim_user = true`: Don't fetch the full user information along with each tweet, as it is irrelevant here.
* `tweet_mode = extended`: This mode allows us to access additional media information and the full text of the tweet.

In [3]:
# Account whose timeline is fetched
screen_name = 'midasIIITD'

# Endpoint plus the fixed query parameters; only the screen name is substituted.
url = (
    'https://api.twitter.com/1.1/statuses/user_timeline.json'
    '?screen_name={}&count=200&trim_user=true&tweet_mode=extended'
).format(screen_name)

In [4]:
# Fetch the first (most recent) page of tweets.
first_response = twitter.get(url)
# Fail loudly on an HTTP error instead of storing the error payload as `tweets`,
# which would only surface later as a confusing failure during pagination.
first_response.raise_for_status()
tweets = first_response.json()

# Number of requests made so far; the pagination loop uses it as a cap.
attempts = 1

### Pagination

Paginating through tweets is a nuanced process because new tweets keep arriving while we page through the timeline. A reliable way to paginate is to rely on each tweet having a unique ID, such that a newer tweet always has an ID greater than that of every tweet posted before it. We can use this ID as the key for pagination: the `id` of the oldest tweet fetched so far is directly accessible in the JSON response, and passing it (minus one) as the `max_id` request parameter makes the next request return only older tweets.

In [5]:
# Keep requesting older pages until enough tweets are collected, the request
# budget is used up, or the timeline runs out.
while len(tweets) < 900 and attempts <= 5:
    if not tweets:
        # Nothing fetched so far, so there is no oldest tweet to page from.
        break

    # Ask only for tweets strictly older than the oldest one already held;
    # subtracting 1 keeps that tweet from being returned a second time.
    max_id = str(tweets[-1]['id'] - 1)
    page_response = twitter.get(url + '&max_id={}'.format(max_id))
    # Surface HTTP errors here rather than concatenating an error payload.
    page_response.raise_for_status()
    older_tweets = page_response.json()
    attempts = attempts + 1

    if not older_tweets:
        # An empty page means the timeline is exhausted; further requests
        # would return nothing new.
        break
    tweets = tweets + older_tweets

- [x] DEBUG Check: Test that there are no duplicate tweets

In [6]:
# Collect every distinct tweet once. The tweets are dicts (unhashable), so
# membership is tested by equality against the list built so far.
d_tweets = []
for tweet in tweets:
    if tweet in d_tweets:
        continue
    d_tweets.append(tweet)

# Equal lengths mean no tweet occurred more than once.
print(len(tweets) == len(d_tweets))

True


In [7]:
# Persist the raw tweets to disk in JSON Lines format.
with jsonlines.open('tweets.jsonl', mode='w') as tweet_writer:
    tweet_writer.write_all(tweets)

- [x] DEBUG Check: Test that tweets can be retrieved completely

In [8]:
# Read the file back and confirm it round-trips to the in-memory tweets.
with jsonlines.open('tweets.jsonl') as tweet_reader:
    test_open = list(tweet_reader)
print(test_open == tweets)

True


In [9]:
# Fields kept for every tweet
demanded_info = ['full_text', 'created_at', 'favorite_count', 'retweet_count', 'extended_entities']

tweets_info = []
for tweet in tweets:
    # For a retweet, read the fields from the tweet nested under
    # 'retweeted_status' instead of from the wrapping retweet.
    retweeted = tweet.get('retweeted_status')
    source = tweet if retweeted is None else retweeted
    tweets_info.append({field: source.get(field) for field in demanded_info})

In [10]:
# Replace each record's 'extended_entities' payload with a photo count.
for info in tweets_info:
    if 'extended_entities' not in info:
        # Already processed by an earlier run of this cell; skipping keeps the
        # cell safe to re-run instead of failing on the missing key.
        continue

    # The value is None for many entries, so a default passed to .get()/.pop()
    # would not help; fall back with `or` instead.
    ee = info.pop('extended_entities') or {}
    media = ee.get('media') or []
    photo_count = sum(1 for medium in media if medium.get('type') == 'photo')

    # Store a real number, or None when there are no photos. The previous
    # str(count or None) stored the literal text 'None' for such tweets.
    info['image_count'] = photo_count or None

In [11]:
# Display full text: None disables column-width truncation. The older -1
# sentinel is deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
# Use the fully qualified option name; the abbreviated 'max_rows' can match
# more than one option in newer pandas and then raises an error.
pd.set_option('display.max_rows', 900)

In [12]:
# Map each record key to its display label explicitly. Assigning df.columns
# positionally only works if the frame's columns happen to be in alphabetical
# order; when they follow dict insertion order instead, every label ends up on
# the wrong data.
column_labels = {
    'created_at': 'Timestamp',
    'favorite_count': 'Favorites',
    'full_text': 'Text',
    'image_count': 'Image Count',
    'retweet_count': 'Retweets',
}

# Selecting the columns by key fixes their order before the rename.
df = (
    pd.DataFrame(tweets_info, columns=list(column_labels))
    .rename(columns=column_labels)
)
display(df)

Unnamed: 0,Timestamp,Favorites,Text,Image Count,Retweets
0,Tue Mar 26 05:54:49 +0000 2019,1,@IEEEBigMM19 @ACMMM19 and 6 days left for workshop proposal in @IEEEBigMM19.\n\nContact @cchatto for any query.,,0
1,Tue Mar 26 05:18:13 +0000 2019,3,Hurry Up!\n6 Days left for Abstract Submission in @ACMMM19 \n45 Days left for Regular Paper Submission in @IEEEBigMM19 .\n\nHectic time ahead or Multimedia Researchers :),,3
2,Mon Mar 25 13:01:57 +0000 2019,16,Congratulations @midasIIITD students Simra Shahid @Simcyy and Nilay Shrivastava @NilayShri on getting selected for a research internship at Adobe in this summer. \n\n#MIDAS #Achievment #Research #Summer #Internship https://t.co/WdF663EB5y,2.0,1
3,Sun Mar 24 18:44:01 +0000 2019,8,"The last date for submitting a solution for the @midasIIITD internship task is 26th March midnight. We will not accept solutions submitted after the deadline. \nThus, if you have not submitted your solution yet then kindly do so before the deadline. \n#Summer #Research #Internship",,3
4,Mon Mar 18 06:42:56 +0000 2019,4,@IIITDelhi invites application from Foreign Nationals/PIOs/NRIs/OCIs for admission to its https://t.co/u6kIuhIrGM and https://t.co/Q0ibR1xaDl Programs through IIIT-Delhi admission process for AY 2019-20. Visit Website for more information : https://t.co/CGWqrughDw https://t.co/c2o3593dbR,1.0,4
5,Sun Mar 24 11:34:27 +0000 2019,4,"One more week is left to submit the workshop proposal to @IEEEBigMM19. \nLooking forward to your submission. For any query, contact workshop co-chair Dr. Chiranjoy Chattopadhyay @cchatto or TPC co-chair @RatnRajiv",,0
6,Sat Mar 23 05:17:50 +0000 2019,6,We are honored to have Dr. Chang Wen Chen (Chinese University Hong Kong @CUHKofficial ) and Dr. Roger Zimmermann (National University Singapore @NUSingapore ) as our General co-chairs.,,5
7,Sun Mar 24 05:58:27 +0000 2019,13,"Distinguished researchers Dr. Rajiv Ratn Shah @RatnRajiv (@IIITDelhi , @midasIIITD ), Dr. Jianquan Liu (NEC Japan) and Dr. Vivek Singh (Rutgers University @RutgersU ) will be our technical program co-chairs.",,3
8,Wed Mar 20 08:19:24 +0000 2019,1,@IEEEBigMM19 is also available on Facebook now. \nLIKE its Facebook page https://t.co/B3Q0zmmzXb to get the regular updates. \nCheck more details at https://t.co/w9ZymoPisk \n\n#IEEE #BigMM19 #Big #Multimedia #Singapore,,1
9,Tue Mar 19 18:15:01 +0000 2019,6,"BigMM 2019 : IEEE BigMM 2019 – Call for Workshop Proposals \n\nhttps://t.co/I4vqf8FE6K … \nWhen: Sep 11, 2019 - Sep 13, 2019 \nWhere: Singapore\nSubmission Deadline: Apr 1, 2019 \nNotification Due: Apr 10, 2019 \n\n#IEEE #BigData #Singapore #Multimedia",,5


# That's it. Yay!