In [None]:
from requests_oauthlib import OAuth1Session

import jsonlines
import pandas as pd
import secrets

In [None]:
# Authenticated session used for every request to the API endpoint.
# NOTE(review): `secrets` must resolve to a module exposing these four
# credential attributes; it shares its name with the standard-library
# `secrets` module -- confirm which one the import picks up.
twitter = OAuth1Session(
    client_key=secrets.CLIENT_KEY,
    client_secret=secrets.CLIENT_SECRET,
    resource_owner_key=secrets.RESOURCE_TOKEN,
    resource_owner_secret=secrets.RESOURCE_SECRET,
)

### URL Configuration

The URL has been configured with the following in context:
* `screen_name = midasIIITD`: Specifies the user whose tweets are to be fetched.
* `count = 200`: Specifies the number of tweets to be fetched in one request.
* `trim_user = true`: Don't fetch user information along with tweets as it is irrelevant anyways.
* `tweet_mode = extended`: This mode allows us to access additional media information and the full text of the tweet.

In [None]:
screen_name = 'midasIIITD'

# Timeline endpoint plus its query string; only the screen name is variable.
url = (
    'https://api.twitter.com/1.1/statuses/user_timeline.json'
    '?screen_name={}'
    '&count=200&trim_user=true&tweet_mode=extended'
).format(screen_name)

In [None]:
# Fetch the first page of the timeline and decode its JSON body.
first_response = twitter.get(url)
tweets = first_response.json()
# Number of requests issued so far; the pagination loop caps this.
attempts = 1

### Pagination

Paginating through tweets is a nuanced process because the timeline changes in real time. A reliable way to paginate is to rely on each tweet's unique ID: a newer tweet always has an ID greater than that of every tweet posted before it. We can therefore use this ID as the key for easy pagination. The `id` of each tweet is directly accessible through the JSON response; passing the ID of the oldest tweet fetched so far, minus one, as the `max_id` request parameter returns only older tweets.

In [None]:
# Keep requesting older pages until enough tweets are collected, the request
# budget is spent, or the timeline is exhausted.
if not isinstance(tweets, list):
    # A JSON object instead of a list of tweets cannot be paginated; fail
    # with the payload visible rather than with an opaque KeyError below.
    raise RuntimeError('Unexpected timeline response: {}'.format(tweets))

while tweets and len(tweets) < 900 and attempts <= 5:
    # Ask only for tweets strictly older than the oldest one fetched so far.
    max_id = str(tweets[-1]['id'] - 1)
    page = twitter.get(url + '&max_id={}'.format(max_id)).json()
    attempts = attempts + 1
    if not isinstance(page, list):
        raise RuntimeError('Unexpected timeline response: {}'.format(page))
    if not page:
        # An empty page means there is nothing older left to fetch; asking
        # again with the same max_id would only waste requests.
        break
    tweets = tweets + page

- [x] DEBUG Check: Test that there are no duplicate tweets

In [None]:
# Collect each distinct tweet payload once, in first-seen order.
d_tweets = []
for tweet in tweets:
    if tweet in d_tweets:
        continue
    d_tweets.append(tweet)
# Prints True when no payload occurred more than once.
print(len(tweets) == len(d_tweets))

In [None]:
# Persist every fetched tweet to 'tweets.jsonl', overwriting any previous file.
with jsonlines.open('tweets.jsonl', mode='w') as jsonl_out:
    jsonl_out.write_all(tweets)

- [x] DEBUG Check: Test that tweets can be retrieved completely

In [None]:
# Read the saved file back and compare it with the in-memory tweets.
with jsonlines.open('tweets.jsonl') as reader:
    test_open = list(reader)
print(tweets == test_open)

In [None]:
# Fields kept from each tweet.
demanded_info = ['full_text', 'created_at', 'favorite_count', 'retweet_count', 'extended_entities']

tweets_info = []
for tweet in tweets:
    # When the tweet carries a 'retweeted_status' object, read the fields
    # from that object instead of from the tweet itself.
    retweeted = tweet.get('retweeted_status')
    source = tweet if retweeted is None else retweeted
    # Missing fields are recorded as None.
    tweets_info.append({field: source.get(field) for field in demanded_info})

In [None]:
# Replace each record's raw 'extended_entities' entry with an 'image_count'.
for info in tweets_info:
    # The 'extended_entities' key is removed once it has been counted, so its
    # absence means this record was already processed. Skipping such records
    # keeps the cell safe to re-run; without the guard a second run would
    # overwrite 'image_count' with 'None' and then raise KeyError on the del.
    if 'extended_entities' not in info:
        continue
    # info['extended_entities'] is None for many entries, so a default passed
    # to .get() would never apply; `or` supplies the fallback instead.
    ee = info.get('extended_entities') or {}
    media = ee.get('media') or []
    count = sum(1 for medium in media if medium.get('type') == 'photo')
    # A count of zero is stored as the string 'None'; any other count is
    # stored as its decimal string.
    info['image_count'] = str(count or None)
    del info['extended_entities']

In [None]:
# Display full text: None removes the column-width limit (negative values
# such as -1 are deprecated and rejected by current pandas).
pd.set_option('display.max_colwidth', None)
# Use the fully qualified option name; the short pattern 'max_rows' can match
# more than one registered option, which makes set_option raise.
pd.set_option('display.max_rows', 900)

In [None]:
# Map each record key to its display label explicitly. Assigning a bare list
# to df.columns depends on the order in which pandas happens to lay out the
# dict keys, which mislabels every column when insertion order is preserved.
column_labels = {
    'created_at': 'Timestamp',
    'favorite_count': 'Favorites',
    'full_text': 'Text',
    'image_count': 'Image Count',
    'retweet_count': 'Retweets',
}
# `columns=` both selects the fields and fixes their left-to-right order.
df = pd.DataFrame(tweets_info, columns=list(column_labels)).rename(columns=column_labels)
display(df)

# That's it. Yay!