## Importing the required libraries

In [1]:
import json
import os

import jsonlines
import pandas as pd
import tweepy
from tqdm import tqdm
from tweepy import Cursor

## Api keys and access tokens

In [2]:
# Credentials are taken from environment variables so real secrets never have
# to be saved inside the notebook. When a variable is not set, the original
# 'xxx' placeholder is used, so the names below are always defined.
ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', 'xxx')
ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET', 'xxx')
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', 'xxx')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', 'xxx')

## authenticating with Twitter credentials

In [3]:
# Create the handler from the consumer key pair, then attach the access
# token pair to the same handler.
auth = tweepy.OAuthHandler(
    consumer_key=CONSUMER_KEY,
    consumer_secret=CONSUMER_SECRET,
)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

## api to connect to twitter with my credentials

In [4]:
# Client options passed to tweepy.API alongside the auth handler.
# NOTE(review): going by their names, the first two make the client wait
# (and announce that it is waiting) when a rate limit is hit — confirm
# against the installed tweepy version.
api_options = {
    "wait_on_rate_limit": True,
    "wait_on_rate_limit_notify": True,
    "compression": True,
}
api = tweepy.API(auth, **api_options)

## Testing the API syntax
#### - `Cursor` is used to iterate through timelines, user lists, pages, etc.
#### - `items` is set to 1 because a single tweet is enough to check that the connection works

In [5]:
# Smoke test: request one status from the home timeline and print its raw
# JSON payload. `x` keeps that payload for the inspection cells that follow.
home_cursor = tweepy.Cursor(api.home_timeline, tweet_mode="extended", entities="extended")
for status in home_cursor.items(1):
    x = status._json
    print(x)

{'created_at': 'Sat Apr 06 15:01:10 +0000 2019', 'id': 1114543577684746240, 'id_str': '1114543577684746240', 'full_text': '60+ #Free #Books on #BigData, #DataScience, #DataMining, #MachineLearning, #Python, R, and more #KDN https://t.co/TRyJTymhTM', 'truncated': False, 'display_text_range': [0, 124], 'entities': {'hashtags': [{'text': 'Free', 'indices': [4, 9]}, {'text': 'Books', 'indices': [10, 16]}, {'text': 'BigData', 'indices': [20, 28]}, {'text': 'DataScience', 'indices': [30, 42]}, {'text': 'DataMining', 'indices': [44, 55]}, {'text': 'MachineLearning', 'indices': [57, 73]}, {'text': 'Python', 'indices': [75, 82]}, {'text': 'KDN', 'indices': [96, 100]}], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/TRyJTymhTM', 'expanded_url': 'http://ow.ly/ztin30ohGRJ', 'display_url': 'ow.ly/ztin30ohGRJ', 'indices': [101, 124]}]}, 'source': '<a href="https://www.hootsuite.com" rel="nofollow">Hootsuite Inc.</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': No

# Task
You have to write a python script which can fetch all the tweets(as many as allowed by Twitter
API) done by midas@IIITD twitter handle and dump the responses into JSONlines file.
The other part of your script should be able to parse these JSONline files to display the
following for every tweet in a tabular format.

● The text of the tweet.

● Date and time of the tweet.

● The number of favorites/likes.

● The number of retweets.

● Number of Images present in Tweet. If no image returns None.

## Finding the location of required information

In [7]:
# Show the top-level field names of the sample tweet, a separator line,
# and then the field names of its nested "user" object.
tweet_fields = x.keys()
print(tweet_fields)
print("------")
user_fields = x["user"].keys()
print(user_fields)

dict_keys(['created_at', 'id', 'id_str', 'full_text', 'truncated', 'display_text_range', 'entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'possibly_sensitive', 'possibly_sensitive_appealable', 'lang'])
------
dict_keys(['id', 'id_str', 'name', 'screen_name', 'location', 'description', 'url', 'entities', 'protected', 'followers_count', 'friends_count', 'listed_count', 'created_at', 'favourites_count', 'utc_offset', 'time_zone', 'geo_enabled', 'verified', 'statuses_count', 'lang', 'contributors_enabled', 'is_translator', 'is_translation_enabled', 'profile_background_color', 'profile_background_image_url', 'profile_background_image_url_https', 'profile_background_tile', 'profile_image_url', 'profile_image_url_https', 'profile_banner_url', 'profile_link_c

### tweet['full_text'] gives text of the tweet
### tweet['retweet_count'] gives number of retweets
### tweet['favorite_count'] gives number of favorites
### tweet['created_at'] gives date and time

In [8]:
# Field names available inside the sample tweet's "entities" object.
entity_section = x["entities"]
entity_section.keys()

dict_keys(['hashtags', 'symbols', 'user_mentions', 'urls'])

### tweet['extended_entities']['media'] has the IDs of the images, if any are present
The length of tweet['extended_entities']['media'] therefore gives us the number of images.

## checking the rate limit quota:

In [9]:
# Show the limit / remaining / reset values the API reports per endpoint.
# NOTE(review): the displayed response also contains the access token under
# 'rate_limit_context' — clear or redact this cell's output before sharing.
api.rate_limit_status()

{'rate_limit_context': {'access_token': 'xxx'},
 'resources': {'lists': {'/lists/list': {'limit': 15,
    'remaining': 15,
    'reset': 1554564897},
   '/lists/memberships': {'limit': 75, 'remaining': 75, 'reset': 1554564897},
   '/lists/subscribers/show': {'limit': 15,
    'remaining': 15,
    'reset': 1554564897},
   '/lists/members': {'limit': 900, 'remaining': 900, 'reset': 1554564897},
   '/lists/subscriptions': {'limit': 15, 'remaining': 15, 'reset': 1554564897},
   '/lists/show': {'limit': 75, 'remaining': 75, 'reset': 1554564897},
   '/lists/ownerships': {'limit': 15, 'remaining': 15, 'reset': 1554564897},
   '/lists/subscribers': {'limit': 180, 'remaining': 180, 'reset': 1554564897},
   '/lists/members/show': {'limit': 15, 'remaining': 15, 'reset': 1554564897},
   '/lists/statuses': {'limit': 900, 'remaining': 900, 'reset': 1554564897}},
  'application': {'/application/rate_limit_status': {'limit': 180,
    'remaining': 179,
    '

# Part 1: fetching all the tweets(as many as allowed by Twitter API) done by midas@IIITD twitter handle and dump the responses into JSONlines file

## getting information about the user:

In [10]:
# Look up the account and print a short profile summary, one
# "<field>: <value>" line per field.
item = api.get_user(screen_name='midasIIITD')

# str() is applied to every field, not only the counts, so a text field that
# comes back as None (e.g. an empty description — TODO confirm the API can
# return null here) is printed instead of raising TypeError on concatenation.
summary_fields = (
    "name",
    "screen_name",
    "description",
    "statuses_count",
    "friends_count",
    "followers_count",
)
for field in summary_fields:
    print(field + ": " + str(getattr(item, field)))

name: MIDAS IIITD
screen_name: midasIIITD
description: MIDAS is a group of researchers at IIIT-Delhi who study, analyze, and build different multimedia systems for society leveraging multimodal information.
statuses_count: 327
friends_count: 42
followers_count: 233


## Making a dictionary of tweets
## - used `Cursor` to page through the tweets on the user's timeline
## - `tweet_mode` is set to "extended" to get the full text of each tweet and not just the first 140 characters
## - `entities` is set to "extended" to get the IDs of all media files present and not just the first
## - used a dictionary because the tweets have to be saved in the JSON Lines format, which makes this easier; lookups are also fast in dictionaries

In [11]:
# Collect the raw JSON of every status the cursor yields for the account,
# keyed by a 1-based running index. `tweet_count` ends up holding the number
# of statuses stored (it stays 0 when the cursor yields nothing).
tweets = {}
tweet_count = 0
timeline = Cursor(api.user_timeline, screen_name='midasIIITD', tweet_mode="extended", entities="extended")
for tweet_count, status in enumerate(tqdm(timeline.items()), start=1):
    tweets[tweet_count] = status._json

327it [00:11, 29.41it/s]


## checking the dictionary formed

In [12]:
# Print every field of the first stored tweet as "<name> <value>", one per line.
first_tweet = tweets[1]
for field_name, field_value in first_tweet.items():
    print(field_name, field_value)

created_at Fri Apr 05 16:08:37 +0000 2019
id 1114198161562775553
id_str 1114198161562775553
full_text We have emailed the task details to all candidates who have applied to @midasIIITD internship through IIITD portal. Kindly check your spam folder if you have not received the email. We will evaluate all solutions received until April 10 midnight and announce results by April 14.
truncated False
display_text_range [0, 279]
entities {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'midasIIITD', 'name': 'MIDAS IIITD', 'id': 1021355762575073281, 'id_str': '1021355762575073281', 'indices': [71, 82]}], 'urls': []}
source <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
in_reply_to_status_id None
in_reply_to_status_id_str None
in_reply_to_user_id None
in_reply_to_user_id_str None
in_reply_to_screen_name None
user {'id': 1021355762575073281, 'id_str': '1021355762575073281', 'name': 'MIDAS IIITD', 'screen_name': 'midasIIITD', 'location': 'New Delhi, India', 'd

## Here `tweets` is a dictionary of dictionaries
## - saving the `tweets` dictionary in .jsonl format
## - each tweet is saved on its own line

In [13]:
# Write the stored tweets to output.jsonl, one JSON object per line, in the
# order they were inserted into `tweets`.
# The `with` block closes the writer on exit, so no explicit close() follows.
with jsonlines.open('output.jsonl', mode='w') as writer:
    writer.write_all(tweets.values())

# Part 2: parse these JSONline files to display the following for every tweet in a tabular format

## reading the .jsonl file using jsonlines library

In [14]:
# Load every line of output.jsonl back into a list of dicts (one per tweet).
# The `with` block closes the reader on exit, so no explicit close() follows.
with jsonlines.open('output.jsonl') as reader:
    tweet = list(reader)

In [15]:
# Number of top-level fields in the sixth tweet read back from the file.
sixth_tweet = tweet[5]
len(sixth_tweet)

26

## creating a pandas table

In [16]:
# Empty frame with one column per value reported for each tweet.
table_columns = ['Text', 'Date', 'Time', 'Favorites', 'Retweets', 'No. of Images']
table = pd.DataFrame(columns=table_columns)

## Checking that the logic behind the number of images works

In [17]:
# Count the media entries attached to one tweet known to carry media.
media_items = tweet[20]['extended_entities']['media']
len(media_items)

1

## checking the format of date-time column

In [18]:
# Raw timestamp string of the second tweet, to see which pieces need splitting.
second_tweet = tweet[1]
second_tweet["created_at"]

'Fri Apr 05 04:05:11 +0000 2019'

## For loop to convert the list of tweet dictionaries into a pandas table

In [19]:
# Turn every stored tweet into one table row. The rows are collected in a
# plain list and the frame is built once at the end, instead of enlarging the
# DataFrame with table.loc[ix] on every iteration. Re-running this cell gives
# the same table.
rows = []
for i in tweet:
    # The media count is 0 when the tweet has no 'extended_entities' key.
    # NOTE(review): the task text asks for None when a tweet has no image, and
    # 'media' may contain items other than photos — confirm the intended value.
    no_of_images = len(i['extended_entities']['media']) if "extended_entities" in i else 0

    # 'created_at' looks like 'Fri Apr 05 04:05:11 +0000 2019'.
    date_time = i['created_at'].split()
    date = " ".join((date_time[1], date_time[2], date_time[-1]))  # e.g. 'Apr 05 2019'
    time = date_time[3] + date_time[4]                             # e.g. '04:05:11+0000'

    rows.append([i["full_text"], date, time, i['favorite_count'], i['retweet_count'], no_of_images])

# Reuse the column layout of the (empty) table created earlier.
table = pd.DataFrame(rows, columns=table.columns)

## printing the table

In [20]:
# Display the assembled table: one row per tweet with its text, date, time,
# favorite count, retweet count and media count.
table

Unnamed: 0,Text,Date,Time,Favorites,Retweets,No. of Images
0,We have emailed the task details to all candid...,Apr 05 2019,16:08:37+0000,7,1,0
1,RT @rfpvjr: Our NAACL paper on polarization in...,Apr 05 2019,04:05:11+0000,0,16,0
2,RT @kdnuggets: Effective Transfer Learning For...,Apr 05 2019,04:04:43+0000,0,10,1
3,RT @stanfordnlp: What’s new in @Stanford CS224...,Apr 03 2019,18:31:53+0000,0,55,0
4,RT @DeepMindAI: Today we're releasing a large-...,Apr 03 2019,17:04:32+0000,0,841,0
5,RT @ylecun: Congratulations Jitendra Malik !\n...,Apr 03 2019,09:03:40+0000,0,16,0
6,RT @IIITDelhi: Another chance to take admissio...,Apr 03 2019,07:46:02+0000,0,4,0
7,Dear @midasIIITD internship candidates who hav...,Apr 02 2019,04:20:13+0000,8,1,0
8,Looking forward to your paper submission to @I...,Apr 02 2019,02:44:54+0000,5,1,0
9,RT @ngrams: Reproducibility in multimedia rese...,Apr 02 2019,02:35:44+0000,0,7,0
