In [1]:
import json
import pandas as pd
import ast
import os
from datetime import datetime
import pytz
import pymongo
from pymongo import MongoClient

In [2]:
client = MongoClient("mongodb+srv://<user>:<password>@cluster0.wkyhu.mongodb.net/?retryWrites=true&w=majority")

In [3]:
db = client.get_database("twitter_db")

In [4]:
records = db.tweets_data

In [5]:
# Load the JSON data from corona-out-3 file
with open("../corona-out-3", "r") as f1:
    data = []
    for line in f1:
        try:
            item = json.loads(line)
            data.append(item)
        except:
            # if there is an error loading the json of the tweet, skip
            continue

In [6]:
# Load the JSON data from corona-out-2 file
with open("../corona-out-2", "r") as f2:
    for line in f2:
        try:
            item = json.loads(line)
            data.append(item)
        except:
            # if there is an error loading the json of the tweet, skip
            continue

In [7]:
tweets = []
users = []

for tweet in data :
    tweet_id = tweet['id']
    created_at = tweet['created_at']
    in_reply_to_status_id = tweet['in_reply_to_status_id']
    in_reply_to_user_id = tweet['in_reply_to_user_id']
    source = tweet['source']
    is_quote = tweet['is_quote_status']
    quote = None
    retweet = None

    ##################### user information #####################
    
    user_id = tweet['user']['id']
    user_name = tweet['user']['name']
    screen_name = tweet['user']['screen_name']
    location = tweet['user']['location']
    description = tweet['user']['description']
    verified =  tweet['user']['verified'] 
    followers_count = tweet['user']['followers_count'] 
    friends_count = tweet['user']['friends_count'] 
    listed_count = tweet['user']['listed_count'] 
    favourites_count =  tweet['user']['favourites_count'] 
    statuses_count =  tweet['user']['statuses_count']
    twitter_join_date = tweet['user']['created_at']
    language = tweet['user']['lang']
    following = tweet['user']['following']
    
     
    ##################### segregate retweets, tweets and quoted tweets information #####################
    
    try:
        if is_quote:

            original_tweet_id = tweet['quoted_status']['id']
            original_tweet_user_id = tweet['quoted_status']['user']['id']
            original_tweet_user_name = tweet['quoted_status']['user']['name']
            original_post_time = tweet['quoted_status']['created_at']

            original_tweet_quote_count = tweet['quoted_status']['quote_count']
            original_tweet_reply_count = tweet['quoted_status']['reply_count']
            original_tweet_retweet_count = tweet['quoted_status']['retweet_count']
            original_tweet_favorite_count = tweet['quoted_status']['favorite_count']

            original_tweet_hashtags = [hashtag['text'] for hashtag in tweet['quoted_status']['entities']['hashtags']]
            original_tweet_urls = [url['url'] for url in tweet['quoted_status']['entities']['urls']]
            original_tweet_mentions = [mention['screen_name'] for mention in tweet['quoted_status']['entities']['user_mentions']]

            quote_media = {'hashtags': original_tweet_hashtags, 
                           'urls': original_tweet_urls, 
                           'mentions': original_tweet_mentions}

            quote = {'tweet_id': original_tweet_id, 
                     'user_id': original_tweet_user_id, 
                     'user_name': original_tweet_user_name, 
                     'quote_count': original_tweet_quote_count, 
                     'reply_count': original_tweet_reply_count, 
                     'retweet_count': original_tweet_retweet_count, 
                     'favorite_count': original_tweet_favorite_count, 
                     'media': quote_media}


        if (tweet['text'].startswith('RT')):
            is_retweet = True

            if not is_quote:

                original_tweet_id = tweet['retweeted_status']['id']
                original_tweet_user_id = tweet['retweeted_status']['user']['id']
                original_tweet_user_name = tweet['retweeted_status']['user']['name']
                original_post_time = tweet['retweeted_status']['created_at']

                original_tweet_quote_count = tweet['retweeted_status']['quote_count']
                original_tweet_reply_count = tweet['retweeted_status']['reply_count']
                original_tweet_retweet_count = tweet['retweeted_status']['retweet_count']
                original_tweet_favorite_count = tweet['retweeted_status']['favorite_count']

                original_tweet_hashtags = [hashtag['text'] for hashtag in tweet['retweeted_status']['entities']['hashtags']]
                original_tweet_urls = [url['url'] for url in tweet['retweeted_status']['entities']['urls']]
                original_tweet_mentions = [mention['screen_name'] for mention in tweet['retweeted_status']['entities']['user_mentions']]

                retweet_media = {'hashtags': original_tweet_hashtags, 
                                 'urls': original_tweet_urls, 
                                 'mentions': original_tweet_mentions}


            else:
                retweet_media = quote_media

            # try adding extended tweet if it exists
            try:
                text = tweet['retweeted_status']['extended_tweet']['full_text']  
            except : 
                try:
                    text = tweet['retweeted_status']['text']
                except: 
                    continue

            retweet = {'tweet_id': original_tweet_id, 
                       'user_id': original_tweet_user_id, 
                       'user_name': original_tweet_user_name, 
                       'quote_count': original_tweet_quote_count, 
                       'reply_count': original_tweet_reply_count, 
                       'retweet_count': original_tweet_retweet_count, 
                       'favorite_count': original_tweet_favorite_count,
                       'created_at': original_post_time, 
                       'media': retweet_media}

        else:
            is_retweet = False

            # try adding extended tweet if it exists
            try :
                text = tweet['extended_tweet']['full_text']
            except:
                text = tweet['text']


        ##################### extra info about tweets #####################

        favorited = tweet['favorited']
        favorite_count = tweet['favorite_count']
        urls = [url['url'] for url in tweet['entities']['urls']]
        hashtags = [tag['text'] for tag in tweet['entities']['hashtags']]
        mentions = [mention['screen_name'] for mention in tweet['entities']['user_mentions']]

        media = {'hashtags': hashtags, 'urls': urls, 'mentions': mentions}

        quote_count = tweet['quote_count']
        reply_count = tweet['reply_count']
        retweet_count = tweet['retweet_count']
        favorite_count = tweet['favorite_count']

        try:
            # Making a JSON to be sent to collection.
            mongo_data = {'tweet_id': tweet_id, 'user': user_id, 
                          'name': user_name, 'date': created_at, 
                          'source': source, 'text': text, 
                          'in_reply_to_status_id': in_reply_to_status_id, 
                          'in_reply_to_user_id': in_reply_to_user_id, 
                          'is_retweet': is_retweet, 'is_quote': is_quote,
                          'retweet': retweet, 'quote': quote, 'media': media, 
                          'favorited': favorited, 'quote_count': quote_count, 
                          'reply_count': reply_count,'retweet_count': retweet_count} 

            records.insert_one(mongo_data)

        # Except if it's already there
        except pymongo.errors.DuplicateKeyError as e:
            print("Duplicate Key")


        ##################### combine everything #####################

        users.append({'user_id': user_id, 'name': user_name, 
                      'screen_name': screen_name, 'date': created_at,
                      'twitter_join_date': twitter_join_date, 'location': location, 
                      'description': description, 'verified': verified, 
                      'followers_count': followers_count,'friends_count': friends_count,
                      'listed_count': listed_count, 'favourites_count': favourites_count,
                      'language': language})

        tweets.append({'tweet_id': tweet_id, 'user': user_id, 
                       'name': user_name, 'date': created_at,
                       'source': source, 'original_tweet_id': original_tweet_id,
                       'original_tweet_user_id': original_tweet_user_id, 
                       'text' : text, 'quote_count': quote_count,
                       'reply_count': reply_count,'retweet_count': retweet_count, 
                       'favorite_count': favorite_count,
                       'favorited': favorited, 'urls': urls, 'hashtags': hashtags,
                       'mentions': mentions})
    except:
        continue


In [8]:
# Create a pandas DataFrame from the parsed data
df_users = pd.DataFrame(users)
df_tweets = pd.DataFrame(tweets)

In [9]:
print(df_tweets.shape)
print(df_users.shape)

(120390, 16)
(120392, 13)


In [10]:
df_users.head()

Unnamed: 0,user_id,name,screen_name,date,twitter_join_date,location,description,verified,followers_count,friends_count,listed_count,favourites_count,language
0,804046791348015107,Bi Sex Uau,B_King69,Sat Apr 25 12:21:41 +0000 2020,Wed Nov 30 19:37:48 +0000 2016,"Acre, Brasil",se for da minha família já pode voltar daq mesmo,False,89,173,0,5446,
1,2242948745,Thomas Krause,tho1965,Sat Apr 25 12:21:41 +0000 2020,Wed Dec 25 09:13:33 +0000 2013,,Sportredakteur @nordkurier 🏃‍♂️🚴‍♂️⚽️,False,173,685,9,2184,
2,908326492718764034,शचीन्द्र पाण्डेय,im_S_pandey,Sat Apr 25 12:21:42 +0000 2020,Thu Sep 14 13:48:06 +0000 2017,Amethi Uttar Pradesh,Official Twitter Handel Shachindra Pandey (@im...,False,2362,202,3,30668,
3,2929344220,Ralf Schmitz,RusticusArat,Sat Apr 25 12:21:42 +0000 2020,Thu Dec 18 10:19:26 +0000 2014,🇩4790 Provinz,"BWLer,ex Offz,Tw meistens zwischen Tür & Angel...",False,778,733,2,32024,
4,1206650133976408064,Büşra Öztaş,schrodingerk42,Sat Apr 25 12:21:42 +0000 2020,Mon Dec 16 18:59:53 +0000 2019,,,False,318,220,0,1974,


In [11]:
df_tweets.head()

Unnamed: 0,tweet_id,user,name,date,source,original_tweet_id,original_tweet_user_id,text,quote_count,reply_count,retweet_count,favorite_count,favorited,urls,hashtags,mentions
0,1254022772558368768,908326492718764034,शचीन्द्र पाण्डेय,Sat Apr 25 12:21:42 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",1253949413191344128,207809313,India’s war with Corona is ongoing.\n\nPlay yo...,0,0,0,0,False,[],[],[BJP4India]
1,1254022772575043586,2929344220,Ralf Schmitz,Sat Apr 25 12:21:42 +0000 2020,"<a href=""https://mobile.twitter.com"" rel=""nofo...",1254016434658848769,928501014,Was sollen 150 Euro Computerzuschuss bringen? ...,0,0,0,0,False,[https://t.co/wRAGYwMovO],[],[]
2,1254022772877131777,1206650133976408064,Büşra Öztaş,Sat Apr 25 12:21:42 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",1252576316135739392,1206650133976408064,@ozkan_yalim @DurmusYillmaz \nAçık kapalı görü...,0,0,0,0,False,[],[],"[schrodingerk42, ozkan_yalim, DurmusYillmaz]"
3,1254022773149589510,1248123252,minhyuk.,Sat Apr 25 12:21:42 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",1253992905703862272,858859031464751104,VIDEO | 25.04.20\n\n&gt; Monsta X appears on Y...,0,0,0,0,False,[],[],[MonstaXEurope]
4,1254022773858545665,50993809,🇮🇹Henry Whites♥️,Sat Apr 25 12:21:42 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",1254010851142569984,761787475,Morti COL Corona non PER il Corona.\n\nCONTE P...,0,0,0,0,False,[],[],[gustinicchi]


In [12]:
# define the input and output formats
input_format = '%a %b %d %H:%M:%S %z %Y'
output_format = '%Y-%m-%d %H:%M:%S %Z%z'

In [13]:
df_users['date'] = pd.to_datetime(df_users['date'], format=input_format)
df_tweets['date'] = pd.to_datetime(df_tweets['date'], format=input_format)

In [14]:
df_users['twitter_join_date'] = pd.to_datetime(df_users['twitter_join_date'], format=input_format)

In [15]:
df_users.to_csv('data/users.csv', index=False)
df_tweets.to_csv('data/tweets.csv', index=False)