In [1]:
# Import libraries

import json
import pandas as pd

In [2]:
# Define functions to separate user and tweet information

def get_user(user):
    user = {
        'id': user['id'],
        'name': user['name'],
        'screen_name': user['screen_name'],
        'location': user['location'],
        'url': user['url'],
        'description': user['description'],
        'protected': user['protected'],
        'verified': user['verified'],
        'followers_count': user['followers_count'],
        'friends_count': user['friends_count'],
        'listed_count': user['listed_count'],
        'favourites_count': user['favourites_count'],
        'statuses_count': user['statuses_count'],
        'created_at': user['created_at'],
        'geo_enabled': user['geo_enabled'],
        'profile_image_url_https': user['profile_image_url_https'],
        # 'profile_banner_url': user['profile_banner_url']
    }

    return user
    
def get_tweet(data):
    tweet = {
        'created_at': data['created_at'],
        'id': data['id'],
        'text': data['text'],
        'source': data['source'],
        'user_id': data['user']['id'],
        'quote_count': data['quote_count'],
        'reply_count': data['reply_count'],
        'retweet_count': data['retweet_count'],
        'favorite_count': data['favorite_count'],
        'hashtags': data['entities']['hashtags'],
        'lang': data['lang']
    }
    
    return tweet

In [3]:
def get_data(file):

    # Create variables to store data
    tweet_data = []  
    user_data = []   
    
    with open(file, "r") as f1:
        for line in f1:
            if line.strip():
                data = json.loads(line)
                
                user = get_user(data["user"])
                tweet = get_tweet(data)

                # Process retweeted status if available
                retweet_status = data.get('retweeted_status')
                if retweet_status:
                    user_rt = get_user(retweet_status["user"])
                    tweet_rt = get_tweet(retweet_status)
                    tweet["retweet_id"] = tweet_rt['id']        # If the tweet is a retweet, add its id
                    tweet_rt["retweet_id"] = None               # For a retweeted tweet, id is None
                    
                    user_data.append(user_rt)
                    tweet_data.append(tweet_rt)
                else:
                    tweet["retweet_id"] = None                  # If the tweet is not a retweet, then id is None
    
                user_data.append(user)
                tweet_data.append(tweet)
                
    return user_data, tweet_data

In [4]:
# Get data for users and tweets

user_data, tweet_data = get_data("corona-out-3")

In [5]:
# Convert list of user data to DataFrame

user_dataframe = pd.DataFrame(user_data).drop_duplicates(['id'])
user_dataframe

Unnamed: 0,id,name,screen_name,location,url,description,protected,verified,followers_count,friends_count,listed_count,favourites_count,statuses_count,created_at,geo_enabled,profile_image_url_https
0,804046791348015107,Bi Sex Uau,B_King69,"Acre, Brasil",https://www.instagram.com/?hl=pt-br,se for da minha família já pode voltar daq mesmo,False,False,89,173,0,5446,4728,Wed Nov 30 19:37:48 +0000 2016,False,https://pbs.twimg.com/profile_images/121079497...
1,2242948745,Thomas Krause,tho1965,,,Sportredakteur @nordkurier 🏃‍♂️🚴‍♂️⚽️,False,False,173,685,9,2184,1865,Wed Dec 25 09:13:33 +0000 2013,True,https://pbs.twimg.com/profile_images/123527187...
2,207809313,BJP,BJP4India,"6-A, Deen Dayal Upadhyay Marg,",http://www.bjp.org,Official Twitter account of the Bharatiya Jana...,False,True,13481667,3,3018,0,200315,Tue Oct 26 02:19:07 +0000 2010,True,https://pbs.twimg.com/profile_images/812531108...
3,908326492718764034,शचीन्द्र पाण्डेय,im_S_pandey,Amethi Uttar Pradesh,,Official Twitter Handel Shachindra Pandey (@im...,False,False,2362,202,3,30668,48906,Thu Sep 14 13:48:06 +0000 2017,True,https://pbs.twimg.com/profile_images/125281991...
4,2929344220,Ralf Schmitz,RusticusArat,🇩4790 Provinz,,"BWLer,ex Offz,Tw meistens zwischen Tür & Angel...",False,False,778,733,2,32024,30551,Thu Dec 18 10:19:26 +0000 2014,False,https://pbs.twimg.com/profile_images/110369011...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163004,92819550,santosh,skpandey77,"Thanjavur, India",,,False,False,24,34,0,534,309,Thu Nov 26 19:32:02 +0000 2009,False,https://abs.twimg.com/sticky/default_profile_i...
163008,2279976427,Venugopi,Venu7630,,,,False,False,3,35,0,83,42,Tue Jan 07 03:50:17 +0000 2014,False,https://pbs.twimg.com/profile_images/810515161...
163010,4890378807,𝔻𝕒𝕪𝕤𝕖 gab.ai/AvelarDayse,AvelarDayse,Brazil,,#JesusChrist❤︎︎ #QAnonPsyOp #StopZion࿕ #BanIsl...,False,False,3020,2306,2,29381,12704,Tue Feb 09 01:38:25 +0000 2016,False,https://pbs.twimg.com/profile_images/117139421...
163014,872453029,R.A.shah,Rashah007,"Lucknow, India",,,False,False,881,1921,0,22823,9514,Wed Oct 10 20:23:59 +0000 2012,True,https://pbs.twimg.com/profile_images/124794246...


In [6]:
# Convert list of tweet data to DataFrame

tweet_dataframe = pd.DataFrame(tweet_data).drop_duplicates(['id'])
tweet_dataframe

Unnamed: 0,created_at,id,text,source,user_id,quote_count,reply_count,retweet_count,favorite_count,hashtags,lang,retweet_id
0,Sat Apr 25 12:21:41 +0000 2020,1254022770679320576,"É isto, ou vou morrer sem ar ou com o corona h...","<a href=""http://twitter.com/download/android"" ...",804046791348015107,0,0,0,0,[],pt,
1,Sat Apr 25 12:21:41 +0000 2020,1254022770746372096,Schöne Runde mit dem Rennrad ✌️\n#sport #coron...,"<a href=""http://twitter.com/download/iphone"" r...",2242948745,0,0,0,0,"[{'text': 'sport', 'indices': [32, 38]}, {'tex...",de,
2,Sat Apr 25 07:30:12 +0000 2020,1253949413191344128,India’s war with Corona is ongoing.\n\nPlay yo...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",207809313,16,101,340,1870,[],en,
3,Sat Apr 25 12:21:42 +0000 2020,1254022772558368768,RT @BJP4India: India’s war with Corona is ongo...,"<a href=""http://twitter.com/download/android"" ...",908326492718764034,0,0,0,0,[],en,1.253949e+18
4,Sat Apr 25 12:21:42 +0000 2020,1254022772575043586,Was sollen 150 Euro Computerzuschuss bringen? ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",2929344220,0,0,0,0,[],de,
...,...,...,...,...,...,...,...,...,...,...,...,...
163008,Sat Apr 25 14:48:38 +0000 2020,1254059751379361793,@VSReddy_MP I think wantedly you ate damaging ...,"<a href=""http://twitter.com/download/android"" ...",2279976427,0,0,0,0,[],en,
163010,Sat Apr 25 14:48:38 +0000 2020,1254059751949942784,RT @DeepStateExpose: RETWEET! 5G is the real s...,"<a href=""http://twitter.com/download/iphone"" r...",4890378807,0,0,0,0,[],en,1.254060e+18
163012,Sat Apr 25 14:48:38 +0000 2020,1254059752134455296,RT @bongofive: Diamond atangaza kuzilipia kodi...,"<a href=""http://twitter.com/download/android"" ...",1196389874351431680,0,0,0,0,[],ro,1.254038e+18
163014,Sat Apr 25 14:48:38 +0000 2020,1254059751945596930,RT @AjeetSonwarsha: “धार्मिक” एक्सपर्ट लोगों स...,"<a href=""http://twitter.com/download/android"" ...",872453029,0,0,0,0,[],hi,1.253989e+18


In [7]:
def get_date(df):
    df["created_at"] = pd.to_datetime(df["created_at"])
    df['day'] = df['created_at'].dt.day
    df['month'] = df['created_at'].dt.month
    df['year'] = df['created_at'].dt.year
    df['time'] = df['created_at'].dt.time
    df['date'] = df['created_at'].dt.date
    
    return df
    

In [8]:
user_dataframe = get_date(user_dataframe)
tweet_dataframe = get_date(tweet_dataframe)

  df["created_at"] = pd.to_datetime(df["created_at"])
  df["created_at"] = pd.to_datetime(df["created_at"])


In [9]:
# Create a new column to store hashtags

all_tags = []    # Store tags for all rows
for hashtags in tweet_dataframe["hashtags"]: 
    tags = []
    if hashtags:
        for tag in hashtags:
            text = tag['text']
            tags.append(text)
            
        all_tags.append(tags)
    else:
        all_tags.append([])

tweet_dataframe["hashtag_list"] = all_tags

In [10]:
tweet_dataframe

Unnamed: 0,created_at,id,text,source,user_id,quote_count,reply_count,retweet_count,favorite_count,hashtags,lang,retweet_id,day,month,year,time,date,hashtag_list
0,2020-04-25 12:21:41+00:00,1254022770679320576,"É isto, ou vou morrer sem ar ou com o corona h...","<a href=""http://twitter.com/download/android"" ...",804046791348015107,0,0,0,0,[],pt,,25,4,2020,12:21:41,2020-04-25,[]
1,2020-04-25 12:21:41+00:00,1254022770746372096,Schöne Runde mit dem Rennrad ✌️\n#sport #coron...,"<a href=""http://twitter.com/download/iphone"" r...",2242948745,0,0,0,0,"[{'text': 'sport', 'indices': [32, 38]}, {'tex...",de,,25,4,2020,12:21:41,2020-04-25,"[sport, corona]"
2,2020-04-25 07:30:12+00:00,1253949413191344128,India’s war with Corona is ongoing.\n\nPlay yo...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",207809313,16,101,340,1870,[],en,,25,4,2020,07:30:12,2020-04-25,[]
3,2020-04-25 12:21:42+00:00,1254022772558368768,RT @BJP4India: India’s war with Corona is ongo...,"<a href=""http://twitter.com/download/android"" ...",908326492718764034,0,0,0,0,[],en,1.253949e+18,25,4,2020,12:21:42,2020-04-25,[]
4,2020-04-25 12:21:42+00:00,1254022772575043586,Was sollen 150 Euro Computerzuschuss bringen? ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",2929344220,0,0,0,0,[],de,,25,4,2020,12:21:42,2020-04-25,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163008,2020-04-25 14:48:38+00:00,1254059751379361793,@VSReddy_MP I think wantedly you ate damaging ...,"<a href=""http://twitter.com/download/android"" ...",2279976427,0,0,0,0,[],en,,25,4,2020,14:48:38,2020-04-25,[]
163010,2020-04-25 14:48:38+00:00,1254059751949942784,RT @DeepStateExpose: RETWEET! 5G is the real s...,"<a href=""http://twitter.com/download/iphone"" r...",4890378807,0,0,0,0,[],en,1.254060e+18,25,4,2020,14:48:38,2020-04-25,[]
163012,2020-04-25 14:48:38+00:00,1254059752134455296,RT @bongofive: Diamond atangaza kuzilipia kodi...,"<a href=""http://twitter.com/download/android"" ...",1196389874351431680,0,0,0,0,[],ro,1.254038e+18,25,4,2020,14:48:38,2020-04-25,[]
163014,2020-04-25 14:48:38+00:00,1254059751945596930,RT @AjeetSonwarsha: “धार्मिक” एक्सपर्ट लोगों स...,"<a href=""http://twitter.com/download/android"" ...",872453029,0,0,0,0,[],hi,1.253989e+18,25,4,2020,14:48:38,2020-04-25,[]


In [11]:
user_dataframe

Unnamed: 0,id,name,screen_name,location,url,description,protected,verified,followers_count,friends_count,...,favourites_count,statuses_count,created_at,geo_enabled,profile_image_url_https,day,month,year,time,date
0,804046791348015107,Bi Sex Uau,B_King69,"Acre, Brasil",https://www.instagram.com/?hl=pt-br,se for da minha família já pode voltar daq mesmo,False,False,89,173,...,5446,4728,2016-11-30 19:37:48+00:00,False,https://pbs.twimg.com/profile_images/121079497...,30,11,2016,19:37:48,2016-11-30
1,2242948745,Thomas Krause,tho1965,,,Sportredakteur @nordkurier 🏃‍♂️🚴‍♂️⚽️,False,False,173,685,...,2184,1865,2013-12-25 09:13:33+00:00,True,https://pbs.twimg.com/profile_images/123527187...,25,12,2013,09:13:33,2013-12-25
2,207809313,BJP,BJP4India,"6-A, Deen Dayal Upadhyay Marg,",http://www.bjp.org,Official Twitter account of the Bharatiya Jana...,False,True,13481667,3,...,0,200315,2010-10-26 02:19:07+00:00,True,https://pbs.twimg.com/profile_images/812531108...,26,10,2010,02:19:07,2010-10-26
3,908326492718764034,शचीन्द्र पाण्डेय,im_S_pandey,Amethi Uttar Pradesh,,Official Twitter Handel Shachindra Pandey (@im...,False,False,2362,202,...,30668,48906,2017-09-14 13:48:06+00:00,True,https://pbs.twimg.com/profile_images/125281991...,14,9,2017,13:48:06,2017-09-14
4,2929344220,Ralf Schmitz,RusticusArat,🇩4790 Provinz,,"BWLer,ex Offz,Tw meistens zwischen Tür & Angel...",False,False,778,733,...,32024,30551,2014-12-18 10:19:26+00:00,False,https://pbs.twimg.com/profile_images/110369011...,18,12,2014,10:19:26,2014-12-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163004,92819550,santosh,skpandey77,"Thanjavur, India",,,False,False,24,34,...,534,309,2009-11-26 19:32:02+00:00,False,https://abs.twimg.com/sticky/default_profile_i...,26,11,2009,19:32:02,2009-11-26
163008,2279976427,Venugopi,Venu7630,,,,False,False,3,35,...,83,42,2014-01-07 03:50:17+00:00,False,https://pbs.twimg.com/profile_images/810515161...,7,1,2014,03:50:17,2014-01-07
163010,4890378807,𝔻𝕒𝕪𝕤𝕖 gab.ai/AvelarDayse,AvelarDayse,Brazil,,#JesusChrist❤︎︎ #QAnonPsyOp #StopZion࿕ #BanIsl...,False,False,3020,2306,...,29381,12704,2016-02-09 01:38:25+00:00,False,https://pbs.twimg.com/profile_images/117139421...,9,2,2016,01:38:25,2016-02-09
163014,872453029,R.A.shah,Rashah007,"Lucknow, India",,,False,False,881,1921,...,22823,9514,2012-10-10 20:23:59+00:00,True,https://pbs.twimg.com/profile_images/124794246...,10,10,2012,20:23:59,2012-10-10


In [12]:
# Save the files

user_dataframe.to_csv('user_data.csv', index=False)
tweet_dataframe.to_json('tweet_data.json', orient='records', lines=True)