In [1]:
import json
import pandas as pd
import ast

In [2]:
# Load the JSON data: the file is read line by line and every line is decoded
# as its own JSON document.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open("corona-out-3", "r", encoding="utf-8") as f1:
    data = []
    for line in f1:
        try:
            item = json.loads(line)
            data.append(item)
        except json.JSONDecodeError:
            # if there is an error loading the json of the tweet, skip
            # (only decoding failures are skipped; a kernel interrupt or any
            # other unexpected error still propagates instead of being hidden)
            continue

In [3]:
# Parse the relevant fields from each tweet

# Top-level fields a decoded record must carry before it is parsed as a tweet.
_TWEET_FIELDS = ('user', 'created_at', 'text', 'entities')


def _parse_tweet(tweet):
    """Extract the fields used by this analysis from one decoded tweet.

    Args:
        tweet: dict decoded from one line of the input file; it must contain
            every key listed in ``_TWEET_FIELDS``.

    Returns:
        dict with the keys ``user`` (the author's ``screen_name``), ``date``
        (the ``created_at`` value), ``text``, ``urls`` (the ``url`` value of
        each URL entity), ``hashtags`` (the ``text`` value of each hashtag
        entity) and ``mentions`` (the ``screen_name`` of each mentioned user).

    Raises:
        KeyError: if a nested key read here is missing from the record.
    """
    entities = tweet['entities']
    return {
        'user': tweet['user']['screen_name'],
        'date': tweet['created_at'],
        'text': tweet['text'],
        'urls': [url['url'] for url in entities['urls']],
        'hashtags': [tag['text'] for tag in entities['hashtags']],
        'mentions': [mention['screen_name'] for mention in entities['user_mentions']],
    }


# json.loads accepts any JSON value, so a decoded line is not guaranteed to be
# a tweet object. Records that are not dicts, or that lack one of the top-level
# fields read above, are left out instead of aborting the whole run with a
# KeyError/TypeError. Parsing inside a helper also keeps the per-tweet
# temporaries out of the notebook namespace.
tweets = [
    _parse_tweet(record)
    for record in data
    if isinstance(record, dict) and all(field in record for field in _TWEET_FIELDS)
]


In [4]:
# Create a pandas DataFrame from the parsed data (one row per tweet).
# The columns are listed explicitly, in the same order as the keys of the
# parsed records, so they also exist when `tweets` is empty; without them an
# empty input yields a frame with no columns and the column lookups in the
# cells below raise KeyError.
df = pd.DataFrame(
    tweets,
    columns=['user', 'date', 'text', 'urls', 'hashtags', 'mentions'],
)

In [5]:
# How many tweets were collected: the frame holds one row per parsed tweet
print('Total number of tweets:', df.shape[0])

# How many distinct screen names appear in the 'user' column
# (missing values are not counted, which is nunique's default)
print('Number of unique users:', df['user'].nunique(dropna=True))

Total number of tweets: 101916
Number of unique users: 80953


In [6]:
# Ten most frequent hashtags: flatten the per-tweet hashtag lists to one value
# per row, then count each distinct value. Counting is case-sensitive, so
# spellings that differ only in case are tallied separately.
top_hashtags = (
    df['hashtags']
    .explode()
    .value_counts()
    .head(10)
)
print('Top hashtags:\n', top_hashtags)

Top hashtags:
 Corona                 4582
Mattarella             1506
25Aprile               1472
corona                 1449
Covid_19                973
AltaredellaPatria       805
PideAlmayaDiyeÇıkıp     776
COVID19                 764
Liberazione             696
coronavirus             629
Name: hashtags, dtype: int64


In [7]:
# Ten most frequent URLs: flatten the per-tweet URL lists to one value per
# row, then count how often each distinct URL occurs.
most_frequent_urls = (
    df['urls']
    .explode()
    .value_counts()
    .head(10)
)
print('Most frequent URLs:\n', most_frequent_urls)

Most frequent URLs:
 https://t.co/YPcJXU1uqw    1474
https://t.co/fGIsLKzTkm     988
https://t.co/YKpVaB5ZMQ     294
https://t.co/pZ7OSbu0V1     178
https://t.co/WKkHMq5OtK     150
https://t.co/VLkciV1L8z     143
https://t.co/h8S1NnGQUC     142
https://t.co/uuUP7cI73J     134
https://t.co/fx17qyhqrX     131
https://t.co/n3c9AGDM5m     110
Name: urls, dtype: int64


In [8]:
# Ten users with the most tweets: count the rows per screen name
# (value_counts returns the counts in descending order) and keep the first ten.
tweets_per_user = df['user'].value_counts()
most_prolific_users = tweets_per_user.iloc[:10]
print('Most prolific users:\n', most_prolific_users)

Most prolific users:
 trendy1517        295
Kenkendall19      271
ELister_social    131
Murat58939472      85
matomecorona       75
CoronaScanner      62
BabaMkwe           61
EricGavara         52
RanveerLawyer      49
News__Poster       44
Name: user, dtype: int64


In [9]:
# Print basic stats on word counts
# A "word" here is a whitespace-separated token of the tweet text.
# The vectorised .str.len() replaces .apply(len): it gives the same counts for
# text values and yields NaN for a missing text instead of raising TypeError,
# so describe() simply leaves such rows out of the statistics.
word_counts = df['text'].str.split().str.len()
print('Basic stats on word counts:\n', word_counts.describe())

Basic stats on word counts:
 count    101916.000000
mean         16.951803
std           6.089937
min           1.000000
25%          13.000000
50%          18.000000
75%          21.000000
max          38.000000
Name: text, dtype: float64
