In [2]:
# The raw dump is in JSON Lines form, i.e. one JSON object per line:
# {"name": "John", "age": 31, "city": "New York"}
# {"name": "John", "age": 31, "city": "New York"}
# We want to convert it into a standard JSON file holding a single array:
# [{"name": "John", "age": 31, "city": "New York"},{"name": "John", "age": 31, "city": "New York"}]

import json
import os


def processjson(filename, outpath="../usedata/"):
    """Convert a JSON Lines file into a standard JSON array file.

    Every non-blank line of ``filename`` is parsed as one JSON value; the
    resulting list is written to ``outpath`` under the input's base name.
    The number of parsed records is printed.

    Args:
        filename: Path of the JSON Lines input file (may contain directories).
        outpath: Directory that receives the converted file; it is created
            if it does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the input cannot be read or the output cannot be written.
    """
    # Iterate over the file directly and skip blank lines, which would
    # otherwise make json.loads raise.
    with open(filename, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]
    print(len(data))
    # if outpath not exist, create it
    os.makedirs(outpath, exist_ok=True)
    # Keep only the base name: callers pass "<dir>/<file>", and appending that
    # to outpath would point into a sub-directory that was never created.
    target = os.path.join(outpath, os.path.basename(filename))
    # Write the list of JSON objects to a new file
    with open(target, "w", encoding="utf-8") as f:
        json.dump(data, f)


# Directory and naming pattern of the raw daily dumps; the %d placeholder is
# filled with 0..5 below, giving 2020-10-20.json through 2020-10-25.json.
datapath = "./data/"
dataname = "2020-10-2%d.json"
day = range(6)

for i in day:
    filename = dataname % i
    # The conversion call is currently disabled.
    # NOTE(review): processjson() defaults to outpath="../usedata/", while the
    # loading cell reads from "./usedata/" - confirm the intended output
    # directory before re-enabling.
    # processjson(datapath + filename)

In [3]:
# Read the converted dump of the first day (index 0 in the name pattern)
usedatapath = "./usedata/"
first_day_path = usedatapath + (dataname % 0)
with open(first_day_path, mode="r", encoding="utf-8") as file:
    data = json.load(file)
print(len(data))

# Build a DataFrame from the first 1000 records only
import pandas as pd

head_records = data[:1000]
df = pd.DataFrame(head_records)
print(df.columns)

72875
Index(['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
       'user_id', 'username', 'name', 'place', 'tweet', 'language', 'mentions',
       'urls', 'photos', 'replies_count', 'retweets_count', 'likes_count',
       'hashtags', 'cashtags', 'link', 'retweet', 'quote_url', 'video',
       'thumbnail', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest'],
      dtype='object')


In [3]:
# Report the number of distinct values per column and remember the columns
# that contain none at all.
idx = []
for col in df.columns:
    try:
        n_unique = df[col].nunique()
    except TypeError:
        # nunique() cannot handle this column (presumably its cells hold
        # unhashable values such as lists or dicts) - report it and move on
        print(col, "TypeError")
        continue
    print(col, n_unique)
    if n_unique == 0:
        idx.append(col)

# Remove the columns that have zero distinct values
df.drop(columns=idx, inplace=True)

id 72875
conversation_id 60365
created_at 47102
date 2
time 47102
timezone 1
user_id 42304
username 42304
name 40643
place TypeError
tweet 70080
language 54
mentions TypeError
urls TypeError
photos TypeError
replies_count 106
retweets_count 271
likes_count 442
hashtags TypeError
cashtags TypeError
link 72875
retweet 1
quote_url 9408
video 2
thumbnail 15339
near 1
geo 1
source 1
user_rt_id 1
user_rt 1
retweet_id 1
reply_to TypeError
retweet_date 1
translate 1
trans_src 1
trans_dest 1


重要信息: `id`,`time`,`user_id`,`username`,`tweet`,`language`
可用以评价 tweet 影响力的信息：`replies_count`,`retweets_count`,`likes_count`
可判断 tweet 话题的信息：`hashtags`,`mentions`


In [4]:
# How many tweets were written in each language
language_counts = df["language"].value_counts()
language_counts

language
en     804
und    144
es      22
fr      13
pt       7
tl       4
fa       2
de       2
in       1
it       1
Name: count, dtype: int64

主要语言为英语，为方便处理，将其他语言的 tweet 翻译为英语，只保留出现频次不少于 100 的语言


In [5]:
# Keep only the rows whose language occurs at least 100 times
lang_counts = df["language"].value_counts()
language = lang_counts[lang_counts >= 100].index.tolist()
is_frequent = df["language"].isin(language)
df = df[is_frequent]
print(df["language"].value_counts())

language
en     58020
und     9983
es      1055
fr       897
de       673
it       379
pt       253
tl       223
fa       203
nl       192
hi       161
Name: count, dtype: int64


In [6]:
# translate all non-en tweets to english
import googletrans
from googletrans import Translator

# Single Translator instance shared by every translate() call below
translator = Translator()


def translate(language, tweet):
    """Return ``tweet`` translated by the shared ``translator``.

    Tweets whose ``language`` is ``"en"`` are returned unchanged. Any other
    tweet is passed to ``translator.translate`` with its default settings
    (presumably targeting English, as the cell comment says - confirm
    against the googletrans version in use).

    Args:
        language: Language code attached to the tweet.
        tweet: Tweet text.

    Returns:
        The original text for English tweets, the ``.text`` of the
        translation result otherwise, or ``""`` if the translation failed.
    """
    if language == "en":
        return tweet
    try:
        return translator.translate(tweet).text
    except Exception:
        # Best effort: a failed translation yields an empty tweet. Catching
        # Exception instead of using a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate, so a long run can still be interrupted.
        return ""


# df["tweet"] = df.apply(lambda x: translate(x["language"], x["tweet"]), axis=1)

In [6]:
# Work on an independent copy holding only the English tweets; the language
# column is constant afterwards, so it is dropped.
is_english = df["language"] == "en"
df_en = df.loc[is_english].copy().drop(columns=["language"])
df_en.shape

(804, 35)

In [10]:
# clean tweets
import re
def clean_tweets(text):
    """Strip retweet markers, @mentions, URLs and line breaks from a tweet.

    Args:
        text: Raw tweet text.

    Returns:
        The text with ``RT @user:`` prefixes, ``@user`` mentions and http(s)
        URLs removed, and every newline replaced by a single space.
    """
    # Raw strings: "\w" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r"RT @\w*:", "", text)
    text = re.sub(r"@\w*", "", text)
    # Consume the URL up to the next whitespace; the former character class
    # stopped at '-', '?', '=' etc. and left URL fragments in the text.
    text = re.sub(r"https?://\S*", "", text)
    # Replace (rather than delete) line breaks so the words on either side
    # are not fused into one token.
    text = text.replace("\n", " ")
    return text

# Overwrite each tweet with its cleaned form
df_en["tweet"] = df_en["tweet"].apply(clean_tweets)

In [12]:
# add a new tag column to whether the tweet have string "biden" or "trump"
# 0 for none, 1 for biden, 2 for trump, 3 for both
def add_tag(text):
    """Classify a tweet by the candidate names it contains.

    The check is a case-insensitive substring search.

    Returns:
        0 if neither "biden" nor "trump" occurs, 1 for "biden" only,
        2 for "trump" only, 3 if both occur.
    """
    lowered = text.lower()
    mentions = ("biden" in lowered, "trump" in lowered)
    codes = {
        (False, False): 0,
        (True, False): 1,
        (False, True): 2,
        (True, True): 3,
    }
    return codes[mentions]

# Tag every tweet, then show how many tweets fall into each tag
df_en["tag"] = df_en["tweet"].apply(add_tag)
df_en["tag"].value_counts()

tag
2    297
1    263
3    144
0    100
Name: count, dtype: int64

In [8]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create a SentimentIntensityAnalyzer object. It needs the "vader_lexicon"
# resource; fetch it on demand so the cell also runs on a fresh environment
# (previously the download line had to be un-commented and run by hand).
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon")
    sia = SentimentIntensityAnalyzer()


def get_polarity_scores(tweet):
    """Score one tweet with the shared ``sia`` analyzer.

    Returns whatever ``sia.polarity_scores`` returns for ``tweet``
    (for VADER presumably a dict of neg/neu/pos/compound scores - confirm).
    """
    scores = sia.polarity_scores(tweet)
    return scores


# Score every tweet in df_en and store the value returned by
# get_polarity_scores in a new "polarity_scores" column.
df_en["polarity_scores"] = df_en["tweet"].apply(get_polarity_scores)

In [None]:
# plot a word cloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

def plot_wordcloud(tag_list):
    """Draw a word cloud of the tweets in ``df_en`` whose tag is in ``tag_list``.

    Args:
        tag_list: Iterable of tag values to include.
    """
    # Concatenate the selected tweets into one text
    selected = df_en.loc[df_en["tag"].isin(tag_list), "tweet"]
    text = " ".join(selected)

    # Build the cloud: 800x800 px, white background, wordcloud's stop words
    options = {
        "width": 800,
        "height": 800,
        "background_color": "white",
        "stopwords": set(STOPWORDS),
        "min_font_size": 10,
    }
    wordcloud = WordCloud(**options).generate(text)

    # Render the cloud on an 8x8 inch figure without axes
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.axis("off")
    plt.imshow(wordcloud)

# Tag 2 marks tweets that contain "trump" but not "biden" (see add_tag)
plot_wordcloud([2])

In [None]:
# build a network from the tweets

import igraph as ig

# G = ig.Graph.TupleList(tuples.itertuples(index=False), 
#                            directed=True, 
#                            weights=False,
#                            edge_attrs=['tweetid','timestamp']
#                            ) 




In [18]:
%pip install bertopic

Collecting bertopic
  Using cached bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
Processing /home/twh/.cache/pip/wheels/5e/6f/8c/d88aec621f3f542d26fac0342bef5e693335d125f4e54aeffe/sentence_transformers-2.2.2-py3-none-any.whl
Processing /home/twh/.cache/pip/wheels/72/93/36/3c7c74a6f2127e71810a0e0f535955175556a434aec55de679/hdbscan-0.8.33-cp38-cp38-linux_x86_64.whl
Collecting torch>=1.6.0
  Using cached torch-2.1.0-cp38-cp38-manylinux1_x86_64.whl (670.2 MB)
Collecting torchvision
  Using cached torchvision-0.16.0-cp38-cp38-manylinux1_x86_64.whl (6.9 MB)
Collecting transformers<5.0.0,>=4.6.0
  Using cached transformers-4.35.0-py3-none-any.whl (7.9 MB)
Collecting cython<3,>=0.27
  Using cached Cython-0.29.36-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
Collecting nvidia-nvtx-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64"
  Using cached nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
Collecting sympy
  

ERROR: unknown command "cache" - maybe you meant "check"
Note: you may need to restart the kernel to use updated packages.
