### Installs

In [162]:
pip install bertopic==0.9.3

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Imports

In [534]:
from bertopic import BERTopic
from tqdm import tqdm
import random
import os.path
import pandas as pd
import pickle as pkl
import multiprocessing as mp
import re
import emoji
import plotly.express as px
from datetime import date, timedelta
import plotly.graph_objects as go
from plotly.subplots import make_subplots

### Constants

In [583]:
TEXTS_PATH = '/mlodata1/prakhar/twitter_covid_insights/insights_All/texts.txt'
TWEETS_SAMPLE = '/mlodata1/raileanu/tweets_sample.txt'
MODEL_PATH = '/mlodata1/raileanu/bertopic_model_100k'
REDUCED_MODEL_PATH = '/mlodata1/raileanu/reduced_bertopic_model_100k'
TWEETS_PIPED_PATH = "/mlodata1/prakhar/all_available_tweets_piped"
TWEETS_PATH = "/mlodata1/prakhar/all_available_tweets"
CLUSTERED_TWEETS_PATH = "/mlodata1/raileanu/clustered_tweets"
CLUSTERED_SAMPLED_TWEETS_PATH = "/mlodata1/raileanu/clustered_sampled_tweets"
BERT_TWEETS_TOPICS = "/mlodata1/raileanu/bert_topic_files"
BERT_ALL_TOPICS = "/mlodata1/raileanu/bert_all_topics.pkl"
CLUSTERED_TOPIC_TRENDS = "/mlodata1/raileanu/bert_clusters_trends.pkl"
NORMALIZED_CLUSTERED_TOPIC_TRENDS = "/mlodata1/raileanu/normalized_bert_clusters_trends.pkl"
PLOTS_PATH = "/mlodata1/raileanu/Topics_Visualization/paper_code/visualizations/plots/tweet_topic_trends"

### Random Sample

In [135]:
def get_tweets_random_sample(file, n_lines):
    it = iter(file)
    try:
        result = [next(it) for _ in range(n_lines)]
    except StopIteration:
        raise ValueError("Sample larger than population")

    for i, item in tqdm(enumerate(it, start=n_lines)):
        s = random.randint(0, i)
        if s < n_lines:
            result[s] = item
    return result

In [136]:
if not os.path.isfile(TWEETS_SAMPLE):
    tweets_sample_file = open(TWEETS_SAMPLE, 'w')
    with open(TEXTS_PATH) as file:
        for line in get_tweets_random_sample(file, 100000):
            tweets_sample_file.write(line)

In [137]:
lines = []
with open(TWEETS_SAMPLE) as file:
    lines = file.read().splitlines()

### Fit

In [147]:
if not os.path.isfile(MODEL_PATH):
    model = BERTopic(verbose=True).fit(lines)
    model.save(MODEL_PATH)
else:
    model = BERTopic(verbose=True).load(MODEL_PATH)

In [148]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,43864,-1_she_her_health_care
1,0,13897,0_yahoo_gt_pcr_whatshappeninginmyanmar
2,1,1251,1_vaccinated_vaccines_vaccination_choice
3,2,1221,2_mascarilla_cuarentena_pero_vacuna
4,3,999,3_coronavrus_nuevos_italia_nuevo
...,...,...,...
439,445,10,445_french_loisirs_mirrors_enormissime
437,447,10,447_loans_loan_mortgages_grants
436,448,10,448_wtt_sayinnn_200_02
435,442,10,442_vacinas_sepultura_ctnbiomcti_esperanadia


### Reduce topics

In [459]:
if not os.path.isfile(REDUCED_MODEL_PATH):
    transformed_topics, _ = model.transform(lines)
    reduced_topics = model.reduce_topics(lines, transformed_topics, nr_topics=100)
    model.save(REDUCED_MODEL_PATH)
else:
    model = BERTopic(verbose=True).load(REDUCED_MODEL_PATH)

In [159]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,44196,-1_we_your_nt_vaccine
1,0,14038,0_yahoo_gt_pcr_remote
2,1,1482,1_vaccine_vaccinated_vaccines_vaccination
3,2,1254,2_el_los_es_que
4,3,1164,3_coronavirus_coronavrus_por_los
...,...,...,...
96,95,166,95_biden_pence_trump_president
97,96,166,96_lockdown_going_back_go
98,97,163,97_19_covid19_covid19greece_covid19gr
99,98,163,98_tweet_twitter_retweet_tweets


In [326]:
topic_info_df = model.get_topic_info()
topic_id_to_name = pd.Series(topic_info_df['Name'].values, index=topic_info_df['Topic']).to_dict()
topic_name_to_id = {v: k for k, v in topic_id_to_name.items()}

### Transform

In [None]:
if len(os.listdir(TWEETS_PIPED_PATH)) == 0:
    for file in tqdm(os.listdir(TWEETS_PIPED_PATH)):
        piped_tweets = pd.read_parquet(os.path.join(TWEETS_PIPED_PATH, file))
        topics, _ = model.transform(piped_tweets['cleaned_text'].values.tolist())
        piped_tweets['topic_id'] = topics
        piped_tweets['topic_name'] = piped_tweets['topic_id'].apply(lambda x: topic_id_to_name[x])

        # Save predicted topics 
        save_path = os.path.join(CLUSTERED_TWEETS_PATH, file)
        piped_tweets.to_parquet(save_path)

### Cluster trends

First, we need to find out for which dates we have available topics and impute the rest.

In [536]:
start_date = os.listdir(CLUSTERED_SAMPLED_TWEETS_PATH)[0].replace('.parquet', '').replace('parsed_', '')
end_date = os.listdir(CLUSTERED_SAMPLED_TWEETS_PATH)[-1].replace('.parquet', '').replace('parsed_', '')

In [537]:
start_date

'2019-12-31'

In [538]:
end_date

'2021-09-30'

In [539]:
dates = pd.date_range(date(2019, 12, 31), date(2021, 10, 1) - timedelta(days=1), freq='d')

In [540]:
dates

DatetimeIndex(['2019-12-31', '2020-01-01', '2020-01-02', '2020-01-03',
               '2020-01-04', '2020-01-05', '2020-01-06', '2020-01-07',
               '2020-01-08', '2020-01-09',
               ...
               '2021-09-21', '2021-09-22', '2021-09-23', '2021-09-24',
               '2021-09-25', '2021-09-26', '2021-09-27', '2021-09-28',
               '2021-09-29', '2021-09-30'],
              dtype='datetime64[ns]', length=640, freq='D')

In [541]:
if not os.path.isfile(CLUSTERED_TOPIC_TRENDS):
    trends = {i: [0] * len(dates) for i in range(-1, 101)}
    normalized_trends = {i: [0] * len(dates) for i in range(-1, 101)}
    for file in tqdm(os.listdir(CLUSTERED_SAMPLED_TWEETS_PATH)):
        clustered_tweets = pd.read_parquet(os.path.join(CLUSTERED_SAMPLED_TWEETS_PATH, file))
        if len(clustered_tweets) != 0:
            clustered_tweets = clustered_tweets.groupby('topic_id', as_index=False)['hashtags'].count()
            all_topics_count = clustered_tweets['hashtags'].sum()
            date_loc = dates.get_loc(file.replace('.parquet', '').replace('parsed_', ''))
            for index, row in clustered_tweets.iterrows():
                trends[row['topic_id']][date_loc] = row['hashtags']
                normalized_trends[row['topic_id']][date_loc] = row['hashtags'] / all_topics_count
    with open(CLUSTERED_TOPIC_TRENDS, 'wb') as f:
        pkl.dump(trends, f)
    with open(NORMALIZED_CLUSTERED_TOPIC_TRENDS, 'wb') as f:
        pkl.dump(normalized_trends, f)
else:
    with open(CLUSTERED_TOPIC_TRENDS, 'rb') as f:
        trends = pkl.load(f)
    with open(NORMALIZED_CLUSTERED_TOPIC_TRENDS, 'rb') as f:
        trends = pkl.load(f)

### Topics Visualization

In [542]:
model.visualize_topics()

In [543]:
model.visualize_barchart()

In [544]:
model.visualize_heatmap()

In [545]:
model.visualize_term_rank()

### Most common topics

In [584]:
def print_topic_tweets(topic_id, nb_tweets):
    """
    A helper function that prints the content of nb_tweets tweets that are part of a given topic.
    """
    for file in os.listdir(CLUSTERED_SAMPLED_TWEETS_PATH):
        clustered_tweets = pd.read_parquet(os.path.join(CLUSTERED_SAMPLED_TWEETS_PATH, file))
        clustered_tweets = clustered_tweets[clustered_tweets['topic_id'] == topic_id]
        for index, row in clustered_tweets.iterrows():   
            if nb_tweets > 0:
                print(row['cleaned_text'] + '\n')
            nb_tweets -= 1
        if nb_tweets == 0:
            break

In [585]:
topic_counts = {}
for topic in range(-1, 100):
    # Count total number of occurrences
    topic_counts[topic] = sum(trends[topic])
    
# Get the top 20 most common topics
topic_counts = dict(sorted(topic_counts.items(), key=lambda item: -item[1]))
top_20_topics = list(topic_counts.items())[:21]
top_20_topics = [(topic_id, topic_id_to_name[topic_id], cnt) for (topic_id, cnt) in top_20_topics]
top_20_topics

[(-1, '-1_we_your_nt_vaccine', 290.95311954088936),
 (0, '0_yahoo_gt_pcr_remote', 55.541359683570704),
 (4, '4_china_chinese_coronavirus_outbreak', 13.44947443709286),
 (3, '3_coronavirus_coronavrus_por_los', 8.893184185668702),
 (11, '11_thread_here_louder_thanks', 7.372036257867939),
 (15, '15_here_contact_tracing_numbers', 6.051618313445327),
 (6, '6_schools_school_students_education', 5.557209367080795),
 (5, '5_fuck_lol_love_omg', 5.129921335061226),
 (1, '1_vaccine_vaccinated_vaccines_vaccination', 5.0700756541560015),
 (16, '16_via_covid19_5g_video', 5.0052402995517795),
 (8, '8_el_los_vacunacin_vacunas', 4.462687186854079),
 (26, '26_refund_coronavirus_theaters_cancelled', 3.947978205355962),
 (2, '2_el_los_es_que', 3.9023312192140884),
 (10, '10_pandemic_birthday_wave_during', 3.838254882161022),
 (12, '12_gon_he_covid_get', 3.812547710739271),
 (17, '17_sports_games_teams_nfl', 3.600683410642715),
 (9, '9_und_der_nicht_ich', 3.3222841625873207),
 (14, '14_pandemic_distancing_

In [626]:
normalized_trends

{-1: [0.6216216216216216,
  0.5,
  0.6060606060606061,
  0.5789473684210527,
  0.43333333333333335,
  0.2962962962962963,
  0.5333333333333333,
  0.6666666666666666,
  0.7058823529411765,
  0.4,
  0.4827586206896552,
  0.48484848484848486,
  0.5,
  0.5217391304347826,
  0.38064516129032255,
  0.41843971631205673,
  0.4434782608695652,
  0.42443729903536975,
  0.3890784982935154,
  0.36823104693140796,
  0.412751677852349,
  0.43984900705727886,
  0.4413135883724119,
  0.4482228373094328,
  0.46729101150775826,
  0.491057803821034,
  0.49201023221819873,
  0.4791774779449922,
  0.4767611504793664,
  0.48022071540340955,
  0.5086183730661445,
  0.5027311236623068,
  0.48086651193239305,
  0.48256399304390274,
  0.47356497984475465,
  0.4721674611993261,
  0.47681459644649826,
  0.4862008454973279,
  0.4804175612645295,
  0.47788431797092484,
  0.47995509192122376,
  0.4862396542585722,
  0.49587592249102175,
  0.49997816117056126,
  0.48821717045549173,
  0.5009486637393614,
  0.49735915

In [613]:
# Label the most relevant topics out of the 20 most common
topic_labels = {
    1: 'Vaccine',
    6: 'Education',
    4: 'China outbreak',
    23: 'Mask wearing',
    5: 'Internet slang',
    16: 'COVID-19 conspiracies',
    10: 'COVID-19 in Airports',
    12: 'COVID-19 fear and hope',
    20: 'COVID-19 emojis',
    19: 'Lockdown',
    7: 'Latin America COVID-19',
    17: 'Sports',
    15: 'Contact Tracing',
    42: 'Deaths'
}

### Epidemiological & Twitter trends Analysis

In [587]:
# Read epi for US
epi_df = pd.read_csv('../Data/epi_data.csv', low_memory=False)
epi_df = epi_df[epi_df.iso_code == 'USA']
country_dates = epi_df['date'].values

# Overlapping Dates
overlapping_dates = sorted(list(set([str(d.date()) for d in dates]) & set(country_dates)))
epi_df = epi_df[epi_df['date'].isin(overlapping_dates)]

In [588]:
print(epi_df.columns.tolist())

['date', 'name', 'country_id', 'cases', 'deaths', 'recovered', 'new_cases', 'new_deaths', 'total_population_all', 'total_population_below_1', 'total_population_1_to_4', 'total_population_5_to_14', 'total_population_15_to_24', 'total_population_25_to_34', 'total_population_35_to_54', 'total_population_55_to_74', 'total_population_above_75', 'population_percentage_over_60', 'hospital_beds_per_1000', 'physicians_per_1000', 'nurses_per_1000', 'fatality_rate_percent', 'cases_per_100000', 'deaths_per_100000', 'cases_per_1000_hospital_beds', 'deaths_per_1000_hospital_beds', 'cases_per_1000_nurses', 'deaths_per_1000_nurses', 'cases_per_1000_physicians', 'deaths_per_1000_physicians', 'days_since_outbreak', 'new_cases_percent_increase', 'avg_3d_new_cases_percent_increase', 'unemployment_rate_2019_03_31', 'unemployment_rate_2019_06_30', 'unemployment_rate_2019_09_30', 'unemployment_rate_2019_12_31', 'unemployment_rate_2020_03_31', 'unemployment_rate_2020_06_30', 'unemployment_rate_2020_09_30', 'u

In [614]:
def plot_epi_topic_trends(epi_trend, topic_id, dates, overlapping_dates, model):
    """
    Plots the trends of a topic in relation to epidemiological trends
    """
    
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    
    # Get representative words for the current topic
    topic_words = ", ".join([word for word, prob in model.get_topic(topic_id)])
    
    # Plot topic data
    fig.add_trace(
        go.Scatter(
            x=overlapping_dates,
            y=normalized_trends[topic_id][
                dates.get_loc(overlapping_dates[0]) : dates.get_loc(overlapping_dates[-1])
            ],
            marker_color="#47AEFF",
            mode="lines",
            name=f"{topic_labels[topic_id]} Topic",
        ),
        secondary_y=False,
    )
    
    # Plot epi data
    fig.add_trace(go.Scatter(x=overlapping_dates,
                             y=epi_df[epi_trend],
                        mode='lines',
                        name=f'{epi_trend.replace("_", " ").title()}',
                        marker_color="#FF0016"),
                  secondary_y=True)

    # Add figure title
    fig.update_layout(title_text=f"{epi_trend.replace('_', ' ').title()} Trend " \
                      f"and {topic_labels[topic_id]} Topic Trend <br>" \
                      f"<sup>Topic Words: {topic_words}</sup>")

    # Set x-axis title
    fig.update_xaxes(title_text="Date")

    # Set y-axes titles
    fig.update_yaxes(title_text=f"<b>{topic_labels[topic_id]} Topic Trend</b>", secondary_y=False, color="#47AEFF")
    fig.update_yaxes(title_text=f"<b>{epi_trend.replace('_', ' ').title()} Trend</b>", secondary_y=True, color="#FF0016")

    fig.show()
    fig.write_html(os.path.join(PLOTS_PATH, f"{epi_trend}_vs_{topic_labels[topic_id]}.html"))

In [615]:
plot_epi_topic_trends('new_cases', 1, dates, overlapping_dates, model)

In [616]:
plot_epi_topic_trends('daily_vaccinations', 1, dates, overlapping_dates, model)

In [617]:
plot_epi_topic_trends('has_implemented_schools_closure', 6, dates, overlapping_dates, model)

In [618]:
plot_epi_topic_trends('has_implemented_flights_suspension', 10, dates, overlapping_dates, model)

In [619]:
plot_epi_topic_trends('travel_risk_score', 10, dates, overlapping_dates, model)

In [620]:
plot_epi_topic_trends('stay_at_home_requirements', 10, dates, overlapping_dates, model)

In [621]:
plot_epi_topic_trends('daily_vaccinations', 12, dates, overlapping_dates, model)

In [622]:
plot_epi_topic_trends('mobility_google_index', 10, dates, overlapping_dates, model)

In [623]:
plot_epi_topic_trends('mobility_google_index', 12, dates, overlapping_dates, model)

In [624]:
plot_epi_topic_trends('new_deaths', 42, dates, overlapping_dates, model)

In [625]:
plot_epi_topic_trends('days_since_outbreak', 4, dates, overlapping_dates, model)

In [None]:
plot_epi_topic_trends('new_cases', 12, dates, overlapping_dates, model)