In [390]:
import pandas as pd
import numpy as np
from bertopic import BERTopic #expects list of strings
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
import os

In [108]:
with open("../../data/climate_classified/climate_classified_posts_2.json") as f:
    df = pd.read_json(f)
df.head()
df_og = df


In [None]:
#generate data for testing
data = df[df["label"] == "yes"]
data = data[data["score"]>= 0.99]
data_text = data["text"]


In [4]:
sub_dfs = []
for i in np.arange(10000,110000,10000):
    sub_dfs.append(df_og[i-10000:i])
len(sub_dfs)

10

In [None]:
filtered_sub_dfs = []
for i in np.arange(5000,30000,5000):
    filtered_sub_dfs.append(filtered_df[i-5000:i])
len(filtered_sub_dfs)

5

In [65]:
# List of posts (strings)
climate_posts = filtered_sub_dfs[0]["text"]
# climate_posts = climate_posts[:3200]

In [None]:
# Load your Bluesky posts (climate-labeled ones)
# List of 120,000 posts (strings)
# Use a lightweight embedding model (can try 'all-MiniLM-L6-v2' first)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

umap_model = UMAP(n_neighbors=8, n_components=7, min_dist=0.02, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=8, prediction_data=True)

# Create BERTopic instance
topic_model = BERTopic(
    embedding_model=embedding_model, 
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True)


In [381]:

# Fit the model
topics, probs = topic_model.fit_transform(filtered_df_text)

# Visualize topics
# topic_model.visualize_topics()

2025-04-22 04:29:38,941 - BERTopic - Embedding - Transforming documents to embeddings.


Batches: 100%|██████████| 26/26 [00:28<00:00,  1.08s/it]
2025-04-22 04:30:07,125 - BERTopic - Embedding - Completed ✓
2025-04-22 04:30:07,127 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 04:30:10,692 - BERTopic - Dimensionality - Completed ✓
2025-04-22 04:30:10,695 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 04:30:10,802 - BERTopic - Cluster - Completed ✓
2025-04-22 04:30:10,833 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 04:30:10,973 - BERTopic - Representation - Completed ✓


In [382]:
topic_model.get_topic_info().head(100)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,811,0_the_to_and_of,"[the, to, and, of, in, is, it, that, for, are]",[update I forgot to post the other day I ended...
1,1,10,1_gorgeous_artsky_comdp0989433404_captivatesnyt,"[gorgeous, artsky, comdp0989433404, captivates...",[TINY-MIGHTY-TURTLE! Gorgeous watercolors Kirk...


In [389]:
# Create a DataFrame to explore the topics easily
df = pd.DataFrame({"text": filtered_df_text, "topic": topics})

# Filter to see texts assigned to topic 3
topic_3_texts = df[df["topic"] == 1]

# Show the first few
pd.set_option('display.max_colwidth', None)
print(topic_3_texts["text"].head(10))
pd.reset_option('display.max_colwidth')

3859     TINY-MIGHTY-TURTLE! Gorgeous watercolors Kirkus SweetHuffP CaptivatesNYT OnomatopoeiaPW Courageous SFGate TranquilALA SerenadesNatGeo amazon.comdp0989433404 kidlit writerslift books art nature ocean kids WritingCommunity BookSky ArtSky illustration climate turtles
9071     TINY-MIGHTY-TURTLE! Gorgeous watercolors Kirkus SweetHuffP CaptivatesNYT OnomatopoeiaPW Courageous SFGate TranquilALA SerenadesNatGeo amazon.comdp0989433404 kidlit writerslift books art nature ocean kids WritingCommunity BookSky ArtSky illustration climate turtles
12834    TINY-MIGHTY-TURTLE! Gorgeous watercolors Kirkus SweetHuffP CaptivatesNYT OnomatopoeiaPW Courageous SFGate TranquilALA SerenadesNatGeo amazon.comdp0989433404 kidlit writerslift books art nature ocean kids WritingCommunity BookSky ArtSky illustration climate turtles
16633    TINY-MIGHTY-TURTLE! Gorgeous watercolors Kirkus SweetHuffP CaptivatesNYT OnomatopoeiaPW Courageous SFGate TranquilALA SerenadesNatGeo amazon.comdp0989433404 kidlit write

In [None]:
#can reduce the toppic size down to nr_topics
topic_model.reduce_topics(filtered_df_text, nr_topics=6)
topic_model.get_topic_info().head(100)

ValueError: All arrays must be of the same length

In [391]:
os.getcwd()

'c:\\Users\\abels\\ITU\\BachProj\\DS_BachelorProject_PH\\notebooks\\Classification'

In [402]:
data_path = "../../data/climate_classified"

def prepare_data(df):
    data = df[df["label"] == "yes"]
    data = data[data["score"]>= 0.99]
    data_text = data["text"]
    return data, data_text

#generates BERT instance
def generate_model(neighbors, components, distance, cluster):
    # Use a lightweight embedding model (can try 'all-MiniLM-L6-v2' first)
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    umap_model = UMAP(n_neighbors=neighbors, n_components=components, min_dist=distance, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=cluster, prediction_data=True)

    # Create BERTopic instance
    topic_model = BERTopic(
        embedding_model=embedding_model, 
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        verbose=True)
    return topic_model

def main():
    for filename in os.listdir(data_path):
        print(f"\nworking on: {filename}\n")
        df = pd.read_json(f"{data_path}/{filename}")
        filtered_df, filtered_text = prepare_data(df)
        topic_model = generate_model(neighbors=8,components=5,distance=0.01,cluster=8)
        topics, probs = topic_model.fit_transform(filtered_text)
        print(topic_model.get_topic_info())
    
if __name__ == "__main__":
    main()


working on: climate_classified_posts_113.json



2025-04-22 17:27:02,559 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 32/32 [00:29<00:00,  1.10it/s]
2025-04-22 17:27:31,654 - BERTopic - Embedding - Completed ✓
2025-04-22 17:27:31,655 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:27:35,993 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:27:35,994 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:27:36,072 - BERTopic - Cluster - Completed ✓
2025-04-22 17:27:36,076 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:27:36,211 - BERTopic - Representation - Completed ✓


    Topic  Count                                    Name  \
0      -1    380                        -1_the_and_to_in   
1       0     76                        0_of_to_the_they   
2       1     68                 1_climate_change_the_is   
3       2     42                2_climate_the_and_social   
4       3     35       3_weather_temperature_pressure_at   
5       4     31              4_credits_energy_tax_clean   
6       5     29                     5_coal_to_wind_cost   
7       6     25     6_shipping_carbon_international_the   
8       7     23                    7_apr_snow_precip_14   
9       8     22          8_fossil_gas_eusign2025_wemove   
10      9     21               9_could_vets_veteran_ohio   
11     10     20       10_schools_cars_transport_greener   
12     11     20         11_vote_canada_climate_election   
13     12     19                       12_ai_and_need_it   
14     13     18                   13_zero_net_bbc_steel   
15     14     15               14_china_

2025-04-22 17:27:39,794 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 32/32 [00:25<00:00,  1.26it/s]
2025-04-22 17:28:05,322 - BERTopic - Embedding - Completed ✓
2025-04-22 17:28:05,323 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:28:08,555 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:28:08,556 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:28:08,609 - BERTopic - Cluster - Completed ✓
2025-04-22 17:28:08,612 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:28:08,780 - BERTopic - Representation - Completed ✓


    Topic  Count                                           Name  \
0      -1    339                               -1_the_to_of_and   
1       0     56                      0_biodiversity_the_and_to   
2       1     53                        1_bsky_social_and_april   
3       2     44                       2_water_plastic_for_more   
4       3     42                      3_europe_climate_2024_the   
5       4     41                         4_climate_and_the_that   
6       5     30                                5_ai_it_and_use   
7       6     30                              6_co2_04_14_space   
8       7     27       7_trump_environmental_administration_epa   
9       8     25                          8_oil_prices_will_gas   
10      9     25                       9_weather_rain_0mph_wind   
11     10     25                          10_apr_snow_15_precip   
12     11     24                        11_electric_ev_evs_cars   
13     12     24        12_crisis_climate_bsky_apannierrunache

2025-04-22 17:28:11,595 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 17/17 [00:12<00:00,  1.40it/s]
2025-04-22 17:28:23,732 - BERTopic - Embedding - Completed ✓
2025-04-22 17:28:23,733 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:28:25,233 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:28:25,236 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:28:25,275 - BERTopic - Cluster - Completed ✓
2025-04-22 17:28:25,278 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:28:25,343 - BERTopic - Representation - Completed ✓


   Topic  Count                                             Name  \
0      0    531                                  0_the_to_and_of   
1      1     10  1_gorgeous_artsky_comdp0989433404_captivatesnyt   

                                      Representation  \
0     [the, to, and, of, is, in, that, it, for, are]   
1  [gorgeous, artsky, comdp0989433404, captivates...   

                                 Representative_Docs  
0  [Oh I agree. The old guard needs to go. But th...  
1  [TINY-MIGHTY-TURTLE! Gorgeous watercolors Kirk...  

working on: climate_classified_posts_20.json



2025-04-22 17:28:28,037 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 33/33 [00:28<00:00,  1.17it/s]
2025-04-22 17:28:56,260 - BERTopic - Embedding - Completed ✓
2025-04-22 17:28:56,261 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:29:01,458 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:29:01,460 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:29:01,570 - BERTopic - Cluster - Completed ✓
2025-04-22 17:29:01,573 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:29:01,655 - BERTopic - Representation - Completed ✓


   Topic  Count                        Name  \
0      0    958             0_the_to_and_of   
1      1     42      1_apr_snow_precip_high   
2      2     19  2_wind_weather_uv_humidity   
3      3     11       3_cars_our_season_car   

                                      Representation  \
0      [the, to, and, of, in, is, for, it, that, we]   
1  [apr, snow, precip, high, low, missing, iembot...   
2  [wind, weather, uv, humidity, rain, pressure, ...   
3  [cars, our, season, car, dry, quarters, poison...   

                                 Representative_Docs  
0  [Considering what is going on with the global ...  
1  [EUGENE OR Apr 7 Climate Report High 61 Low No...  
2  [Live weather for Yeovil on 8th Apr 2025 at 13...  
3  [CARS turn our living quarters to slowly killi...  

working on: climate_classified_posts_21.json



2025-04-22 17:29:04,441 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 30/30 [00:21<00:00,  1.38it/s]
2025-04-22 17:29:26,170 - BERTopic - Embedding - Completed ✓
2025-04-22 17:29:26,171 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:29:28,973 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:29:28,976 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:29:29,067 - BERTopic - Cluster - Completed ✓
2025-04-22 17:29:29,071 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:29:29,272 - BERTopic - Representation - Completed ✓


   Topic  Count                            Name  \
0     -1     17                 -1_04_07_or_the   
1      0    859                 0_the_to_and_of   
2      1     32  1_weather_temperature_the_wind   
3      2     19   2_3d_portfolioday_artist_work   
4      3     11   3_the_ocean_southern_feedback   
5      4     11           4_apr_snow_precip_low   

                                      Representation  \
0    [04, 07, or, the, en, are, co2, of, mass, your]   
1      [the, to, and, of, in, for, is, that, we, it]   
2  [weather, temperature, the, wind, is, april, p...   
3  [3d, portfolioday, artist, work, happy, and, e...   
4  [the, ocean, southern, feedback, ice, sea, in,...   
5  [apr, snow, precip, low, high, link, missing, ...   

                                 Representative_Docs  
0  [maybe but if everyone's COG goes up, then it ...  
1  [These arguments are deeply mirrored in the co...  
2  [Currently, at 1434, it's 14.9°C. Air pressure...  
3  [Hi Everyone, Happy Portfo

2025-04-22 17:29:32,272 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 20/20 [00:15<00:00,  1.29it/s]
2025-04-22 17:29:47,776 - BERTopic - Embedding - Completed ✓
2025-04-22 17:29:47,777 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:29:49,732 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:29:49,736 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:29:49,804 - BERTopic - Cluster - Completed ✓
2025-04-22 17:29:49,807 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:29:49,909 - BERTopic - Representation - Completed ✓


   Topic  Count                  Name  \
0      0    615       0_the_to_and_of   
1      1     12  1_apr_at_issues_fire   

                                      Representation  \
0  [the, to, and, of, in, is, it, that, for, clim...   
1  [apr, at, issues, fire, cdt, pm, link, 0000, i...   

                                 Representative_Docs  
0  [You say that like it’s a bad thing. They are ...  
1  [ICT issues Grassland Fire Danger RFD at Apr 6...  

working on: climate_classified_posts_40.json



2025-04-22 17:29:52,607 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 32/32 [00:26<00:00,  1.23it/s]
2025-04-22 17:30:18,670 - BERTopic - Embedding - Completed ✓
2025-04-22 17:30:18,675 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:30:23,470 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:30:23,473 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:30:23,536 - BERTopic - Cluster - Completed ✓
2025-04-22 17:30:23,540 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:30:23,668 - BERTopic - Representation - Completed ✓


    Topic  Count                                  Name  \
0      -1    374                      -1_the_and_to_of   
1       0     70             0_climate_that_change_the   
2       1     58            1_trump_laws_state_climate   
3       2     50            2_plastic_to_and_pollution   
4       3     44           3_years_warming_climate_the   
5       4     38              4_forests_to_forest_soil   
6       5     36       5_water_datacentres_driest_tech   
7       6     35                6_coal_clean_dirty_the   
8       7     35              7_coal_trump_industry_he   
9       8     31        8_energy_nuclear_renewable_the   
10      9     25               9_solar_magazine_pv_usa   
11     10     21             10_tariffs_energy_us_will   
12     11     19         11_weather_wind_pressure_0mph   
13     12     19             12_ucsusa_food_with_cente   
14     13     18                 13_methane_co2_the_o2   
15     14     17            14_hydrogen_acid_for_brake   
16     15     

2025-04-22 17:30:27,515 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 41/41 [00:27<00:00,  1.52it/s]
2025-04-22 17:30:54,559 - BERTopic - Embedding - Completed ✓
2025-04-22 17:30:54,560 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:30:59,202 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:30:59,203 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:30:59,323 - BERTopic - Cluster - Completed ✓
2025-04-22 17:30:59,327 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:30:59,457 - BERTopic - Representation - Completed ✓


    Topic  Count                               Name  \
0      -1    107            -1_snow_apr_precip_high   
1       0    819                    0_the_to_and_of   
2       1     70             1_snow_depth_apr_trace   
3       2     67         2_missing_link_precip_high   
4       3     67    3_apr_missing_iembot_additional   
5       4     30         4_airport_apr_missing_snow   
6       5     23              5_apr_precip_high_low   
7       6     22                6_snow_depth_apr_ne   
8       7     20              7_apr_precip_high_low   
9       8     20                8_snow_depth_wi_apr   
10      9     14                 9_pa_snow_27_depth   
11     10     12  10_wind_humidity_temperature_gust   
12     11     10       11_il_snow_depth_springfield   

                                       Representation  \
0   [snow, apr, precip, high, low, climate, depth,...   
1       [the, to, and, of, is, in, it, for, that, on]   
2   [snow, depth, apr, trace, precip, high, low, c...   
3

2025-04-22 17:31:02,919 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 28/28 [00:21<00:00,  1.33it/s]
2025-04-22 17:31:24,009 - BERTopic - Embedding - Completed ✓
2025-04-22 17:31:24,010 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:31:27,908 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:31:27,909 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:31:27,990 - BERTopic - Cluster - Completed ✓
2025-04-22 17:31:27,997 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:31:28,081 - BERTopic - Representation - Completed ✓


   Topic  Count                          Name  \
0      0    843               0_the_to_and_of   
1      1     17  1_energy_the_workers_employs   
2      2     11         2_apr_issues_link_spc   

                                      Representation  \
0  [the, to, and, of, in, is, it, for, climate, t...   
1  [energy, the, workers, employs, 35, in, percen...   
2  [apr, issues, link, spc, elevated, 1942z, fire...   

                                 Representative_Docs  
0  [Every four years, the US government produces ...  
1  [The energy industry employs 8.35 million work...  
2  [SPC issues Day 2 Elevated Fire Weather Risk a...  

working on: climate_classified_posts_67.json



2025-04-22 17:31:32,920 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 28/28 [00:31<00:00,  1.12s/it]
2025-04-22 17:32:04,251 - BERTopic - Embedding - Completed ✓
2025-04-22 17:32:04,252 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:32:07,072 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:32:07,073 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:32:07,118 - BERTopic - Cluster - Completed ✓
2025-04-22 17:32:07,122 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:32:07,223 - BERTopic - Representation - Completed ✓


    Topic  Count                                              Name  \
0      -1    262                                  -1_to_the_and_of   
1       0     90                    0_httpsarxiv_orgabs2504_of_the   
2       1     48                                  1_he_to_trump_is   
3       2     44                                2_save_now_our_the   
4       3     39                       3_the_promosky_great_nature   
5       4     37                              4_car_tesla_only_and   
6       5     28                            5_pollution_the_epa_in   
7       6     27              6_co2_greenhouse_polluters_emissions   
8       7     27                      7_weather_wind_rain_humidity   
9       8     25                              8_snow_apr_precip_10   
10      9     21            9_httpsarxiv_carbon_orgabs2504_07248v1   
11     10     21                                10_coal_no_is_than   
12     11     20                    11_climate_change_fear_history   
13     12     20    

2025-04-22 17:32:10,257 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 33/33 [00:21<00:00,  1.50it/s]
2025-04-22 17:32:32,241 - BERTopic - Embedding - Completed ✓
2025-04-22 17:32:32,242 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:32:36,596 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:32:36,597 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:32:36,656 - BERTopic - Cluster - Completed ✓
2025-04-22 17:32:36,661 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:32:36,758 - BERTopic - Representation - Completed ✓


   Topic  Count                                               Name  \
0     -1      1                        -1_eax_danger_rfd_grassland   
1      0    829                                    0_the_to_and_of   
2      1     69                            1_11_apr_missing_precip   
3      2     57                                2_snow_11_depth_apr   
4      3     25                              3_2000z_spc_risk_fire   
5      4     15  4_boycottpalmoil_boycott4wildlife_thegreenplan...   
6      5     10  5_keepitintheground_thegreenplanet_noplanetb_s...   
7      6     10                   6_snow_falls_international_depth   
8      7      9                        7_ozone_alert_april_quality   
9      8      9                     8_snow_apr_falls_international   

                                      Representation  \
0  [eax, danger, rfd, grassland, 320, pm, cdt, fi...   
1  [the, to, and, of, is, it, in, for, that, clim...   
2  [11, apr, missing, precip, high, low, snow, cl...   
3  [s

2025-04-22 17:32:39,914 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 26/26 [00:14<00:00,  1.74it/s]
2025-04-22 17:32:54,847 - BERTopic - Embedding - Completed ✓
2025-04-22 17:32:54,848 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:32:57,253 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:32:57,254 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:32:57,335 - BERTopic - Cluster - Completed ✓
2025-04-22 17:32:57,338 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:32:57,447 - BERTopic - Representation - Completed ✓


   Topic  Count                             Name  \
0      0    519                  0_the_to_of_and   
1      1    128         1_apr_missing_precip_low   
2      2    107        2_airport_apr_snow_precip   
3      3     60          3_snow_depth_apr_precip   
4      4      9  4_issues_excessive_043z_outlook   

                                      Representation  \
0      [the, to, of, and, is, in, it, that, for, on]   
1  [apr, missing, precip, low, high, snow, climat...   
2  [airport, apr, snow, precip, low, high, missin...   
3  [snow, depth, apr, precip, low, high, climate,...   
4  [issues, excessive, 043z, outlook, wpc, rainfa...   

                                 Representative_Docs  
0  [This is the future Pierre Poilievre, you're t...  
1  [BURNS OR Apr 6 Climate Report High 69 Low 26 ...  
2  [STOCKTON AIRPORT CA Apr 6 Climate Report High...  
3  [LAREDO Apr 6 Climate Report High 69 Low 50 Pr...  
4  [WPC issues Day 1 Moderate Risk Excessive Rain...  

working on: climate

2025-04-22 17:33:00,157 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 28/28 [00:16<00:00,  1.68it/s]
2025-04-22 17:33:16,866 - BERTopic - Embedding - Completed ✓
2025-04-22 17:33:16,867 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:33:20,185 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:33:20,186 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:33:20,236 - BERTopic - Cluster - Completed ✓
2025-04-22 17:33:20,238 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:33:20,338 - BERTopic - Representation - Completed ✓


    Topic  Count                              Name  \
0      -1     19                 -1_link_pm_apr_12   
1       0    529                   0_the_to_and_of   
2       1    108             1_airport_apr_12_snow   
3       2     64               2_snow_depth_apr_12   
4       3     60        3_link_missing_precip_high   
5       4     41                 4_apr_sun_13_0000   
6       5     24                 5_sat_12_apr_0000   
7       6     14        6_wind_is_pressure_weather   
8       7     11       7_trace_link_missing_precip   
9       8     11             8_1473_sol_mean_local   
10      9     10  9_avoidance_hazard_camera_hazcam   

                                       Representation  \
0   [link, pm, apr, 12, issues, az, grassland, mes...   
1      [the, to, and, of, it, in, is, for, that, you]   
2   [airport, apr, 12, snow, precip, high, low, mi...   
3   [snow, depth, apr, 12, precip, high, low, clim...   
4   [link, missing, precip, high, low, 12, snow, a...   
5   [apr,

2025-04-22 17:33:22,337 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 3/3 [00:02<00:00,  1.34it/s]
2025-04-22 17:33:24,581 - BERTopic - Embedding - Completed ✓
2025-04-22 17:33:24,582 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 17:33:24,831 - BERTopic - Dimensionality - Completed ✓
2025-04-22 17:33:24,832 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 17:33:24,836 - BERTopic - Cluster - Completed ✓
2025-04-22 17:33:24,839 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 17:33:24,869 - BERTopic - Representation - Completed ✓


   Topic  Count                    Name  \
0     -1     27   -1_the_to_climate_and   
1      0     47         0_the_to_and_in   
2      1     10  1_you_to_huge_negative   

                                      Representation  \
0  [the, to, climate, and, in, about, with, that,...   
1      [the, to, and, in, of, is, it, that, for, we]   
2  [you, to, huge, negative, carbon, my, no, for,...   

                                 Representative_Docs  
0  [Climate Scale will attend the WindEurope Annu...  
1  [This is a good point by sabrinafernandes.bsky...  
2  [I make no apologies for my huge carbon footpr...  


In [None]:
original_df topics


array([1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.     