In [1]:
import pandas as pd
import os
import json

In [2]:
# Lift pandas' display limits so frames are shown in full while inspecting
# them: no cap on rows or columns, no fixed line width, no cell truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# A single call is sufficient: None is the value that disables truncation.
pd.set_option('display.max_colwidth', None)

In [3]:
# Put the pandas display settings changed above back to their defaults.
for option_name in ("max_rows", "max_columns", "width", "max_colwidth"):
    pd.reset_option(f"display.{option_name}")


In [4]:
# Load the complete BERTopic output. The file is read as JSON Lines
# (one record per line, hence lines=True).
bertopic_result_file = "../../data/BERTopicResult/BERTopic_final_result_clean.json"
df_whole = pd.read_json(bertopic_result_file, lines=True)
df_whole.shape

(82269, 3)

In [5]:
# Directory with the GPT-labelled samples. Every *.json file in it is read
# into a frame whose rows carry a `topic` value and a `samples` mapping; the
# mapping must contain a comma-separated `gpt_label` string.
path = "../../data/llm_subtopic/multi_label/larger_sample_size"
all_rows = []

# os.listdir returns names in arbitrary order; sorting makes the row order
# (and everything derived from it further down) identical on every run.
for filename in sorted(os.listdir(path)):
    if not filename.endswith(".json"):
        continue
    df = pd.read_json(f"{path}/{filename}")

    # Tag a copy of each sample with its topic rather than mutating the
    # mapping that is still stored inside `df`.
    all_rows.extend(
        {**sample, "topic": topic}
        for topic, sample in zip(df["topic"], df["samples"])
    )

df_prior = pd.DataFrame(all_rows)

# Split the comma-separated label prediction into a list of stripped labels.
df_prior["gpt_label"] = df_prior["gpt_label"].apply(
    lambda labels: [label.strip() for label in labels.split(",")]
)

# One row per (sample, label) pair; df_prior keeps one row per sample.
df_labels = df_prior.explode("gpt_label").reset_index(drop=True)

df_labels.shape

(24755, 4)

In [6]:
# Preview the first five labelled samples (one row per sample, labels as a list).
df_prior.head()

Unnamed: 0,cid,text,gpt_label,topic
0,bafyreihxfqbtq23vwj2mldk6huwkovfyq7yzibzf7wsv3...,President Trump Trying To Remove 150 Years Of ...,"[Politics, Activism]",0
1,bafyreieqygyajo3rsx4ntye5jj4kyxxl33rzfhcqqjki6...,2 Fix the real shit that is broken. Like minim...,"[Activism, Politics]",0
2,bafyreiasdqzid6rvbkvwzkeuwrwfqyxbdno4n6kom5gii...,Today is 7years after Latour wrote his book. C...,"[Politics, Disaster]",0
3,bafyreid7ok2wsoe3wlzswdf246drcasvrirux46uaoabv...,Deb Fischer might be one of the stupidest peop...,"[Politics, Activism]",0
4,bafyreiexy54ja7upnu4ahbxj4ggjdhiet3e3iglc6y4ka...,Join me in following myzerocarbon.org! The cur...,"[Activism, Politics]",0


In [7]:
# Size of the exploded frame (one row per sample/label pair). Only the last
# expression of a cell is rendered, so this is the sole statement here.
df_labels.shape

(24755, 4)

In [8]:
# Preview the first five rows of the full BERTopic output.
df_whole.head(5)

Unnamed: 0,cid,text,topic
0,bafyreihmqtib6dm76dgrnpiz5gdbzzcnskqpjovbi3owy...,Thank you hayleysmith.bsky.social latimes.com ...,7
1,bafyreic5q4z3jyudbkjj24t433h3giy6oicy6rv2ih4vz...,Can't wait for his demise. Never ever have I w...,0
2,bafyreibscp2vq5za5kbs2hw64jxpsu7of3jst2mldlwgl...,What aspect of it is art But also please answe...,3
3,bafyreiglnfw2lfy6msf7n3cx4bouno75znasu4hrgnnrt...,ClimateChanger denier at my Hurricanes demo st...,0
4,bafyreiaeny4uerr5osifz7fjsqwivnv7n7d7dpw5svja2...,Day 56 of trying to work with nature to reduce...,9


In [9]:
# Assign predicted label(s) to every topic from the GPT-labelled samples.
#
# For each topic found in df_labels:
#   * count how many labelled rows carry each GPT label;
#   * keep every label whose count exceeds half of the topic's samples, or,
#     when no label does, the single most frequent label;
#   * for each kept label, emit a copy of the topic's rows tagged with
#     `predicted_label`.
#
# Results:
#   df_final     - rows of df_whole, one copy per (topic, kept label), plus
#                  `weight` = label count / number of labelled samples
#   df_gpt_label - rows of df_prior, one copy per (topic, kept label)
df_gpt_label_multi = pd.DataFrame()  # defined for later use; not filled here

# Collect the pieces and concatenate once at the end; growing a frame with
# pd.concat inside the loop re-copies all accumulated rows every iteration.
final_parts = []
gpt_label_parts = []

for topic in df_labels.topic.unique():
    topic_samples = df_prior[df_prior.topic == topic]
    n_samples = topic_samples.shape[0]
    n = int(n_samples / 2)  # a label is a majority label when its count > n
    df_topic = df_labels[df_labels.topic == topic]
    print(f"topic: {topic}")
    print(f"n: {n}")

    # label_counts stays a top-level name: a later cell inspects the frame
    # left over from the last iteration.
    label_counts = df_topic.groupby(["topic", "gpt_label"]).size().reset_index(name="count")
    filtered = label_counts[label_counts["count"] > n]

    if not filtered.empty:
        chosen_labels = filtered["gpt_label"].tolist()
    else:
        # No majority label: fall back to the most frequent one.
        top_label = label_counts.sort_values("count", ascending=False).iloc[0]["gpt_label"]
        chosen_labels = [top_label]

    counts_by_label = label_counts.set_index("gpt_label")["count"]
    topic_posts = df_whole[df_whole["topic"] == topic]

    # Both cases share this code path, so the label written to df_gpt_label
    # is always the one chosen for this topic (never a value left over from
    # a previous topic).
    for label in chosen_labels:
        df_add = topic_posts.copy()
        df_add["predicted_label"] = label
        # Divide by the actual sample count: int(n_samples / 2) * 2 drops one
        # sample for odd sizes and is zero for a single-sample topic.
        df_add["weight"] = round(counts_by_label.loc[label] / n_samples, 3)
        final_parts.append(df_add)

        # Same tagging on the labelled samples themselves (non-repeating rows).
        df_gpt_temp = topic_samples.copy()
        df_gpt_temp["predicted_label"] = label
        gpt_label_parts.append(df_gpt_temp)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_final = pd.concat(final_parts, ignore_index=True) if final_parts else pd.DataFrame()
df_gpt_label = pd.concat(gpt_label_parts, ignore_index=True) if gpt_label_parts else pd.DataFrame()



topic: 0
n: 150
topic: 1
n: 150
topic: 2
n: 150
topic: 3
n: 150
topic: 4
n: 150
topic: 5
n: 150
topic: 6
n: 150
topic: 7
n: 150
topic: 8
n: 150
topic: 9
n: 150
topic: 10
n: 150
topic: 11
n: 150
topic: 12
n: 150
topic: 13
n: 150
topic: 14
n: 150
topic: 15
n: 150
topic: 16
n: 150
topic: 17
n: 150
topic: 18
n: 147
topic: 19
n: 145
topic: 20
n: 144
topic: 21
n: 143
topic: 22
n: 142
topic: 23
n: 140
topic: 24
n: 77
topic: 25
n: 77
topic: 26
n: 74
topic: 27
n: 73
topic: 28
n: 72
topic: 29
n: 70
topic: 30
n: 70
topic: 31
n: 35
topic: 32
n: 35
topic: 33
n: 34
topic: 34
n: 32
topic: 35
n: 32
topic: 36
n: 32
topic: 37
n: 31
topic: 38
n: 31
topic: 39
n: 31
topic: 40
n: 31
topic: 41
n: 30
topic: 42
n: 30
topic: 43
n: 30
topic: 44
n: 30
topic: 45
n: 30
topic: 46
n: 29
topic: 47
n: 29
topic: 48
n: 29
topic: 1000
n: 150
topic: 1001
n: 150
topic: 1002
n: 149
topic: 1003
n: 140
topic: 1004
n: 76
topic: 1005
n: 76
topic: 1006
n: 76
topic: 1007
n: 73
topic: 1008
n: 73
topic: 1009
n: 69
topic: 1010
n: 35


In [10]:
# Size of the assembled predictions. A topic's df_whole rows are copied once
# per label kept for that topic, so the row count can exceed len(df_whole).
df_final.shape

(94412, 5)

In [11]:
# Spot-check a random selection of predictions. The fixed random_state makes
# the displayed rows the same on every run.
df_final.sample(100, random_state=42)

Unnamed: 0,cid,text,topic,predicted_label,weight
23770,bafyreia2tcyuii4zd2ui33extkarmuzlhbbtbo2cct5cf...,Living in Western NC and having had a devastat...,4,Politics,0.913
89658,bafyreia26tyckwcsxcwnlvwebddzwks7dyzqnlacxw4sk...,"Global leaders, scientists, policymakers devel...",1017,Renewable,0.621
37676,bafyreihmoydn7m4qdlaan4ftrvdb6s4njboo46c2impw5...,pff Carls Jr is better than Hardees. Also is t...,8,Renewable,0.903
36034,bafyreid3peltyrjukaglct6po2qn2fv5cct4xqbizc3xj...,"evenings after about 7 or 8 ... Also, if you c...",8,Electricity,0.510
17869,bafyreihoo4zi7xeb2tdtys6kzmgaalcwc5xtjbbqqi6l6...,As Seen On Shark Tank an alternative fuel for ...,2,Fossil,0.873
...,...,...,...,...,...
40862,bafyreicndhz74kgaiunjvvttk46hucqx5zunv4nwv6bzp...,Join us for the International Plastic Pellet C...,9,Waste,0.953
36990,bafyreihpkddqkyaknwj4kl3hxdebtrdyi35ng2c4wjlom...,you've likely seen rolf's solarsiedlung - outs...,8,Renewable,0.903
71206,bafyreieim2kqut6h2czzmlcxighy2tdtw4in5s3ld3gib...,This is a time to accelerate deployment of cle...,1000,Electricity,0.527
66554,bafyreihya3o27btgbhsbminscejrl74ygkwjjsmfq2w4e...,pv-magazine-usa.com20250416t... Texas HAS been...,36,Politics,0.625


In [12]:
# Results are written to a "result" sub-directory of the input directory.
save_path = path + "/result"

In [13]:
# Create the output directory if it is missing, then pickle the
# per-post predictions.
os.makedirs(save_path, exist_ok=True)
topic_predictions_file = save_path + "/topic_predictions_clean.pkl"
df_final.to_pickle(topic_predictions_file)

In [14]:
# Pickle the labelled samples together with their predicted label.
gpt_predictions_file = save_path + "/gpt_topic_predictions_clean.pkl"
df_gpt_label.to_pickle(gpt_predictions_file)

In [15]:
# Spot-check the labelled samples against their topic-level predicted label.
# Nothing is saved here; the fixed random_state keeps the displayed rows the
# same on every run.
df_gpt_label.sample(100, random_state=42)

Unnamed: 0,cid,text,gpt_label,topic,predicted_label
14966,bafyreih3aisr5a745cbbpcmalzoyrqedtgxqfuhod5dpf...,"New on the UKSIF Opinion Page Lyna Merrar, ESG...","[Politics, Renewable]",1040,Politics
9454,bafyreibz47wmztj7gyk36q3pyepnzg7ykojbjd2g2godc...,Play them with the pitch still under the stadi...,"[Construction, Electricity]",38,Politics
9799,bafyreiabkywt62xbvx5lexlq7yx4lmo4xe4deb4y5ytfh...,Read some of it and it seems like by far the s...,[Nature],42,Nature
2621,bafyreibfp56mwaht74x33jlnuqk66usydjam2o5gcs6pr...,The benefits of agri-PV span from crop protect...,"[Agriculture, Renewable, Electricity]",8,Electricity
5254,bafyreidnnbfcf2sownb2rajuq7mnclnty6r7dspapn6f2...,Revealed Big tech’s new datacentres will take ...,"[Disaster, Electricity]",16,Waste
...,...,...,...,...,...
4210,bafyreibcypbxsnjkqvkbnew4lphx3ed6vkv46edbh5qv7...,My family spent money on rain barrels right be...,"[Renewable, Agriculture]",13,Politics
8898,bafyreifu7lea6sw4vctturhzlyudji7wsefe3lxero6s3...,So is he saying that Musk was exposed to an en...,"[Politics, Nature]",33,Politics
10870,bafyreifan2puhgcf53b5mjipavmkda5jrv3mi4v56mtl4...,Intermittent power cuts over the past hour an ...,"[Electricity, Lifestyle]",1000,Renewable
9094,,why r they putting lead in the toothpaste jesu...,"[Waste, Lifestyle]",35,Waste


In [16]:
# EDA: sanity checks on the assembled predictions.

# Basic structure and null counts. DataFrame.info() prints its report itself
# and returns None, so it is called directly; wrapping it in print() would
# add a stray "None" line to the output.
df_final.info()
print("\nMissing values:\n", df_final.isnull().sum())

predicted = df_final['predicted_label']

# Check if 'predicted_label' is missing or contains unexpected values
print("\nNumber of rows where predicted_label is missing:")
print(predicted.isnull().sum())

# Distinct values present in predicted_label
print("\nUnique predicted labels:")
print(predicted.unique())

# Frequency of predicted labels (NaN counted as its own bucket)
print("\nPredicted label counts:")
print(predicted.value_counts(dropna=False))

# Look for rows with empty strings or suspicious content
print("\nRows where predicted_label is empty string:")
print(df_final[predicted == ''])

print("\nRows where predicted_label is not a string (e.g., int or None):")
is_string = predicted.apply(lambda value: isinstance(value, str))
print(df_final[~is_string])

# Duplicates, both as full rows and by cid alone. Repeated cids are expected
# whenever a topic received more than one predicted label.
print("\nNumber of duplicate rows:", df_final.duplicated().sum())
print("Number of duplicate cids:", df_final['cid'].duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94412 entries, 0 to 94411
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cid              94143 non-null  object 
 1   text             94412 non-null  object 
 2   topic            94412 non-null  int64  
 3   predicted_label  94412 non-null  object 
 4   weight           94412 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 3.6+ MB
None

Missing values:
 cid                269
text                 0
topic                0
predicted_label      0
weight               0
dtype: int64

Number of rows where predicted_label is missing:
0

Unique predicted labels:
['Politics' 'Fossil' 'Renewable' 'Transportation' 'Electricity' 'Waste'
 'Agriculture' 'Nature' 'Weather' 'Activism' 'Disaster' 'Construction']

Predicted label counts:
predicted_label
Politics          41930
Renewable         17372
Fossil             8264
Electricity        464

In [17]:
# Inspect the labelled samples of topic 1012 next to their predicted label.
df_gpt_label.loc[df_gpt_label["topic"] == 1012]

Unnamed: 0,cid,text,gpt_label,topic,predicted_label
13403,bafyreignwpcek2msay5sldapjhltzx3q4e245qecue7bh...,Brazil allocates USD 1.9 billion to climate fu...,"[Politics, Activism]",1012,Nature
13404,bafyreihmjnvsa3wmi3crnt25gswwpcfywmd342ovzeywg...,"""From now on, we will be united. Declaring to ...","[Activism, Politics]",1012,Nature
13405,bafyreiedj32ufzp675s6viffuf5osmzbi3xnrhuggsekt...,"Bibles, bullets and beef Amazon cowboy culture...","[Nature, Agriculture]",1012,Nature
13406,bafyreicm2dey5pxo3ilw4tu2tgx67eiifqbr3djlddw54...,Today we had the presence of the Minister of E...,"[Politics, Renewable]",1012,Nature
13407,bafyreiaopnozbhbiqgh55b4vaaogevtlrrsp5wl7xs4ov...,The RBC AGM has come to a close. How do we fee...,"[Activism, Politics]",1012,Nature
...,...,...,...,...,...
13468,bafyreiebou242i652kugzpodav24yfkbzzdipfnjgeq2m...,A new study has found that rivers across Brazi...,"[Nature, Agriculture]",1012,Nature
13469,bafyreiaku726gikbhizfpd5wi2ghmky7koipzjq3ax7v5...,Oil Exploration in the Amazon A six-part serie...,"[Disaster, Activism, Nature]",1012,Nature
13470,bafyreidgsouqa6ru2vetfxnv33xxtsss7qrs4ftgvdsbz...,Through renewables got to keep fossil fuels in...,"[Activism, Renewable]",1012,Nature
13471,bafyreifumkcf7fauzw2itignp5hkgb3usvw2tejgozf2o...,BakuBelém Yesterday’s briefing to diplo missio...,"[Politics, Renewable]",1012,Nature


In [18]:
# NOTE: label_counts is whatever the per-topic counting loop left behind on
# its final iteration, so this only inspects that one topic's label counts.
is_frequent = label_counts["count"] > 10
filtered = label_counts.loc[is_frequent]
filtered

Unnamed: 0,topic,gpt_label,count
5,1048,Politics,21


In [19]:
# First rows of the predictions for topic 3.
df_final.loc[df_final["topic"] == 3].head()

Unnamed: 0,cid,text,topic,predicted_label,weight
19299,bafyreibscp2vq5za5kbs2hw64jxpsu7of3jst2mldlwgl...,What aspect of it is art But also please answe...,3,Renewable,0.39
19300,bafyreif4msuuwitl722q43xeifuyxf333sd255q6hubmp...,Must direct énergies where they will not be wa...,3,Renewable,0.39
19301,bafyreib2lqwzsokcmyvwyzoqu5blskixwqe5clpzzjle6...,Pro-AI gets you muted here. It is both the too...,3,Renewable,0.39
19302,bafyreia6s4f6uvdhrw7tl4o2uykgvpkmnfpgy62lpqkxj...,"Ya, here where they are building data centers ...",3,Renewable,0.39
19303,bafyreianagvlrmveqo5p7syjxo4hpa73h22d36u7z2fkb...,Why do you support computer generated artAI Do...,3,Renewable,0.39
