In [240]:
import pandas as pd
import os
import json

In [241]:
# Lift pandas' display limits (rows, columns, line width, cell text width)
# by setting each option to None, once per option.
for option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option, None)

In [242]:
# Put the four display options back to their pandas defaults.
for option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.reset_option(option)


In [243]:
# Read the complete BERTopic output (JSON Lines: one record per line)
# and show its dimensions.
bertopic_result_file = "../../data/BERTopicResult/BERTopic_final_result.json"
df_whole = pd.read_json(bertopic_result_file, lines=True)
df_whole.shape

(90213, 3)

In [244]:
path = "../../data/llm_subtopic/multi_label/larger_sample_size"

# Collect one record per labelled sample from every .json file in `path`.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# give the same row (and therefore topic) order on every run.
all_rows = []
for filename in sorted(os.listdir(path)):
    if not filename.endswith(".json"):
        continue
    df = pd.read_json(f"{path}/{filename}")

    # Each row pairs a topic id with one sample dict; build a new dict that
    # carries the topic alongside the sample's own fields instead of mutating
    # the dict stored inside `df`.
    for topic, sample in zip(df["topic"], df["samples"]):
        all_rows.append({**sample, "topic": topic})
df_prior = pd.DataFrame(all_rows)

# Split the comma-separated label predictions into a list of stripped strings.
df_prior["gpt_label"] = df_prior["gpt_label"].apply(
    lambda x: [label.strip() for label in x.split(",")]
)

# One row per (sample, label) pair.
df_labels = df_prior.explode("gpt_label").reset_index(drop=True)

df_labels.shape

(24755, 4)

In [245]:
# Preview the labelled samples: one row per sample, with `gpt_label` holding a list of labels.
df_prior.head(5)

Unnamed: 0,cid,text,gpt_label,topic
0,bafyreihxfqbtq23vwj2mldk6huwkovfyq7yzibzf7wsv3...,President Trump Trying To Remove 150 Years Of ...,"[Politics, Activism]",0
1,bafyreieqygyajo3rsx4ntye5jj4kyxxl33rzfhcqqjki6...,2 Fix the real shit that is broken. Like minim...,"[Activism, Politics]",0
2,bafyreiasdqzid6rvbkvwzkeuwrwfqyxbdno4n6kom5gii...,Today is 7years after Latour wrote his book. C...,"[Politics, Disaster]",0
3,bafyreid7ok2wsoe3wlzswdf246drcasvrirux46uaoabv...,Deb Fischer might be one of the stupidest peop...,"[Politics, Activism]",0
4,bafyreiexy54ja7upnu4ahbxj4ggjdhiet3e3iglc6y4ka...,Join me in following myzerocarbon.org! The cur...,"[Activism, Politics]",0


In [246]:
# Inspect the exploded (one label per row) frame.
# NOTE(review): only the last expression of a cell is rendered, so the result of
# head() is discarded here and only the shape is shown — confirm whether the
# preview was meant to be displayed as well.
df_labels.head()
df_labels.shape

(24755, 4)

In [247]:
# Preview the full BERTopic output (columns: cid, text, topic).
df_whole.head()

Unnamed: 0,cid,text,topic
0,bafyreihmqtib6dm76dgrnpiz5gdbzzcnskqpjovbi3owy...,Thank you hayleysmith.bsky.social latimes.com ...,7
1,bafyreic5q4z3jyudbkjj24t433h3giy6oicy6rv2ih4vz...,Can't wait for his demise. Never ever have I w...,0
2,bafyreibscp2vq5za5kbs2hw64jxpsu7of3jst2mldlwgl...,What aspect of it is art But also please answe...,3
3,bafyreiglnfw2lfy6msf7n3cx4bouno75znasu4hrgnnrt...,ClimateChanger denier at my Hurricanes demo st...,0
4,bafyreiaeny4uerr5osifz7fjsqwivnv7n7d7dpw5svja2...,Day 56 of trying to work with nature to reduce...,9


In [248]:
# Propagate the GPT labels of the sampled posts to every post of the same topic.
#
# For each topic:
#   * count how often each label was predicted among the topic's samples;
#   * keep every label predicted for more than half of the samples, or, when no
#     label reaches a majority, fall back to the single most frequent label;
#   * for each kept label, emit all posts of the topic from `df_whole` with
#     `predicted_label` and `weight` (share of the topic's samples carrying the
#     label), and emit the topic's samples from `df_prior` with `predicted_label`.
#
# Results: `df_final` (all posts) and `df_gpt_label` (labelled samples only).
# A topic with several majority labels contributes its rows once per label.
final_frames = []
gpt_frames = []

for topic in df_labels.topic.unique():
    topic_samples = df_prior[df_prior.topic == topic]
    n_samples = topic_samples.shape[0]
    n = n_samples // 2  # a label needs strictly more than n occurrences for a majority
    print(f"topic: {topic}")
    print(f"n: {n}")

    label_counts = (
        df_labels[df_labels.topic == topic]
        .groupby(["topic", "gpt_label"])
        .size()
        .reset_index(name="count")
    )
    filtered = label_counts[label_counts["count"] > n]

    if filtered.empty:
        # No majority label: use the most frequent one.
        chosen = label_counts.sort_values("count", ascending=False).iloc[[0]]
    else:
        chosen = filtered

    topic_posts = df_whole[df_whole["topic"] == topic]
    for label, label_count in zip(chosen["gpt_label"], chosen["count"]):
        df_add = topic_posts.copy()
        df_add["predicted_label"] = label
        # Divide by the real sample count; twice the halved count loses one
        # sample for odd-sized topics and is zero for a single-sample topic.
        df_add["weight"] = round(label_count / n_samples, 3)
        final_frames.append(df_add)

        # The same label is recorded for the samples in both the majority and
        # the fallback case.
        df_gpt_temp = topic_samples.copy()
        df_gpt_temp["predicted_label"] = label
        gpt_frames.append(df_gpt_temp)

# Concatenate once at the end instead of growing the frames inside the loop.
df_final = pd.concat(final_frames, ignore_index=True) if final_frames else pd.DataFrame()
df_gpt_label = pd.concat(gpt_frames, ignore_index=True) if gpt_frames else pd.DataFrame()



topic: 0
n: 150
topic: 1
n: 150
topic: 2
n: 150
topic: 3
n: 150
topic: 4
n: 150
topic: 5
n: 150
topic: 6
n: 150
topic: 7
n: 150
topic: 8
n: 150
topic: 9
n: 150
topic: 10
n: 150
topic: 11
n: 150
topic: 12
n: 150
topic: 13
n: 150
topic: 14
n: 150
topic: 15
n: 150
topic: 16
n: 150
topic: 17
n: 150
topic: 18
n: 147
topic: 19
n: 145
topic: 20
n: 144
topic: 21
n: 143
topic: 22
n: 142
topic: 23
n: 140
topic: 24
n: 77
topic: 25
n: 77
topic: 26
n: 74
topic: 27
n: 73
topic: 28
n: 72
topic: 29
n: 70
topic: 30
n: 70
topic: 31
n: 35
topic: 32
n: 35
topic: 33
n: 34
topic: 34
n: 32
topic: 35
n: 32
topic: 36
n: 32
topic: 37
n: 31
topic: 38
n: 31
topic: 39
n: 31
topic: 40
n: 31
topic: 41
n: 30
topic: 42
n: 30
topic: 43
n: 30
topic: 44
n: 30
topic: 45
n: 30
topic: 46
n: 29
topic: 47
n: 29
topic: 48
n: 29
topic: 1000
n: 150
topic: 1001
n: 150
topic: 1002
n: 149
topic: 1003
n: 140
topic: 1004
n: 76
topic: 1005
n: 76
topic: 1006
n: 76
topic: 1007
n: 73
topic: 1008
n: 73
topic: 1009
n: 69
topic: 1010
n: 35


In [249]:
# Dimensions of the prediction frame. A topic's posts are appended once per
# label assigned to that topic, so the row count can exceed that of df_whole.
df_final.shape

(102356, 5)

In [250]:
# Spot-check a random selection of predictions; the fixed seed makes the
# displayed rows reproducible across runs.
df_final.sample(100, random_state=42)

Unnamed: 0,cid,text,topic,predicted_label,weight
64454,bafyreigurhp44mgycpq2k54kdzwttzpwckyqtvaqavhj2...,"France Lowers 2030 Hydrogen Targets, Maintains...",25,Renewable,0.870
7582,bafyreiats7aiq5kyutsx23mklz6nbdoupuxbcbemroglx...,"So, are their only remaining policies nuclear ...",1,Politics,0.847
61251,bafyreibb6rgdzjyycotsyhnztr4hukrfcamisxbbm6tri...,Whole ecosystems decimated’ by huge rise in UK...,22,Disaster,0.581
9211,bafyreiajyjdcj3eklyfb2avvgbzcj5t527nztnxkaxyu5...,"No joke. Climate impacts ""not a punchline"" Pal...",1,Politics,0.847
49657,bafyreifaau4f4sxc3qamnqmwlpdadbp6ph3dfpibtmvoo...,Trump his chucklefvcks want to reshore manufac...,13,Politics,0.847
...,...,...,...,...,...
52666,bafyreid566a335u4dh37z2waulapucuhv7iruos53qouu...,Operating a heat pump in Massachusetts is abou...,15,Electricity,0.643
90673,bafyreiax3e6v4azz74fw35xsmc6hvy24gf2kbethimz4r...,"Also, if we would add the ecological dimension...",1007,Politics,0.521
80312,bafyreifczfeqca6wbbnc6czlpc5eorkkqxxghrkwavtx2...,"Occam’s razor If nuclear is slow, expensive, w...",1000,Renewable,0.737
38250,bafyreiepyaqrudv7at4fnamlblhjmn34sjiofsmhqwjsz...,ekurutepe.com Looking at apartments and Solar ...,8,Renewable,0.903


In [251]:
# Output directory for the pickled results: a "result" folder under the sample directory `path`.
save_path = f"{path}/result"

In [None]:
# Create the output directory if needed, then pickle the per-post predictions.
os.makedirs(save_path, exist_ok=True)
topic_predictions_file = f"{save_path}/topic_predictions.pkl"
df_final.to_pickle(topic_predictions_file)

In [None]:
# Pickle the labelled samples together with their topic-level predicted label.
# This cell does not create `save_path` itself; the directory must already exist.
df_gpt_label.to_pickle(f"{save_path}/gpt_topic_predictions.pkl")

In [254]:
# Preview the labelled samples with their topic-level predicted label (nothing is saved here).
df_gpt_label.head()

Unnamed: 0,cid,text,gpt_label,topic,predicted_label
0,bafyreihxfqbtq23vwj2mldk6huwkovfyq7yzibzf7wsv3...,President Trump Trying To Remove 150 Years Of ...,"[Politics, Activism]",0,Politics
1,bafyreieqygyajo3rsx4ntye5jj4kyxxl33rzfhcqqjki6...,2 Fix the real shit that is broken. Like minim...,"[Activism, Politics]",0,Politics
2,bafyreiasdqzid6rvbkvwzkeuwrwfqyxbdno4n6kom5gii...,Today is 7years after Latour wrote his book. C...,"[Politics, Disaster]",0,Politics
3,bafyreid7ok2wsoe3wlzswdf246drcasvrirux46uaoabv...,Deb Fischer might be one of the stupidest peop...,"[Politics, Activism]",0,Politics
4,bafyreiexy54ja7upnu4ahbxj4ggjdhiet3e3iglc6y4ka...,Join me in following myzerocarbon.org! The cur...,"[Activism, Politics]",0,Politics


In [255]:
# EDA: sanity checks on the assembled prediction frame.

# Basic info and nulls. DataFrame.info() prints its own report and returns
# None, so it is called directly; wrapping it in print() emits a stray "None".
df_final.info()
print("\nMissing values:\n", df_final.isnull().sum())

# Check if 'predicted_label' is missing or contains unexpected values
print("\nNumber of rows where predicted_label is missing:")
print(df_final['predicted_label'].isnull().sum())

# Check types of values in predicted_label
print("\nUnique predicted labels:")
print(df_final['predicted_label'].unique())

# Frequency of predicted labels
print("\nPredicted label counts:")
print(df_final['predicted_label'].value_counts(dropna=False))

# Look for rows with empty strings or suspicious content
print("\nRows where predicted_label is empty string:")
print(df_final[df_final['predicted_label'] == ''])

print("\nRows where predicted_label is not a string (e.g., int or None):")
print(df_final[~df_final['predicted_label'].apply(lambda x: isinstance(x, str))])

# Check for duplicates based on cid or full row
print("\nNumber of duplicate rows:", df_final.duplicated().sum())
print("Number of duplicate cids:", df_final['cid'].duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102356 entries, 0 to 102355
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   cid              101958 non-null  object 
 1   text             102356 non-null  object 
 2   topic            102356 non-null  int64  
 3   predicted_label  102356 non-null  object 
 4   weight           102356 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 3.9+ MB
None

Missing values:
 cid                398
text                 0
topic                0
predicted_label      0
weight               0
dtype: int64

Number of rows where predicted_label is missing:
0

Unique predicted labels:
['Politics' 'Fossil' 'Renewable' 'Transportation' 'Electricity' 'Waste'
 'Agriculture' 'Nature' 'Weather' 'Activism' 'Disaster' 'Construction'
 'Lifestyle']

Predicted label counts:
predicted_label
Politics          42655
Renewable         17480
Fossil             9423


In [256]:
# Look up the label counts for topic 1045.
# NOTE(review): as far as this notebook shows, `label_counts` is only assigned
# inside the per-topic loop and is overwritten on every iteration, so here it
# holds the counts of the last topic processed only — confirm this is intended.
label_counts[label_counts.topic == 1045]

Unnamed: 0,topic,gpt_label,count


In [257]:
# Keep the labels that occur more than 10 times in `label_counts`.
# NOTE(review): this rebinds `filtered`, a name the per-topic loop also assigns,
# and reads `label_counts` as left behind by that loop — verify the intended input.
filtered = label_counts[label_counts["count"] > 10]
# filtered[filtered["count"] == 3].shape
filtered

Unnamed: 0,topic,gpt_label,count
5,1048,Politics,21


In [258]:
# Inspect the first predictions for topic 3.
df_final[df_final.topic==3].head()

Unnamed: 0,cid,text,topic,predicted_label,weight
19299,bafyreibscp2vq5za5kbs2hw64jxpsu7of3jst2mldlwgl...,What aspect of it is art But also please answe...,3,Renewable,0.39
19300,bafyreif4msuuwitl722q43xeifuyxf333sd255q6hubmp...,Must direct énergies where they will not be wa...,3,Renewable,0.39
19301,bafyreib2lqwzsokcmyvwyzoqu5blskixwqe5clpzzjle6...,Pro-AI gets you muted here. It is both the too...,3,Renewable,0.39
19302,bafyreia6s4f6uvdhrw7tl4o2uykgvpkmnfpgy62lpqkxj...,"Ya, here where they are building data centers ...",3,Renewable,0.39
19303,bafyreianagvlrmveqo5p7syjxo4hpa73h22d36u7z2fkb...,Why do you support computer generated artAI Do...,3,Renewable,0.39
