In [4]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import torch


# Articles to label; the labelling step below reads the 'Text' column.
df = pd.read_csv("merged_news_data.csv")

# Sentence-embedding model shared by the reference examples and the articles.
model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Reference sentences, one list per stance category ---------------------

# Sentences in favour of banning or reducing plastics.
support_ban_examples = [
    "The government passed legislation to ban single-use plastics nationwide.",
    "A new campaign encourages consumers to stop using plastic bags and straws.",
    "Environmentalists celebrate new rules to phase out disposable plastics.",
    "Stores are replacing plastic packaging with biodegradable alternatives.",
    "New research shows plastic pollution is harming marine life, pushing for bans.",
]

# Sentences pushing back against plastic bans or restrictions.
oppose_ban_examples = [
    "The new plastic ban is facing backlash from small business owners.",
    "Some restaurants are pushing back against restrictions on plastic straws.",
    "The mayor vetoed the proposed ban on single-use plastics.",
    "Critics argue plastic alternatives are too costly and impractical.",
    "A rollback on plastic regulations was praised by manufacturers.",
]

# Sentences about plastics policy or usage that take no side.
neutral_examples = [
    "City officials held a meeting to discuss plastic regulations.",
    "A study explored trends in plastic consumption over the last decade.",
    "The report outlines various approaches to waste management in cities.",
    "Experts analyze the effects of plastic usage across industries.",
    "The recycling rate for plastics remained unchanged this year.",
]

# Sentences that mention plastic but are not about plastic policy.
unrelated_examples = [
    "Plastic surgery is on the rise among millennials.",
    "A new toy made of plastic is gaining popularity this season.",
    "The artist uses melted plastic in abstract sculptures.",
    "Tips on organizing your home with plastic storage boxes.",
    "Trends in plastic furniture for minimalist interiors.",
]

# Encode each reference list once, up front, so that labelling an article
# only needs to embed the article itself. The generator is consumed in order,
# so the encode calls run in the same sequence as the names on the left.
support_emb, oppose_emb, neutral_emb, unrelated_emb = (
    model.encode(example_sentences, convert_to_tensor=True)
    for example_sentences in (
        support_ban_examples,
        oppose_ban_examples,
        neutral_examples,
        unrelated_examples,
    )
)


def label_article(text):
    """Assign one of the four stance labels to an article.

    The article is embedded with the module-level ``model`` and compared, by
    cosine similarity, against the pre-computed reference embeddings of each
    category. The category whose reference sentences have the highest *mean*
    similarity to the article wins.

    Args:
        text: The article text. Anything that is not a non-blank string is
            treated as missing.

    Returns:
        ``"unlabeled"`` when ``text`` is not a string or contains only
        whitespace; otherwise one of ``"support_ban"``, ``"oppose_ban"``,
        ``"neutral"`` or ``"unrelated"``.
    """
    # Missing values (e.g. NaN floats from pandas) and blank strings cannot be
    # embedded meaningfully, so they are short-circuited here.
    if not (isinstance(text, str) and text.strip()):
        return "unlabeled"

    article_vec = model.encode(text, convert_to_tensor=True)

    # Label -> reference embeddings. The order matters: on an exact tie,
    # max() keeps the first label it encountered.
    reference_sets = (
        ("support_ban", support_emb),
        ("oppose_ban", oppose_emb),
        ("neutral", neutral_emb),
        ("unrelated", unrelated_emb),
    )
    mean_similarity = {
        label: util.pytorch_cos_sim(article_vec, reference_vecs).mean()
        for label, reference_vecs in reference_sets
    }

    return max(mean_similarity, key=mean_similarity.get)


# Label every article in the 'Text' column, one row at a time.
article_labels = df["Text"].map(label_article)
df["Semantic_Label_4"] = article_labels

# Persist the labelled frame; index=False keeps the row index out of the CSV.
df.to_csv("labeled_dataset_4_categories.csv", index=False)
print("Done! Saved as labeled_dataset_4_categories.csv")

Done! Saved as labeled_dataset_4_categories.csv


In [6]:
df["Semantic_Label_4"].value_counts()

Semantic_Label_4
support_ban    2249
neutral         745
oppose_ban      356
unrelated        98
Name: count, dtype: int64

In [7]:
df.head()

Unnamed: 0,Text,Semantic_Label_4
0,Plastic On The Mind: Assessing the Risks From ...,support_ban
1,Scenic loch becomes magnet for Scotland's plas...,neutral
2,Trump rolls back standards for water-using app...,oppose_ban
3,Why Aren’t We Losing Our Minds Over the Plasti...,support_ban
4,36 Products That’ll Help Get You Through TSA F...,support_ban
