In [14]:
import pandas as pd
import json
import os
import re

# NOTE: relative chdir - every re-run of this cell moves the working directory
# up one more level. All relative paths used below (e.g. "../data/...") resolve
# against the directory set here, so run this cell exactly once per kernel.
os.chdir("..")

### BASIL

In [None]:
folder_path = "../data/raw/basil"

all_rows = []

# Short outlet codes found in the BASIL "source" field -> hyphenated outlet names.
source_mapping = {"fox": "fox-news",
                  "nyt": "the-new-york-times",
                  "hpo": "huffpost"}

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order of the resulting frame differ between machines/runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(folder_path, filename)

    # Explicit encoding: without it open() falls back to the locale default,
    # which is not UTF-8 on every platform and can garble non-ASCII text.
    with open(file_path, "r", encoding="utf-8") as file:
        article = json.load(file)

    # Article-level fields are identical for every sentence of the file,
    # so compute them once instead of once per sentence.
    raw_source = article.get("source", "").lower()
    source = source_mapping.get(raw_source, raw_source)  # unknown codes pass through unchanged
    date = article.get("date", "")
    main_entities = ", ".join(article.get("main-entities", []))

    # One output row per sentence in the article body.
    for entry in article.get("body", []):
        # A sentence is positive if any of its annotations is marked "Lexical".
        has_lexical_bias = any(
            annotation.get("bias") == "Lexical"
            for annotation in entry.get("annotations", [])
        )

        all_rows.append({
            "text": entry.get("sentence", ""),
            "date": date,
            "source": source,
            "article_main_entities": main_entities,
            "label": int(has_lexical_bias),  # 1 if lexical bias exists, otherwise 0
        })

# Explicit columns keep the schema stable even when no JSON file was found,
# so downstream cells that group by "source" do not fail with a KeyError.
df = pd.DataFrame(
    all_rows,
    columns=["text", "date", "source", "article_main_entities", "label"],
)

In [None]:
# Per-outlet class balance of the BASIL sentence labels.
# Lexically biased sentences (label 1) are very sparse compared with label 0.
df.groupby(by="source")["label"].value_counts()

source              label
fox-news            0        2498
                    1         141
huffpost            0        2147
                    1         149
the-new-york-times  0        2891
                    1         158
Name: count, dtype: int64

### ANNOMATIC

In [22]:
# Ad Fontes outlet ratings; joined onto the sentence datasets by outlet name.
ad_fontes = pd.read_csv("../data/adfontes_clean.csv")

# Bare expression in the middle of the cell: evaluated, but not displayed.
ad_fontes["source"].unique().tolist()

# First-pass left join on the raw outlet names. Sentences whose outlet has no
# counterpart in ad_fontes keep NaN in the columns coming from ad_fontes.
annomatic = pd.read_parquet("../data/raw/anno-lexical-train.parquet").merge(
    ad_fontes,
    how="left",
    left_on="source_name",
    right_on="source",
)

In [25]:
# Outlet names that found no partner in ad_fontes during the left join.
annomatic.loc[annomatic["source"].isna(), "source_name"].unique()

array(['san-diego-uniontribune', 'atlanta-journalconstitution', 'nj',
       'insider', 'chicago-suntimes', 'the-advocate',
       'pittsburgh-postgazette'], dtype=object)

In [28]:
# Outlet names from the parquet file that did not match in the first-pass join
# -> the spelling used in ad_fontes["source"].
source_mapping = {
    "the-advocate": "the-advocate-–-baton-rouge",
    "atlanta-journalconstitution": "atlanta-journal-constitution",
    "chicago-suntimes": "chicago-sun-times",
    "san-diego-uniontribune": "san-diego-union-tribune",
    "pittsburgh-postgazette": "pittsburgh-post-gazette",
    "nj": "nj.com",
    "insider": "business-insider"
}

# Re-read the raw file so the cell starts from unmerged data, rename the
# outlets, join the ratings, then drop the now-redundant join key.
annomatic = (
    pd.read_parquet("../data/raw/anno-lexical-train.parquet")
    .assign(source_name=lambda frame: frame["source_name"].replace(source_mapping))
    .merge(ad_fontes, how="left", left_on="source_name", right_on="source")
    .drop(columns=["source_name"])
)
annomatic

Unnamed: 0,text,source_party,label,sentence_id,source,reliability,bias
0,"General Admission: $25, $20 for seniors and mi...",Lean Left,0,31bef2d8-f7e5-42ce-a278-ae201dd3e1fe,san-diego-union-tribune,45.68,-1.76
1,Proceeds benefit children and adults with disa...,Lean Left,0,412c7cde-e3b4-46c5-b91d-c0b7b1d813e1,san-diego-union-tribune,45.68,-1.76
2,One day before he is set to surrender to autho...,Left,1,6e2d7083-52da-4072-989b-ca42e5d69679,alternet,32.71,-14.76
3,"In America, Paul snapped a picture of newsmen ...",Lean Right,0,b93dacc5-9fe1-4da9-aa18-32271b836feb,boston-herald,36.31,9.68
4,None of the diarists of ages and administratio...,Lean Left,1,1f161884-32a0-4f7d-824e-907826b1669b,the-guardian,40.67,-7.97
...,...,...,...,...,...,...,...
33826,"When lawsuits started piling up, the Sacklers ...",Lean Left,1,04eae3d6-da64-474f-876e-5c12d79586f5,the-economist,42.40,-1.43
33827,As almost always is the case when Republicans ...,Lean Right,1,a2849664-4a15-4458-a07d-32349cb16572,the-dispatch,41.19,5.33
33828,DeSantis has taken an aggressive - and controv...,Lean Left,1,107d49a1-6b7e-4619-81dc-e881ff30fd79,nbc-news,43.21,-5.79
33829,It is one of the most significant investigativ...,Center,0,804d894d-f0c4-4600-856a-c0761a0e9947,poynter,41.75,-7.32


### Starbucks 

Many of the outlets in this dataset are missing from the AllSides / Ad Fontes ratings.

In [27]:

# Open question (original author's note) - outlets whose match is unclear or missing:
# rare, guns, the sun, rte, telegraph, recode, dallas-news, bbc-news, digital-journal,
# deccan-herald, tasnim-news-agency-(press-release), the-national-memo-(blog), valuewalk

# Outlet names as derived from the CSV -> the spelling used in ad_fontes["source"].
source_mapping = {
    'nbcnews': 'nbc-news',
    'time': 'time-magazine',
    'sputnik-international': 'sputnik-international-news',
    'new-york-times': 'the-new-york-times',
    # NOTE(review): 'hill-reporter' looks like a different outlet than The Hill -
    # confirm against adfontes_clean.csv before relying on these ratings.
    'the-hill': 'hill-reporter',
    'breitbart-news': 'breitbart',
    'american-thinker-(blog)': 'american-thinker',
    'dallas-news': 'dallas-morning-news',
    'bbc-news': 'bbc'
}

folder_path = "../data/raw"
starbucks = pd.read_csv(os.path.join(folder_path, "Sora_LREC2020_biasedsentences.csv"))


def _outlet_slug(raw_source):
    """Turn a raw `source` cell into a lowercase, hyphen-joined outlet name.

    Keeps only the part after the first underscore, removes every
    `_<digits>` fragment, lowercases the rest and joins its
    whitespace-separated words with hyphens.
    """
    after_prefix = raw_source.split('_', 1)[1]
    without_numbers = re.sub(r'_\d+', '', after_prefix)
    return '-'.join(without_numbers.lower().split())


starbucks['source'] = starbucks['source'].apply(_outlet_slug)
starbucks['source'] = starbucks['source'].replace(source_mapping)

# Left join: articles from outlets unknown to ad_fontes keep NaN ratings.
starbucks = starbucks.merge(ad_fontes, how='left', on='source')

In [30]:
# Number of sentence slots read per article: sentence text is taken from
# columns "s0".."s19", the matching annotations from columns "0".."19".
N_SENTENCE_SLOTS = 20

row_list = []
# The group is named `article` (not `df`) so this loop does not overwrite the
# notebook-level `df` used by other cells.
for _, article in starbucks.groupby("id_article"):
    # Values that are the same across the group (taken from its first row).
    shared = {
        "source": article["source"].iloc[0],
        "reliability": article["reliability"].iloc[0],
        "bias": article["bias"].iloc[0],
        "source_bias": article["source_bias"].iloc[0],
        "date": article["date_event"].iloc[0],
    }

    # Title entry: labelled with the mean of the "t" column over the group.
    row_list.append({
        "text": article["doctitle"].iloc[0],
        "label": article["t"].mean(),
        **shared,
    })

    # Sentence entries.
    for sent in range(N_SENTENCE_SLOTS):
        text_col = "s" + str(sent)
        # Skip sentence slots that are not filled for this article.
        if not article[text_col].any():
            continue

        row_list.append({
            # Sentences are the same within the group, take the first one.
            "text": article[text_col].iloc[0],
            # Mean of the annotations for this sentence.
            "label": article[str(sent)].mean(),
            **shared,
        })

# Convert the row list to a DataFrame
data = pd.DataFrame(row_list)

# Remove "[<digits>]: " markers from the text. The vectorised .str.replace leaves
# missing values as NaN, whereas re.sub inside .apply raises a TypeError on them.
data["text"] = data["text"].str.replace(r"(\[[0-9]*\]:\ )", "", regex=True)

# Min-max scale the mean annotation into [0, 1].
data["label"] = (data["label"] - data["label"].min()) / (data["label"].max() - data["label"].min())

data

Unnamed: 0,text,label,source,reliability,bias,source_bias,date
0,Dan Johnson suicide: Lawmaker accused of moles...,0.727273,washington-post,38.8,-6.85,left-center,2017-12-15
1,LOUISVILLE - Dan Johnson posted a final messag...,0.363636,washington-post,38.8,-6.85,left-center,2017-12-15
2,It appeared to be a goodbye.,0.727273,washington-post,38.8,-6.85,left-center,2017-12-15
3,"In it, he denied the accusations that had torm...",0.545455,washington-post,38.8,-6.85,left-center,2017-12-15
4,"""GOD knows the truth, nothing is the way they ...",0.363636,washington-post,38.8,-6.85,left-center,2017-12-15
...,...,...,...,...,...,...,...
883,But I'm also confident that Secretary Mattis w...,0.363636,valuewalk,,,right-center,2017-12-13
884,Tillerson also emphasized his partnership with...,0.145455,valuewalk,,,right-center,2017-12-13
885,The change in position comes on the heels of P...,0.072727,valuewalk,,,right-center,2017-12-13
886,The Hwasong-15 missile that was launched Novem...,0.290909,valuewalk,,,right-center,2017-12-13


### BABE

In [35]:
# Outlet names as derived from the BABE CSV -> the spelling used in ad_fontes["source"].
source_mapping = {
    "federalist": "the-federalist",
    "new-york-times": "the-new-york-times",
}
# Original note: the-daily-stormer is missing
# (presumably from the Ad Fontes ratings - TODO confirm).

# Labels that are not a key of this mapping come out of .map() as NaN.
bias_mapping = {"Biased" : 1, "Non-biased": 0}

df = (
    pd.read_csv("../data/raw/babe.csv", sep=";")
    .assign(
        # Lowercase, hyphenate spaces, then apply the manual renames.
        outlet=lambda frame: (
            frame["outlet"].str.lower().str.replace(" ", "-").replace(source_mapping)
        ),
        label_bias=lambda frame: frame["label_bias"].map(bias_mapping),
    )
    # Left join: outlets unknown to ad_fontes keep NaN ratings.
    .merge(ad_fontes, how="left", left_on="outlet", right_on="source")
)

df.head()

Unnamed: 0,text,news_link,outlet,topic,type,label_bias,label_opinion,biased_words,source,reliability,bias
0,"""Orange Is the New Black"" star Yael Stone is r...",https://www.foxnews.com/entertainment/australi...,fox-news,environment,right,0.0,Entirely factual,[],fox-news,35.44,11.06
1,"""We have one beautiful law,"" Trump recently sa...",https://www.alternet.org/2020/06/law-and-order...,alternet,gun control,left,1.0,Somewhat factual but also opinionated,"['bizarre', 'characteristically']",alternet,32.71,-14.76
2,"...immigrants as criminals and eugenics, all o...",https://www.nbcnews.com/news/latino/after-step...,msnbc,white-nationalism,left,1.0,Expresses writer’s opinion,"['criminals', 'fringe', 'extreme']",msnbc,34.51,-14.03
3,...we sounded the alarm in the early months of...,https://www.alternet.org/2019/07/fox-news-has-...,alternet,white-nationalism,left,1.0,Somewhat factual but also opinionated,[],alternet,32.71,-14.76
4,[Black Lives Matter] is essentially a non-fals...,http://feedproxy.google.com/~r/breitbart/~3/-v...,breitbart,marriage-equality,,1.0,Expresses writer’s opinion,['cult'],breitbart,31.15,13.7


In [33]:
# Rows whose bias label was not covered by bias_mapping (NaN after .map()).
df.loc[df["label_bias"].isna()]

Unnamed: 0,text,news_link,outlet,topic,type,label_bias,label_opinion,biased_words,source,reliability,bias
2143,Proponents of these vaccines twist the Vatican...,https://thefederalist.com/2020/05/06/as-long-a...,the-federalist,vaccines,right,,Expresses writer’s opinion,[],the-federalist,21.77,18.67


In [36]:
# Rows where the opinion label is "No agreement".
df.loc[df["label_opinion"].eq("No agreement")]

Unnamed: 0,text,news_link,outlet,topic,type,label_bias,label_opinion,biased_words,source,reliability,bias
41,"A cop shoots a Black man, and a police union f...",https://www.reuters.com/investigates/special-r...,reuters,gun control,center,1.0,No agreement,"['flexes', 'its', 'muscle']",reuters,45.18,-1.17
94,A previous boom that saw gun sales double over...,https://www.reuters.com/article/us-usa-guns-sa...,reuters,gun-control,center,1.0,No agreement,['aficionados'],reuters,45.18,-1.17
95,A professor who teaches climate change classes...,https://www.breitbart.com/politics/2019/05/09/...,breitbart,environment,right,0.0,No agreement,[],breitbart,31.15,13.70
121,A Supreme Court filing lays bare the deep chas...,https://www.alternet.org/2020/03/prominent-rep...,alternet,taxes,left,1.0,No agreement,['wannabe'],alternet,32.71,-14.76
170,Activists planning to line California roadways...,https://www.nbcnews.com/tech/internet/anti-vac...,msnbc,vaccines,left,0.0,No agreement,[],msnbc,34.51,-14.03
...,...,...,...,...,...,...,...,...,...,...,...
3603,White supremacist violent extremists can gener...,https://www.nbcnews.com/news/latino/after-step...,msnbc,white-nationalism,left,1.0,No agreement,"['violent', 'virulent', 'supremacist', 'hatred']",msnbc,34.51,-14.03
3618,With Democrats fuming over Trump's push for a ...,https://www.foxnews.com/politics/democrats-tro...,the-federalist,gun-control,right,0.0,No agreement,['fuming'],the-federalist,21.77,18.67
3631,"With this in mind, the Trump administration re...",http://www.msnbc.com/rachel-maddow-show/trump-...,msnbc,middle-class,left,1.0,No agreement,['tinkering'],msnbc,34.51,-14.03
3655,Yet it far better applies to unelected career ...,https://thefederalist.com/2020/02/17/doj-furor...,the-federalist,elections-2020,right,0.0,No agreement,['ironies'],the-federalist,21.77,18.67


### All Sides

In [37]:

df = pd.read_csv("../data/AllSides_Media_Bias_Ratings_111224.csv")

# Drop the common export prefix from the column names (literal match, not a regex).
df.columns = df.columns.str.replace('allsides_media_bias_ratings/publication/', '', regex=False)

# Strip surrounding whitespace before hyphenating: otherwise a trailing space
# becomes a trailing "-" (e.g. 'accuracy-in-media-'), and such a name can never
# match the hyphenated outlet names used for the other datasets.
df["source_name"] = (
    df["source_name"]
    .str.strip()
    .str.lower()
    .str.replace(" ", "-", regex=False)
)

df["source_name"].unique().tolist()

['12-news-now-kbmt',
 '12news',
 '1819-news',
 '27-east',
 '4029-news',
 '538-(abc-news)',
 '9-&-10-news',
 'a-project-for-america',
 'a-starting-point',
 'aarp',
 'abc-11-wtvd',
 'abc-12',
 'abc-13-wtvg',
 'abc-15-arizona',
 'abc-16-wapt',
 'abc-25-kxxv',
 'abc-27-whtm',
 'abc-27-wkow',
 'abc-4-wtae',
 'abc-47-wmdt',
 'abc-5-news',
 'abc-57',
 'abc-6',
 'abc-6-kaal',
 'abc-6-wpvi',
 'abc-7-chicago',
 'abc-7-ketv',
 'abc-7-kvia',
 'abc-7-new-york',
 'abc-7-wxyz',
 'abc-8-klkn',
 'abc-8-wric',
 'abc-9-wmur',
 'abc-news-(online)',
 'abc-news-10-vermont',
 'abc13-houston',
 'abc4-utah',
 'aberdeen-news',
 'above-the-law',
 'abridge-news',
 'accuracy-in-media-',
 'aclu',
 'action-news-jax',
 'addison-county-independent',
 'adventist-today',
 'advocate',
 'afp-fact-check',
 'aframnews',
 'african-american-conservatives',
 'african-american-intellectual-history-society',
 'afro',
 'aiken-standard',
 'aj+',
 'akron-beacon-journal',
 'al-jazeera',
 'al.com',
 'alabama-news-center',
 'alabama-n

### Try inference from HuggingFace Hub

In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from huggingface_hub import login

# Authenticate with the Hugging Face Hub. In a notebook this renders an
# interactive login widget, so no token is hard-coded in the file.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Hub repo id of the checkpoint to try. The sequence-classification model and
# its tokenizer are both loaded from this same id; the files are downloaded on
# first use.
model_name = "skarsa/basil_topic_subsamples_model_alpha_0_5_idx_3"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [4]:

text = "Trump is fucking stupid."

# Run on the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Tokenize into PyTorch tensors (truncated to at most 128 tokens) and move
# every tensor to the same device as the model.
encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

# Forward pass only; no gradients are needed for inference.
with torch.no_grad():
    outputs = model(**inputs)

# Turn the logits into class probabilities and report the most likely class index.
logits = outputs.logits
probabilities = torch.softmax(logits, dim=-1)
predicted_class = probabilities.argmax(dim=-1).item()
print(f"Predicted class: {predicted_class}")

Predicted class: 0
