In [1]:
from datasets import load_dataset
import pandas as pd

DATASET SOURCE:

@article{kiesel2019data,
  title={Data for {PAN} at {SemEval} 2019 Task 4: Hyperpartisan News Detection},
  author={Kiesel, Johannes and Mestre, Maria and Shukla, Rishabh and Vincent, Emmanuel and Corney, David and Adineh, Payam and Stein, Benno and Potthast, Martin},
  year={2019}
}

In [2]:
# Fetch the 'bypublisher' configuration of the hyperpartisan news dataset.
dataset = load_dataset(path='hyperpartisan_news_detection', name='bypublisher')
print("Done!")

Downloading builder script:   0%|          | 0.00/2.43k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading and preparing dataset hyperpartisan_news_detection/bypublisher (download: 956.72 MiB, generated: 5.23 GiB, post-processed: Unknown size, total: 6.16 GiB) to /root/.cache/huggingface/datasets/hyperpartisan_news_detection/bypublisher/1.0.0/7f4215b0474950ddf516e806400ab81d098b3da3b3a919a13cd1a4cf2c677012...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/981M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/22.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/600000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/600000 [00:00<?, ? examples/s]

Dataset hyperpartisan_news_detection downloaded and prepared to /root/.cache/huggingface/datasets/hyperpartisan_news_detection/bypublisher/1.0.0/7f4215b0474950ddf516e806400ab81d098b3da3b3a919a13cd1a4cf2c677012. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Done!


In [3]:
# Convert the training split into a pandas DataFrame for filtering.
train_split = dataset["train"]
df = train_split.to_pandas()

In [4]:
# Pull the article text for every bias group out of the training frame.
# The two extreme labels (4 and 0) are additionally restricted to rows
# flagged as hyperpartisan.
# NOTE(review): the label meaning (4 = far left ... 0 = far right) is taken
# from the variable names here - confirm against the dataset card.

_bias_label = df['bias']
_hyperpartisan_mask = df['hyperpartisan'] == True

far_left = df.loc[(_bias_label == 4) & _hyperpartisan_mask, 'text']
liberal = df.loc[_bias_label == 3, 'text']
moderate = df.loc[_bias_label == 2, 'text']
conservative = df.loc[_bias_label == 1, 'text']
far_right = df.loc[(_bias_label == 0) & _hyperpartisan_mask, 'text']

In [5]:
# Concatenate every article of a bias group into one space-separated string
# so each group can be tokenized in a single call.

(
    far_left_string,
    liberal_string,
    moderate_string,
    conservative_string,
    far_right_string,
) = (
    ' '.join(articles.tolist())
    for articles in (far_left, liberal, moderate, conservative, far_right)
)

In [6]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0
[0m

In [7]:
# Encode each bias group's concatenated text into token ids.

import tiktoken

enc = tiktoken.encoding_for_model('text-davinci-003')


def _encode_and_report(text, done_message):
    """Encode ``text`` with ``enc``, print ``done_message``, return the token ids."""
    token_ids = enc.encode(text)
    print(done_message)
    return token_ids


print("Start tokening!")
far_left_token = _encode_and_report(far_left_string, "Far-left done")
liberal_token = _encode_and_report(liberal_string, "Liberal done")
moderate_token = _encode_and_report(moderate_string, "Moderate done")
conservative_token = _encode_and_report(conservative_string, "Conservative done")
far_right_token = _encode_and_report(far_right_string, "Far-right done")

Start tokening!
Far-left done
Liberal done
Moderate done
Conservative done
Far-right done


In [8]:
# For every bias group, keep the 5000 most frequent token ids as a list of
# (token_id, count) pairs ordered from most to least common.


from collections import Counter


def _most_common_and_report(token_ids, done_message):
    """Return the 5000 most common (token, count) pairs, then print ``done_message``."""
    frequency = Counter(token_ids)
    top_pairs = frequency.most_common(5000)
    print(done_message)
    return top_pairs


print("Running")
far_left_dict = _most_common_and_report(far_left_token, "Far-left done")
liberal_dict = _most_common_and_report(liberal_token, "Liberal done")
moderate_dict = _most_common_and_report(moderate_token, "Moderate done")
conservative_dict = _most_common_and_report(conservative_token, "Conservative done")
far_right_dict = _most_common_and_report(far_right_token, "Far-right done")

Running
Far-left done
Liberal done
Moderate done
Conservative done
Far-right done


In [9]:
# Reduce each bias's frequent-token list to tokens that are distinctive for it:
# any token id that appears in three or more of the five lists is removed from
# every list it appears in. Token ids >= 50280 are removed as well.
# NOTE(review): 50280 looks like an upper bound tied to the tokenizer's
# vocabulary - confirm why ids at or above it must be excluded.


far_left_common_tokens = [token for token, _ in far_left_dict]
liberal_common_tokens = [token for token, _ in liberal_dict]
moderate_common_tokens = [token for token, _ in moderate_dict]
conservative_common_tokens = [token for token, _ in conservative_dict]
far_right_common_tokens = [token for token, _ in far_right_dict]

lists = [far_left_common_tokens, liberal_common_tokens, moderate_common_tokens, conservative_common_tokens, far_right_common_tokens]

# Each list comes from Counter.most_common, so its token ids are distinct and
# the tally below equals the number of bias groups that share a token.
counts = {}
for token_list in lists:
    for token in token_list:
        counts[token] = counts.get(token, 0) + 1

# Tokens to discard, in first-seen order.
commons = [key for key, val in counts.items() if val >= 3 or key >= 50280]

# Filter with a set and assign through a slice: this keeps the same list
# objects (so the *_common_tokens names see the result) while avoiding a
# linear `in` / `remove` scan of every list for every discarded token.
_discard = set(commons)
for token_list in lists:
    token_list[:] = [token for token in token_list if token not in _discard]

print("Done")

Done


In [12]:
# Score the tokens that survived the shared-token filter, per bias group.
# A token's score is its share of that group's total token count expressed as
# a percentage (count / total * 100). Only the 250 highest-scoring tokens are
# kept; the fixed key tokens below are then merged in with a score of 2,
# replacing any computed score for the same token id.


# Name (including its spelling) kept as-is so existing references keep working.
key_extentions = {5512: 2, 1546: 2, 13965: 2, 49560: 2, 25: 2, 16284: 2, 1565: 2, 36: 2, 17961: 2, 1581: 2, 1847: 2, 58: 2, 60: 2, 3398: 2, 3528: 2, 15567: 2, 6489: 2, 90: 2, 6234: 2, 220: 2, 92: 2, 11357: 2, 6369: 2, 2538: 2}


def _relative_frequencies(token_counts, kept_tokens, total_tokens):
    """Return {token: percentage of total_tokens} for the tokens still kept.

    token_counts -- iterable of (token, count) pairs
    kept_tokens  -- tokens that survived filtering; all others are skipped
    total_tokens -- denominator: total number of tokens for this bias group
    """
    kept = set(kept_tokens)  # constant-time membership instead of a list scan per token
    return {
        token: (count / total_tokens) * 100
        for token, count in token_counts
        if token in kept
    }


def _top_scores(frequencies, extensions, top_n=250):
    """Keep the ``top_n`` highest-valued entries, then overlay ``extensions``."""
    scores = dict(Counter(frequencies).most_common(top_n))
    scores.update(extensions)
    return scores


far_left_counts = _relative_frequencies(far_left_dict, far_left_common_tokens, len(far_left_token))
far_left_scores = _top_scores(far_left_counts, key_extentions)

liberal_counts = _relative_frequencies(liberal_dict, liberal_common_tokens, len(liberal_token))
liberal_scores = _top_scores(liberal_counts, key_extentions)

moderate_counts = _relative_frequencies(moderate_dict, moderate_common_tokens, len(moderate_token))
moderate_scores = _top_scores(moderate_counts, key_extentions)

conservative_counts = _relative_frequencies(conservative_dict, conservative_common_tokens, len(conservative_token))
conservative_scores = _top_scores(conservative_counts, key_extentions)

far_right_counts = _relative_frequencies(far_right_dict, far_right_common_tokens, len(far_right_token))
far_right_scores = _top_scores(far_right_counts, key_extentions)

print("Done")

Done


In [13]:
# Write each bias group's score dict to its own JSON file.


import json

_score_files = {
    '/kaggle/working/far-left_scores.json': far_left_scores,
    '/kaggle/working/liberal_scores.json': liberal_scores,
    '/kaggle/working/moderate_scores.json': moderate_scores,
    '/kaggle/working/conservative_scores.json': conservative_scores,
    '/kaggle/working/far-right_scores.json': far_right_scores,
}

for _target_path, _scores in _score_files.items():
    with open(_target_path, 'w') as f:
        json.dump(_scores, f)

print("Done!")

Done!
