### Toy problem to count all words in the dataset in order to test reading the dataset

In [1]:
from datasets import load_dataset
import nltk
import itertools
from collections import Counter
from tqdm import tqdm
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
import re



In [2]:
# deze cel downloadet de dataset
# hij gebruikt het script cnn_dailymail.py van huggingface:
# https://huggingface.co/datasets/cnn_dailymail/tree/main
# dit kan wel iets van 10 minuten duren, ga maar wat koffie halen

dataset = load_dataset("cnn_dailymail.py", "3.0.0", split="train") 

Reusing dataset cnn_dailymail (/home/erik/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


In [3]:
# print de features en het aantal datapunten in de dataset
print(dataset.features)
print(dataset.num_rows)

{'article': Value(dtype='string', id=None), 'highlights': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None)}
287113


In [4]:
# methoden om de data te pre-processen
en_stemmer = EnglishStemmer() # stemmer voor engelse woorden
nltk.download('stopwords') # stopwoorden die niet veel waarde toevoegen
stop_words = set(stopwords.words('english'))
alph_string_pattern = re.compile("[a-zA-Z]") # filtert 'woorden' die niet beginnen met een letter, zoals interpunctietokens


def word_counter_text(text: str, stem=False, remove_stopwords=False):
    """
    Neemt als input een string tekst
    Returnt een Counter object die alle woorden uit de tekst telt
    """
    
    # splits de tekst op in een lijst van woorden
    sents = nltk.tokenize.sent_tokenize(text)
    words = [nltk.word_tokenize(sent) for sent in sents]
    flatten_words = list(itertools.chain(*words))
    
    # woorden stemmen of alleen maar hoofdletters weghalen
    if stem:
        flatten_lower_words = [en_stemmer.stem(str) for str in flatten_words]
    else:
        flatten_lower_words = [str.lower() for str in flatten_words]
    
    # stopwoorden weghalen
    if remove_stopwords:
        flatten_lower_words = [str for str in flatten_lower_words if str not in stop_words]
        
    # tokens die niet beginnen met een letter weghalen
    flatten_lower_words = [str for str in flatten_lower_words if alph_string_pattern.match(str)]
    
    return Counter(flatten_lower_words)

[nltk_data] Downloading package stopwords to /home/erik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
words_accident = ['accident','disaster','catastrophe','incident','near-miss']

words_damage = ['victim','casualties','died','killed','damage','harm','hospital','hospitalized',
                'wounded','succumbed','unscathed','evacuate','rescue','first responders','ambulance','first aid']

words_specific = ['sunk','fire','derailed','collision','poisoned','burned']

In [43]:
texts = dataset['article']

num_articles = 2000
positive_articles = []
negative_articles = []

keywords = words_accident + words_damage + words_specific + ['safety']
keywords = [en_stemmer.stem(w) for w in keywords]

for i in tqdm(range(num_articles)):
    article = texts[i]
    words = set(dict(word_counter_text(article, stem=True, remove_stopwords=True).items()).keys())
    
    contains_keyword = 0
    for keyword in keywords:
        if keyword in words:
            contains_keyword += 1
    
    if contains_keyword >= 4:
        positive_articles.append(i)
    else:
        negative_articles.append(i)


100%|██████████| 2000/2000 [00:17<00:00, 112.79it/s]


In [44]:
print('positive:', len(positive_articles), 'negative:', len(negative_articles))

positive: 286 negative: 1714


In [46]:
for i in range(30):
    print('\n', dataset['highlights'][positive_articles[i]])




 NEW: "I thought I was going to die," driver says .
Man says pickup truck was folded in half; he just has cut on face .
Driver: "I probably had a 30-, 35-foot free fall"
Minnesota bridge collapsed during rush hour Wednesday .

 Aid workers: Violence, increased cost of living drive women to prostitution .
Group is working to raise awareness of the problem with Iraq's political leaders .
Two Iraqi mothers tell CNN they turned to prostitution to help feed their children .
"Everything is for the children," one woman says .

 Two cars loaded with gasoline and nails found abandoned in London Friday .
52 people killed on July 7, 2005 after bombs exploded on London bus, trains .
British capital wracked by violence by the IRA for years .

 Bomb victims waiting for presidential visit .
Blast went off 15 minutes before president's arrival .
Algeria faces Islamic insurgency .
Al Qaeda-affiliated group claimed July attacks .

 Boy on meeting Spider-Man: "It was my favorite thing"
Youssif also met 