# Filtering dataset by keywords

In [1]:
# imports
from datasets import load_dataset
import nltk
import itertools
from collections import Counter
from tqdm import tqdm
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
import re
import random

In [2]:
# deze cel downloadet de dataset
# hij gebruikt het script cnn_dailymail.py van huggingface:
# https://huggingface.co/datasets/cnn_dailymail/tree/main
# dit kan wel iets van 10 minuten duren, ga maar wat koffie halen

dataset = load_dataset("cnn_dailymail.py", "3.0.0", split="train") 

Reusing dataset cnn_dailymail (/Users/tppl/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


In [3]:
# print de features en het aantal datapunten in de dataset
print(dataset.features)
print(dataset.num_rows)

{'article': Value(dtype='string', id=None), 'highlights': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None)}
287113


In [4]:
# methoden om de data te pre-processen
en_stemmer = EnglishStemmer() # stemmer voor engelse woorden
nltk.download('stopwords') # stopwoorden die niet veel waarde toevoegen
stop_words = set(stopwords.words('english'))
alph_string_pattern = re.compile("[a-zA-Z]") # filtert 'woorden' die niet beginnen met een letter, zoals interpunctietokens


def word_counter_text(text: str, stem=False, remove_stopwords=False):
    """
    Neemt als input een string tekst
    Returnt een Counter object die alle woorden uit de tekst telt
    """
    
    # splits de tekst op in een lijst van woorden
    sents = nltk.tokenize.sent_tokenize(text)
    words = [nltk.word_tokenize(sent) for sent in sents]
    flatten_words = list(itertools.chain(*words))
    
    # woorden stemmen of alleen maar hoofdletters weghalen
    if stem:
        flatten_lower_words = [en_stemmer.stem(str) for str in flatten_words]
    else:
        flatten_lower_words = [str.lower() for str in flatten_words]
    
    # stopwoorden weghalen
    if remove_stopwords:
        flatten_lower_words = [str for str in flatten_lower_words if str not in stop_words]
        
    # tokens die niet beginnen met een letter weghalen
    flatten_lower_words = [str for str in flatten_lower_words if alph_string_pattern.match(str)]
    
    return Counter(flatten_lower_words)

[nltk_data] Downloading package stopwords to /Users/tppl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Tot nu toe wat Erik heeft gedaan
vanaf hier filteren we onze dataset met een bepaalde keywords

### Wij beginnen met de makkelijkste taak namelijk alle totaal verschillend artikelen uithalen
en daarna gaan wij nog verder filteren

In [5]:
# een lijst van keywords bedacht door Coen

words_accident = ['accident','disaster','catastrophe','incident','near-miss', 'tragedy']

words_damage = ['victim','casualties','died','killed','damage','harm','hospital','hospitalized',
                'wounded','succumbed','unscathed','evacuate','rescue','first responders','ambulance','first aid']

words_specific = ['sunk','fire','derailed','collision','poisoned','burned']

In [6]:
def filter1(texts, num_articles, keywords, threshold):
    positive_articles = []
    negative_articles = []
    
    for i in tqdm(num_articles):
        # haal alle unieke woorden van het artikel
        article = texts[i]
        words = set(dict(word_counter_text(article, stem=True, remove_stopwords=True).items()).keys())

        # kijk hoeveel keywords er voorkomen
        contains_keyword = 0
        for keyword in keywords:
            if keyword in words:
                contains_keyword += 1

        if contains_keyword >= threshold:
            positive_articles.append(i)
        else:
            negative_articles.append(i)

    return positive_articles, negative_articles
    

In [7]:
# parameters instelling
texts = dataset['article']
num_articles = list(range(2000))
keywords = words_accident + words_damage + words_specific + ['safety']
# gestemde versie van keywords
keywords = [en_stemmer.stem(w) for w in keywords]
# drempelwarde voor hoeveel keywords een artikel moet bevatten
threshold = 3

In [8]:
filter1_1 = filter1(texts, num_articles, keywords, threshold)

100%|██████████| 2000/2000 [00:40<00:00, 49.38it/s]


In [9]:
print('positive:', len(filter1_1[0]), 'negative:', len(filter1_1[1]))

positive: 482 negative: 1518


### Omdat er geen label bestaat moeten wij handmatig controleren of het goed gefilterd is.
#### Het doel van dit gedeelte is alle onpassende artikelen uithalen 

In [10]:
for i in range(30):
    print(dataset['highlights'][filter1_1[0][i]], '\n')

NEW: "I thought I was going to die," driver says .
Man says pickup truck was folded in half; he just has cut on face .
Driver: "I probably had a 30-, 35-foot free fall"
Minnesota bridge collapsed during rush hour Wednesday . 

Parents beam with pride, can't stop from smiling from outpouring of support .
Mom: "I was so happy I didn't know what to do"
Burn center in U.S. has offered to provide treatment for reconstructive surgeries .
Dad says, "Anything for Youssif" 

Aid workers: Violence, increased cost of living drive women to prostitution .
Group is working to raise awareness of the problem with Iraq's political leaders .
Two Iraqi mothers tell CNN they turned to prostitution to help feed their children .
"Everything is for the children," one woman says . 

Two cars loaded with gasoline and nails found abandoned in London Friday .
52 people killed on July 7, 2005 after bombs exploded on London bus, trains .
British capital wracked by violence by the IRA for years . 

NEW: President B

## Resultaat filter1 op positive_articles
Van de 30 artikelen 11 waren gerelateerd tot ongevallen en 2 gingen specifiek over werkplaats ongevallen. Dus 19 van 30 was foutief als positief beschouwd.

In [11]:
# check of negative_articles ook passende artikelen heeft
for i in range(30):
    print(dataset['highlights'][filter1_1[1][i]],'\n')

Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund . 

Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change . 

Five small polyps found during procedure; "none worrisome," spokesman says .
President reclaims powers transferred to vice president .
Bush undergoes routine colonoscopy at Camp David . 

NEW: NFL chief, Atlanta Falcons owner critical of Michael Vick's conduct .
NFL suspends Falcons quarterback indefinitely without pay .
Vick admits funding dogfighting operation but says he did not gamble .
Vick due in federal court Monday; future in NFL remains uncertain . 

Tomas Medina Caracas was a fugit

## Resultaat filter1 op negative_articles
Van de 30 artikelen 1 was gerelateerd tot ongevallen en 0 ging specifiek werkplaats ongevallen. Dus filter1 werkt best wel goed voor het filteren van ongerelateerde artikelen. Dus het doel is bereikt?

# Sub Conclusie
### filter1 kan heel goed ongerelateerde artikelen weghalen maar het presteert minder goed bij het uithalen van artikelen die over ongevallen gaan.

#### filter1 kan totaal ongerelateerde artikelen goed onderscheiden en weghalen. Nu is de vraag kan het nog beter doen bij het uithalen van artikelen die over ongelukken gaan. Om dit te doen halen wij artikelen eruit die over oorlog of terreur (externe factoren) gaan op basis van keywords.

In [12]:
# keywords gehaald van de bovenstannde resultaten sie externe factoren aanduiden
indicates_external = ['Gunman', 'suspect', 'attack', 'terror', 'war', 'bombing', 'militants', 'bombs',
                     'conflict', 'hijacking']

In [13]:
# parameters instelling
num_articles = filter1_1[0]
keywords = indicates_external
# gestemde versie van keywords
keywords = [en_stemmer.stem(w) for w in keywords]
# drempelwarde voor hoeveel keywords een artikel moet bevatten
threshold = 3

In [14]:
filter1_3 = filter1(texts, num_articles, keywords, threshold)

100%|██████████| 482/482 [00:11<00:00, 43.29it/s]


In [15]:
# hier is de positive externe factoren dus negative is wat wij willen
print('positive:', len(filter1_3[0]), 'negative:', len(filter1_3[1]))

positive: 96 negative: 386


## Als het goed gaat, moeten wij voor positive_articles artikelen krijgen die over oorlog, terrueraanslag enz gaan. Negative_articles is kleiner geworden dus dat zal ons tijd besparen bij wanneer wij de data gaan labelen.

In [16]:
for i in range(5):
    print(dataset['highlights'][filter1_3[0][i]], '\n')

Two cars loaded with gasoline and nails found abandoned in London Friday .
52 people killed on July 7, 2005 after bombs exploded on London bus, trains .
British capital wracked by violence by the IRA for years . 

Bomb victims waiting for presidential visit .
Blast went off 15 minutes before president's arrival .
Algeria faces Islamic insurgency .
Al Qaeda-affiliated group claimed July attacks . 

Boy, 12, lost leg to car bombing in Iraq; cousin was killed by blast .
He is now heading to the United States to get a prosthetic leg .
Boy: "I want to ... meet this person that gave me my future back"
Charity director says helping the boy makes her "cry with joy" 

Taliban militants kill Australian commando in southern Afghanistan .
He was shot during operation to clear Taliban bomb making facility in Uruzgan .
This is the fourth Australian soldier to die in the conflict in Afghanistan .
Several militants killed and a coalition troop injured in other fighting in southeast . 

NEW: Teen gunma

In [17]:
for i in range(5):
    print(dataset['highlights'][filter1_3[1][i]], '\n')

NEW: "I thought I was going to die," driver says .
Man says pickup truck was folded in half; he just has cut on face .
Driver: "I probably had a 30-, 35-foot free fall"
Minnesota bridge collapsed during rush hour Wednesday . 

Parents beam with pride, can't stop from smiling from outpouring of support .
Mom: "I was so happy I didn't know what to do"
Burn center in U.S. has offered to provide treatment for reconstructive surgeries .
Dad says, "Anything for Youssif" 

Aid workers: Violence, increased cost of living drive women to prostitution .
Group is working to raise awareness of the problem with Iraq's political leaders .
Two Iraqi mothers tell CNN they turned to prostitution to help feed their children .
"Everything is for the children," one woman says . 

NEW: President Bush says he and first lady are deeply saddened by the tragedy .
Mine Safety and Health Administration chief: We've run out of options.
The six men have been trapped underground since August 6 .
Seven bore holes dri

### Nu weten wij over de eerste 30 artikelen. Wat nou als de artikelen willekeurig gekozen zijn? Krijgen wij nog steeds hetzelfde resultaat? namelijk 100% accuracy voor negatieve_articles.

In [18]:
# parameters instelling
texts = dataset['article']
num_articles = random.sample(range(0, dataset.num_rows), 2000)
keywords = words_accident + words_damage + words_specific + ['safety']
# gestemde versie van keywords
keywords = [en_stemmer.stem(w) for w in keywords]
# drempelwarde voor hoeveel keywords een artikel moet bevatten
threshold = 3

In [19]:
filter1_2 = filter1(texts, num_articles, keywords, threshold)

100%|██████████| 2000/2000 [00:50<00:00, 39.80it/s]


In [20]:
for i in range(30):
    print(dataset['highlights'][filter1_2[0][i]], '\n')

751 infants and 43 health care workers have been exposed to TB .
Hospital worker in El Paso, Texas, came to work with active case of TB .
TB is an infectious disease, but not highly contagious . 

Gang of 11 boys allegedly beat to death Zhang Tai, eight, in Hebei province .
They reportedly left youngster's battered and bruised body lying near road .
Tai was taken to hospital with severe head injuries, but later declared dead .
Today, police said gang admitted killing classmate because they were bored . 

Dr David Madden wrote several offensive things about special needs students .
Also said that special ed students got 'more rights than the innocents'
Said bipolar student was 'scary' and had a future as the next John Wilkes Booth or Lee Harvey Oswald . 

Elvis Lee, 34, kicked and BIT defenceless Tyler Whelan in savage attack .
Tyler's mother Stephanie found guilty of causing or allowing his death .
Couple failed to take Tyler to hospital for FOUR HOURS despise his horrific injuries . 


Said failure to rescue girls needed an 'international and domestic response'
Former prime minister said Britain should support call for better intelligence .
The 200 schoolgirls were kidnapped by terrorist group Boko Haram in May . 



In [21]:
for i in range(30):
    print(dataset['highlights'][filter1_2[1][i]], '\n')

The images, which were taken in August 1961, will go under the hammer on February 19 .
In October last year, a set of original negatives from the Kennedys' wedding was auctioned off for $34,000 . 

The children's parents say they felt they were exploited and underpaid for their work .
The kids reportedly still live in the same "tangle of makeshift shacks" in Mumbai .
Director says the boys were paid "way over and above what you could pay" 

Atlanta Falcons owner Arthur Blank admitted wrongdoing Tuesday amid the NFL's investigation into whether the team used fake crowd noise .
Blank: 'I think what we've done in 2013 and 2014 was wrong'
If found to have used fake crowd noise, the team could face a fine or lose a draft pick . 

Dutch club PSV Eindhoven agree to sell winger Ibrahim Afellay to Barcelona .
The 24-year-old will settle personal terms ahead of January transfer window .
He has played 30 times for the Netherlands and went to the 2010 World Cup . 

Taxpayers' money spent on bailin

Monarch writes secret message for athletes and places it inside the baton .
Ceremonial object brought to Buckingham Palace by Sir Chris Hoy .
Queen hands it over to Allan Wells for the start of its journey to every Commonwealth country beginning with India .
Baton will return to UK in time for Glasgow opening ceremony in July 2014 . 



In [22]:
# Nu op random
# parameters instelling
num_articles = filter1_2[0]
keywords = indicates_external
# gestemde versie van keywords
keywords = [en_stemmer.stem(w) for w in keywords]
# drempelwarde voor hoeveel keywords een artikel moet bevatten
threshold = 3

In [23]:
filter1_4 = filter1(texts, num_articles, keywords, threshold)

100%|██████████| 443/443 [00:12<00:00, 36.73it/s]


In [24]:
for i in range(5):
    print(dataset['highlights'][filter1_4[0][i]], '\n')

Jaffar Deghayes disappeared from Brighton with brother Abdullah .
Abdullah later died while fighting alongside Islamic rebel forces in Syria .
Number of arrests in first quarter of year is up from 25 in whole of last year .
Around 400 Britons are believed to have gone to Syria over last two years .
Scotland Yard launching bid to stop young people going to war zone . 

ISIS have beheaded a group of Syrian soldiers following night time attack .
Images show the men pleading for the lives before being decapitated .
It comes amid the release of yet another ISIS propaganda video .
The video purports to show the jihadis shooting down a fighter jet .
Extremists seen operating anti-aircraft gun and firing at Syrian soldiers . 

NEW: At least eight Palestinian militants were killed in Gaza airstrikes, Hamas sources say .
Aunt of beaten teen says he doesn't really understand what's going on, is scared .
Mother of dead Palestinian teen wants Israel to raze suspects' homes .
The boy's cousin was re

In [25]:
for i in range(5):
    print(dataset['highlights'][filter1_4[1][i]], '\n')

751 infants and 43 health care workers have been exposed to TB .
Hospital worker in El Paso, Texas, came to work with active case of TB .
TB is an infectious disease, but not highly contagious . 

Gang of 11 boys allegedly beat to death Zhang Tai, eight, in Hebei province .
They reportedly left youngster's battered and bruised body lying near road .
Tai was taken to hospital with severe head injuries, but later declared dead .
Today, police said gang admitted killing classmate because they were bored . 

Dr David Madden wrote several offensive things about special needs students .
Also said that special ed students got 'more rights than the innocents'
Said bipolar student was 'scary' and had a future as the next John Wilkes Booth or Lee Harvey Oswald . 

Elvis Lee, 34, kicked and BIT defenceless Tyler Whelan in savage attack .
Tyler's mother Stephanie found guilty of causing or allowing his death .
Couple failed to take Tyler to hospital for FOUR HOURS despise his horrific injuries . 


# Het lijkt soort van goed te werken. Dit bespaart onze tijd enorm wanneer wij de dataset gaan labelen.
## Wij gaan nu 10000 data filteren.

In [30]:
# parameters instelling
num_articles = list(range(10000))
keywords = words_accident + words_damage + words_specific + ['safety']
# gestemde versie van keywords
keywords = [en_stemmer.stem(w) for w in keywords]
# drempelwarde voor hoeveel keywords een artikel moet bevatten
threshold = 3

In [31]:
filter1_final1 = filter1(texts, num_articles, keywords, threshold)

100%|██████████| 10000/10000 [03:07<00:00, 53.37it/s]


In [32]:
print('The dataset after filtering completely irrelevant articles.')
print('positive:', len(filter1_final1[0]), 'negative:', len(filter1_final1[1]))

The dataset after filtering completely irrelevant articles.
positive: 2434 negative: 7566


In [33]:
# parameters instelling
num_articles = filter1_final1[0]
keywords = indicates_external
# gestemde versie van keywords
keywords = [en_stemmer.stem(w) for w in keywords]
# drempelwarde voor hoeveel keywords een artikel moet bevatten
threshold = 3

In [34]:
filter1_final2 = filter1(texts, num_articles, keywords, threshold)

100%|██████████| 2434/2434 [00:50<00:00, 47.83it/s]


In [37]:
print('The dataset after filtering war and terrorism related articles.')
print('positive:', len(filter1_final2[0]), 'negative:', len(filter1_final2[1]))
print(f'So in the end we have {len(filter1_final2[1])} to label out of 10k data.')

The dataset after filtering war and terrorism related articles.
positive: 417 negative: 2017
So in the end we have 2017 to label out of 10k data.
