# Following tutorial stackabuse

In [2]:
import numpy as np 
import pandas as pd 
import re
import nltk 
import matplotlib.pyplot as plt
from newspaper import Article
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 
%matplotlib inline

## Helper functions

In [3]:
# Filters the article of unneeded words / symbols
def filter_article(article):
    processed_article_text = []

    filter_words_and_symbols = ["\+", "de", "het", "een", ",", ".", "?"]

    for word in article.text.split():
        # Remove all the special characters
        processed_word = re.sub(r'\W', ' ', str(word))

        # remove all single characters
        processed_word = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_word)

        # Remove single characters from the start
        processed_word = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_word) 

        # Substituting multiple spaces with single space
        processed_word = re.sub(r'\s+', ' ', processed_word, flags=re.I)

        # Removing prefixed 'b'
        processed_word = re.sub(r'^b\s+', '', processed_word)

        # Converting to Lowercase
        processed_word = processed_word.lower()

        processed_article_text.append(processed_word)

        if processed_word in filter_words_and_symbols:
            processed_article_text.remove(processed_word)   
    
    return processed_article_text


## Gets articles and outputs words

In [4]:
urls =np.array (["https://www.nieuwsblad.be/cnt/dmf20190502_04368436",
        "https://www.nieuwsblad.be/cnt/dmf20190826_04575277",
        "https://www.vrt.be/vrtnws/nl/2018/08/04/tweede-hittegolf-is-een-feit/",
        "https://www.meteobelgie.be/klimatologie/waarnemingen-en-analyses/jaar-2017/2149-zom-2017",
        "https://www.standaard.be/cnt/dmf20190808_04550830",
        "https://www.vrt.be/vrtnws/nl/2019/07/23/waar-komt-de-hittegolf-vandaag/"
       ])
# loops over all the articles, filters and summarizes them

processed_texts = []

for link in urls:
    test_article = Article(link, language="nl", article_memorize = False)
    test_article.download()
    test_article.parse()
    processed_article_text = filter_article(test_article)
    processed_texts.append(processed_article_text)

print(processed_texts)


[['eerste', 'helft', 'van', 'mei', 'belooft', 'vooral', 'somber', 'weer ', 'maar', 'volgende', 'maanden', 'maken', 'alles', 'goed ', 'dat', 'zegt', 'meteobelgië ', 'dat', 'jongste', 'jaren', 'bewees', 'vaak', 'erg', 'nauwkeurig', 'weer', 'te', 'kunnen', 'voorspellen ', 'aan', 'kranten', 'van', 'sudpresse ', 'weerdienst', 'stelt', 'dat', 'we', 'bijzonder', 'warme', 'zomer', 'in', 'vooruitzicht', 'hebben ', 'boodschap', 'van', 'meteobelgië', 'is', 'duidelijk ', 'vertrouw', 'zonnige', 'en', 'warme', 'eerste', 'dag', 'van', 'mei', 'niet ', ' de', 'tendens', 'voor', 'eerste', 'tien', 'dagen ', 'tot', 'en', 'met', '12', 'mei ', 'wordt', 'koud', 'en', 'somber', 'weer ', 'zegt', 'philippe', 'mievis ', 'maar', 'voordat', 'je', 'daarom', 'begint', 'te', 'treuren ', 'volgens', 'weerdienst', 'van', 'mievis', 'worden', 'die', 'tien', 'dagen', 'enige', 'koude', 'van', 'komende', 'drie', 'maanden ', ' het', 'wordt', 'periode', 'vol', 'warm', 'weer', 'vanaf', 'midden', 'mei ', 'aldus', 'mievis ', ' we

## Gets vector array from words

In [5]:
for article in processed_texts:
    count = CountVectorizer()
    bag = count.fit_transform(article)
    print(bag.toarray())


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Assessing word relevancy

In [27]:
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(processed_texts))) 

AttributeError: 'list' object has no attribute 'toarray'