In [1]:
def load_text(path, encoding='utf-8'):
    """Read a text file and return its contents flattened onto one line.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read.
    encoding : str, optional
        Encoding used to decode the file. Defaults to ``'utf-8'`` so the
        result does not depend on the platform's locale settings, which
        matters for text containing non-ASCII characters such as
        typographic quotes.

    Returns
    -------
    str
        The file contents with every newline character replaced by a space.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid in the requested encoding.
    """
    with open(path, encoding=encoding) as f:
        return f.read().replace('\n', ' ')

In [2]:
def put_text_in_lowercase(text):
    """Lower-case ``text`` word by word and normalise its spacing.

    The text is split on runs of whitespace, every piece is lower-cased, and
    the pieces are joined back together with single spaces. As a consequence
    leading/trailing whitespace is dropped and any internal run of whitespace
    collapses to one space.

    Parameters
    ----------
    text : str
        The text to convert.

    Returns
    -------
    str
        The lower-cased, single-space-separated text.
    """
    return ' '.join(piece.lower() for piece in text.split())

In [3]:
def tokenize(text, tokenizer):
    """Split ``text`` into tokens with the supplied tokenizer.

    Parameters
    ----------
    text : str
        The text to tokenize.
    tokenizer : object
        Any object exposing a ``tokenize(text)`` method.

    Returns
    -------
    Whatever ``tokenizer.tokenize(text)`` returns, passed through unchanged.
    """
    return tokenizer.tokenize(text)

In [4]:
def remove_stop_words(tokenized_text, stop_words_list):
    """Return the tokens of ``tokenized_text`` that are not stop words.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens to filter; their relative order is preserved.
    stop_words_list : iterable of str
        Words to drop. Matching is exact (case- and punctuation-sensitive).

    Returns
    -------
    list
        The tokens that do not appear in ``stop_words_list``.
    """
    # Build the lookup once: testing membership against a list for every token
    # costs O(len(tokens) * len(stop_words)), and a one-shot iterable would be
    # consumed by the first few lookups.
    try:
        stop_words = frozenset(stop_words_list)
    except TypeError:
        # Unhashable entries cannot go in a set; keep the original container
        # and its linear-scan membership test.
        stop_words = stop_words_list
    return [token for token in tokenized_text if token not in stop_words]

In [15]:
def get_stems(tokens, stemmer):
    """Stem every token with the supplied stemmer.

    Parameters
    ----------
    tokens : iterable
        Tokens to stem, processed in order.
    stemmer : object
        Any object exposing a ``stem(token)`` method.

    Returns
    -------
    list
        ``stemmer.stem(token)`` for each token, in the input order.
    """
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

In [6]:
def get_frequency(words):
    """Count how many times each distinct word occurs in ``words``.

    Parameters
    ----------
    words : iterable of hashable
        The words (e.g. stems) to count. Any iterable is accepted, including
        generators.

    Returns
    -------
    dict
        Maps each distinct word to its number of occurrences. Keys appear in
        order of first occurrence.
    """
    # Single pass: calling ``words.count(word)`` for every vocabulary entry
    # rescans the whole sequence each time (quadratic) and requires ``words``
    # to provide a ``count`` method.
    freq = {}
    for word in words:
        freq[word] = freq.get(word, 0) + 1
    return freq

In [8]:
# Load the raw article from disk; load_text returns it as a single string
# with newlines replaced by spaces. The path is relative to the notebook's
# working directory.
text = load_text('data/news_article.txt')
# Bare last expression: show the loaded text as the cell output.
text



In [9]:
# Normalise case so that later comparisons (e.g. against the lower-case stop
# word list) are not affected by capitalisation.
lowercase_text = put_text_in_lowercase(text)
# Bare last expression: show the lower-cased text as the cell output.
lowercase_text



In [11]:
# NOTE(review): this import lives in the cell that uses it rather than in a
# single imports cell at the top of the notebook.
from nltk.tokenize import WhitespaceTokenizer
whitespace_tokenizer = WhitespaceTokenizer()
# Whitespace tokenization leaves punctuation attached to the neighbouring
# word (the output contains tokens such as '2015,' and 'courts.').
tokenized_text = tokenize(lowercase_text, whitespace_tokenizer)
# Bare last expression: show the token list as the cell output.
tokenized_text

['ever',
 'since',
 'the',
 'populist',
 'law',
 'and',
 'justice',
 '(pis)',
 'party',
 'took',
 'power',
 'in',
 '2015,',
 'adam',
 'bodnar,',
 'poland’s',
 'human-rights',
 'ombudsman,',
 'has',
 'been',
 'against',
 'its',
 'relentless',
 'efforts',
 'to',
 'get',
 'control',
 'of',
 'the',
 'courts.',
 'to',
 'illustrate',
 'the',
 'danger,',
 'he',
 'uses',
 'an',
 'expression',
 'from',
 'communist',
 'times:',
 'lex',
 'telefonica.',
 'in',
 'the',
 'polish',
 'people’s',
 'republic,',
 'verdicts',
 'were',
 'routinely',
 'dictated',
 'by',
 'a',
 'phone',
 'call',
 'from',
 'an',
 'apparatchik',
 'at',
 'party',
 'headquarters.',
 'today’s',
 'government',
 'has',
 'more',
 'subtle',
 'techniques,',
 'but',
 'the',
 'goal',
 'is',
 'the',
 'same,',
 'mr',
 'bodnar',
 'says:',
 '“if',
 'a',
 'judge',
 'has',
 'a',
 'case',
 'on',
 'his',
 'desk',
 'with',
 'some',
 'political',
 'importance,',
 'he',
 'should',
 'be',
 'afraid.”',
 'the',
 'european',
 'commission',
 'is',
 'wo

In [13]:
# NOTE(review): this import lives in the cell that uses it rather than in a
# single imports cell at the top of the notebook.
from nltk.corpus import stopwords
# English stop word list shipped with NLTK; the 'stopwords' corpus must be
# available locally for this call to succeed.
stop_words = stopwords.words('english')
# Matching is exact, so tokens with attached punctuation are kept even when
# the bare word is a stop word (e.g. 'too.' survives in the output).
clean_text = remove_stop_words(tokenized_text, stop_words)
# Bare last expression: show the filtered tokens as the cell output.
clean_text

['ever',
 'since',
 'populist',
 'law',
 'justice',
 '(pis)',
 'party',
 'took',
 'power',
 '2015,',
 'adam',
 'bodnar,',
 'poland’s',
 'human-rights',
 'ombudsman,',
 'relentless',
 'efforts',
 'get',
 'control',
 'courts.',
 'illustrate',
 'danger,',
 'uses',
 'expression',
 'communist',
 'times:',
 'lex',
 'telefonica.',
 'polish',
 'people’s',
 'republic,',
 'verdicts',
 'routinely',
 'dictated',
 'phone',
 'call',
 'apparatchik',
 'party',
 'headquarters.',
 'today’s',
 'government',
 'subtle',
 'techniques,',
 'goal',
 'same,',
 'mr',
 'bodnar',
 'says:',
 '“if',
 'judge',
 'case',
 'desk',
 'political',
 'importance,',
 'afraid.”',
 'european',
 'commission',
 'worried,',
 'too.',
 'accuses',
 'pis',
 'violating',
 'poland’s',
 'commitments',
 'rule',
 'law',
 'european',
 'union’s',
 'founding',
 'treaty.',
 '2017',
 'commission',
 'took',
 'poland',
 'european',
 'court',
 'justice',
 '(ecj)',
 'laws',
 'gave',
 'politicians',
 'control',
 'appointing',
 'judges.',
 '(for',
 '

In [17]:
# NOTE(review): this import lives in the cell that uses it rather than in a
# single imports cell at the top of the notebook.
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
# Reduce each remaining token to its stem; stems are not necessarily
# dictionary words (the output contains e.g. 'justic' and 'parti').
stems = get_stems(clean_text, porter_stemmer)
# Bare last expression: show the stems as the cell output.
stems

['ever',
 'sinc',
 'populist',
 'law',
 'justic',
 '(pis)',
 'parti',
 'took',
 'power',
 '2015,',
 'adam',
 'bodnar,',
 'poland’',
 'human-right',
 'ombudsman,',
 'warn',
 'relentless',
 'effort',
 'get',
 'control',
 'courts.',
 'illustr',
 'danger,',
 'use',
 'express',
 'communist',
 'times:',
 'lex',
 'telefonica.',
 'polish',
 'people’',
 'republic,',
 'verdict',
 'routin',
 'dictat',
 'phone',
 'call',
 'apparatchik',
 'parti',
 'headquarters.',
 'today’',
 'govern',
 'subtl',
 'techniques,',
 'goal',
 'same,',
 'mr',
 'bodnar',
 'says:',
 '“if',
 'judg',
 'case',
 'desk',
 'polit',
 'importance,',
 'afraid.”',
 'european',
 'commiss',
 'worried,',
 'too.',
 'accus',
 'pi',
 'violat',
 'poland’',
 'commit',
 'rule',
 'law',
 'european',
 'union’',
 'found',
 'treaty.',
 '2017',
 'commiss',
 'took',
 'poland',
 'european',
 'court',
 'justic',
 '(ecj)',
 'law',
 'gave',
 'politician',
 'control',
 'appoint',
 'judges.',
 '(for',
 'example,',
 'lower',
 'judges’',
 'retir',
 'age'

In [18]:
# Count occurrences of each distinct stem and display the resulting dict.
# Punctuation-attached variants are counted as separate keys (the output has
# both 'bodnar,' and 'bodnar').
get_frequency(stems)

{'age': 1,
 'mr': 1,
 'danger,': 1,
 'subtl': 1,
 'let': 1,
 'bodnar,': 1,
 'poland’': 2,
 'too.': 1,
 'pick': 1,
 '2015,': 1,
 '(for': 1,
 'worried,': 1,
 'judges.': 1,
 'times:': 1,
 'gave': 1,
 'use': 1,
 'get': 1,
 'case': 1,
 'appoint': 1,
 'ecj': 1,
 'techniques,': 1,
 'desk': 1,
 'law': 3,
 'ombudsman,': 1,
 'retir': 1,
 'found': 1,
 'polit': 1,
 'ever': 1,
 'judg': 1,
 '“if': 1,
 'accus': 1,
 'people’': 1,
 'judges’': 1,
 'express': 1,
 'goal': 1,
 'same,': 1,
 'illustr': 1,
 'minist': 1,
 'govern': 1,
 'routin': 1,
 'importance,': 1,
 'headquarters.': 1,
 'european': 3,
 'lex': 1,
 'parti': 2,
 'scrap': 1,
 'verdict': 1,
 'poles,': 1,
 'meantim': 1,
 'measures.': 1,
 'phone': 1,
 'bodnar': 1,
 'adam': 1,
 'populist': 1,
 'polish': 1,
 'effort': 1,
 'republic,': 1,
 'treaty.': 1,
 'power': 1,
 'today’': 1,
 'lower': 1,
 'says:': 1,
 'afraid.”': 1,
 'commit': 1,
 'relentless': 1,
 'telefonica.': 1,
 'human-right': 1,
 'warn': 1,
 'commiss': 2,
 'politician': 1,
 'pi': 1,
 '(pis)