In [154]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

In [155]:
# Load the news data set; the CSV file is semicolon-delimited.
data = pd.read_csv("../data/news.csv", sep = ";")

In [156]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,Datum,Titel,Text,Link,Quelle,Autor,count_words,count_data,count_capital_words,cleaned_text,origin,Autor.1
0,23.10.2020,Steirische Unis bauen trotz Coronakrise aus,"Und das sind die wichtigsten Projekte, die akt...",https://www.krone.at/2259121,Krone,Jörg Schwaiger,356,0,4,wichtigsten projekt aktuel graz leoben laufen ...,Austria,
1,14.12.2020,Immer mehr Firmen verlassen das Silicon Valley,Dort habe das Unternehmen bereits seinen größt...,https://www.krone.at/2297846,Krone,,199,0,2,unternehmen bereit größten beschäftigungsschwe...,Austria,
2,07.12.2020,Top-Waffenproduzenten nahmen fast 300 Mrd. € ein,Bei zwölf der 25 größten Rüstungskonzerne hand...,https://www.krone.at/2292763,Krone,,257,0,8,zwölf größten rüstungskonzern handelt unterneh...,Austria,
3,01.01.2021,Corona: Israel hat schon 1 Million Bürger geimpft,Eine Grafik auf der Website „Our World in Data...,https://www.krone.at/2309676,Krone,,336,0,1,grafik websit „our world data“ vergleicht vers...,Austria,
4,15.12.2020,Firmen informieren oft nicht über Mitarbeiterd...,Während nur rund die Hälfte der Unternehmen ih...,https://www.krone.at/2298543,Krone,,142,0,3,rund hälfte unternehmen mitarbeit speicherung ...,Austria,


# Preprocessing Functions

In [157]:
def convert_lower_case(data):
    """Lower-case ``data`` element-wise with ``np.char.lower``.

    ``data`` may be a single string or an array of strings; the result is a
    NumPy string array (0-d when a single string is passed).
    """
    lowered = np.char.lower(data)
    return lowered

In [158]:
def remove_stop_words(data):
    """Drop German stop words and single-character tokens from ``data``.

    ``data`` is converted with ``str()`` and tokenized with NLTK's
    ``word_tokenize``.

    Returns:
        str: the kept tokens, each preceded by one space (so a non-empty
        result starts with a space; an input without kept tokens gives "").
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once per token.
    stop_words = set(stopwords.words('german'))
    kept = [w for w in word_tokenize(str(data))
            if len(w) > 1 and w not in stop_words]
    return "".join(" " + w for w in kept)

In [159]:
def remove_punctuation(data):
    """Replace punctuation characters in ``data`` with spaces and drop commas.

    Each symbol (and the newline) is replaced by a single space; after every
    replacement one pass collapses double spaces into one. Commas are removed
    without inserting a space.

    Returns:
        The cleaned text as a NumPy string array (0-d for a single string),
        as produced by ``np.char.replace``.
    """
    # The backslash is escaped explicitly: the former "\]" was an invalid
    # escape sequence, which current Python versions warn about.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        # Collapse the double space a replacement may have produced.
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [160]:
def remove_apostrophe(data):
    """Delete every ASCII apostrophe (') from ``data``.

    Returns the result of ``np.char.replace``, i.e. a NumPy string array
    (0-d for a single string).
    """
    apostrophe = "'"
    return np.char.replace(data, apostrophe, "")

In [161]:
def stemming(data):
    """Stem every token of ``data`` with NLTK's ``PorterStemmer``.

    ``data`` is converted with ``str()`` and tokenized with ``word_tokenize``.

    Returns:
        str: the stems, each preceded by one space.

    NOTE(review): the Porter algorithm targets English, while the stop-word
    list used in this notebook is German - confirm this is intended.
    """
    porter = PorterStemmer()
    stems = [porter.stem(word) for word in word_tokenize(str(data))]
    return "".join(" " + stem for stem in stems)

In [162]:
def convert_numbers(data):
    """Spell out integer tokens of ``data`` with ``num2words``.

    ``data`` is converted with ``str()`` and tokenized with ``word_tokenize``.
    Tokens that ``int()`` can parse are replaced by their ``num2words``
    spelling; all other tokens are kept unchanged. Hyphens in the result
    are replaced by spaces.

    Returns:
        The converted text (each token preceded by one space) as a NumPy
        string array, as produced by ``np.char.replace``.
    """
    converted = []
    for w in word_tokenize(str(data)):
        try:
            w = num2words(int(w))
        except (ValueError, OverflowError):
            # Not an integer literal (or too large to spell out): keep the
            # token as it is. Only these errors are swallowed, so unrelated
            # failures are no longer hidden by a bare ``except``.
            pass
        converted.append(w)
    new_text = "".join(" " + w for w in converted)
    return np.char.replace(new_text, "-", " ")

In [163]:
def preprocess(data):
    """Run the full text-normalisation pipeline on ``data``.

    The steps are applied strictly in the order listed below; several of
    them are repeated on purpose because a later step can re-introduce what
    an earlier one removed.

    Returns whatever the last step (``remove_stop_words``) returns.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,   # commas are removed separately inside
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        stemming,
        remove_punctuation,
        convert_numbers,
        stemming,             # stem again: the words may have changed
        remove_punctuation,   # num2words yields hyphens/commas ("forty-one")
        remove_stop_words,    # num2words yields stop words ("one hundred and one")
    )
    for step in pipeline:
        data = step(data)
    return data

In [164]:
# Fetch the NLTK resources used in this notebook: the 'punkt' tokenizer
# models for word_tokenize and the 'stopwords' corpus that
# stopwords.words('german') reads in remove_stop_words. Without the corpus
# the stop-word lookup fails with a LookupError on a fresh machine.
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/retoheller/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Tokenisierung

In [165]:
# Tokenize the cleaned article text, one token list per row. Cells whose
# value is a float (presumably NaN for a missing text - verify) get the
# placeholder token list ["no"].
tokens = [
    ["no"] if type(data["cleaned_text"][row]) == float
    else word_tokenize(data["cleaned_text"][row])
    for row in range(len(data))
]

In [166]:
# Tokenize the raw titles, one token list per row. Cells whose value is a
# float (presumably NaN for a missing title - verify) get the placeholder
# token list ["no"].
# NOTE(review): unlike the body, the titles are tokenized without any
# lower-casing or cleaning - confirm this is intended.
tokens_title = [
    ["no"] if type(data["Titel"][row]) == float
    else word_tokenize(data["Titel"][row])
    for row in range(len(data))
]

### Hinzufügen der Token des Textes und des Titels zum Dataframe

In [167]:
# Store the token lists on the frame so every article row carries its own
# body tokens and title tokens.
data["tokens"] = tokens
data["tokens_title"] = tokens_title

In [168]:
# Document frequency: DF[term] = number of articles whose body tokens contain
# the term. The token lists are read from data["tokens"] rather than from the
# notebook-level ``tokens`` variable, which later cells overwrite; this keeps
# the cell correct when it is re-run.
postings = {}
for doc_id, doc_tokens in enumerate(data["tokens"]):
    for w in doc_tokens:
        # setdefault replaces the former try / bare-except used to create
        # the set on first sight of a term.
        postings.setdefault(w, set()).add(doc_id)

# Keep only the counts; the key order (first occurrence) is preserved.
DF = {w: len(doc_ids) for w, doc_ids in postings.items()}

### Totales Vokabular

In [169]:
# Number of distinct terms, i.e. one per key of the document-frequency table.
total_vocab_size = len(DF)

In [170]:
# Vocabulary as a list, in the key order of DF; a term's position in this
# list is its column index in the vector-space model.
total_vocab = list(DF)

In [171]:
def doc_freq(word):
    """Return the document frequency of ``word`` from ``DF``.

    Returns 0 for a word that is not a key of ``DF``.
    """
    # dict.get replaces the former try / bare-except / pass lookup.
    return DF.get(word, 0)

# TF-IDF für den Text Body

In [172]:
# TF-IDF weight of every body term, keyed by (document id, term).
#   tf  = occurrences of the term in body + title / number of body + title tokens
#   idf = log((N + 1) / (df + 1))
# The loop uses its own names (body_tokens, doc_count) so that it no longer
# overwrites the notebook-level ``tokens`` list or a ``df`` variable, and the
# document id comes straight from the loop instead of a separate counter.
N = len(data)
tf_idf = {}

for doc in range(N):
    body_tokens = data["tokens"][doc]
    combined = body_tokens + data["tokens_title"][doc]

    counter = Counter(combined)
    words_count = len(combined)

    for token in np.unique(body_tokens):
        tf = counter[token] / words_count
        doc_count = doc_freq(token)
        idf = np.log((N + 1) / (doc_count + 1))

        tf_idf[doc, token] = tf * idf

# TF-IDF für den Titel

In [173]:
# TF-IDF weight of every title term, keyed by (document id, term).
#   tf  = occurrences of the term in title + body / number of title + body tokens
#   idf = log((N + 1) / (df + 1)); N + 1 >= df + 1, so the log is never negative
# The loop uses its own names (title_tokens, doc_count) so that it no longer
# overwrites the notebook-level ``tokens`` list or a ``df`` variable, and the
# document id comes straight from the loop instead of a separate counter.
# NOTE(review): DF is built from the cleaned body tokens while the title
# tokens are raw, so many title terms presumably get df = 0 - verify.
tf_idf_title = {}

for doc in range(N):
    title_tokens = data["tokens_title"][doc]
    combined = title_tokens + data["tokens"][doc]

    counter = Counter(combined)
    words_count = len(combined)

    for token in np.unique(title_tokens):
        tf = counter[token] / words_count
        doc_count = doc_freq(token)
        idf = np.log((N + 1) / (doc_count + 1))

        tf_idf_title[doc, token] = tf * idf

In [174]:
# Alpha: factor by which the body TF-IDF weights in tf_idf are multiplied
# before the title weights are written over them.
alpha = 0.3

# Create TF-IDF Dictionary

In [175]:
# Scale every body weight by alpha.
# NOTE: the dictionary is modified in place, so running this cell a second
# time applies alpha again.
for key in list(tf_idf):
    tf_idf[key] = tf_idf[key] * alpha

In [176]:
# Merge the title weights into tf_idf; for a (document, term) pair present
# in both dictionaries the title weight replaces the scaled body weight.
tf_idf.update(tf_idf_title)

# Extract Top 5 Key Words per Article

In [276]:
# Collect the (term, weight) pairs of every document in ONE pass over tf_idf.
# The former version rescanned the whole dictionary once per document.
# The insertion order of tf_idf is kept inside each group, so the rows reach
# the sort in the same order as before.
entries_by_doc = {}
for (doc_id, term), weight in tf_idf.items():
    entries_by_doc.setdefault(doc_id, []).append((term, weight))

values_total = []  # per document: its top key words (terms), best first
token_total = []   # per document: the TF-IDF weights of those key words
docs_total = []    # per document: the document id, repeated per key word

for doc_id in range(len(data["cleaned_text"])):
    entries = entries_by_doc.get(doc_id, [])
    ranked = pd.DataFrame({
        "term": [term for term, _ in entries],
        "score": [weight for _, weight in entries],
        "doc": [doc_id] * len(entries),
    })
    # head() keeps the five highest-scoring rows (pandas default n=5).
    top = ranked.sort_values(by='score', ascending=False).head()
    values_total.append(top["term"].to_list())
    token_total.append(top["score"].to_list())
    docs_total.append(top["doc"].to_list())

In [286]:
# Spread each document's key-word list over five column lists
# (values_1 ... values_5). Documents with fewer than five key words are
# padded with "" so that none of their key words is dropped, and a document
# without any key word yields five empty strings instead of an IndexError.
docs_ = []  # not used in this cell; kept in case other cells refer to it
values_1, values_2, values_3, values_4, values_5 = [], [], [], [], []
keyword_columns = (values_1, values_2, values_3, values_4, values_5)

for keywords in values_total:
    padded = list(keywords[:len(keyword_columns)])
    padded += [""] * (len(keyword_columns) - len(padded))
    for column, keyword in zip(keyword_columns, padded):
        column.append(keyword)


### Spalten der Key Words zum Dataframe hinzufügen

In [298]:
# Add one column per key-word rank to the frame.
keyword_column_values = {
    "key_word1": values_1,
    "key_word2": values_2,
    "key_word3": values_3,
    "key_word4": values_4,
    "key_word5": values_5,
}
for column_name, column_values in keyword_column_values.items():
    data[column_name] = column_values

In [300]:
# Preview the frame again, now including the token and key-word columns.
data.head()

Unnamed: 0,Datum,Titel,Text,Link,Quelle,Autor,count_words,count_data,count_capital_words,cleaned_text,origin,Autor.1,tokens,tokens_title,key_word1,key_word2,key_word3,key_word4,key_word5
0,23.10.2020,Steirische Unis bauen trotz Coronakrise aus,"Und das sind die wichtigsten Projekte, die akt...",https://www.krone.at/2259121,Krone,Jörg Schwaiger,356,0,4,wichtigsten projekt aktuel graz leoben laufen ...,Austria,,"[wichtigsten, projekt, aktuel, graz, leoben, l...","[Steirische, Unis, bauen, trotz, Coronakrise, ...",graz,Unis,Steirische,Coronakrise,leoben
1,14.12.2020,Immer mehr Firmen verlassen das Silicon Valley,Dort habe das Unternehmen bereits seinen größt...,https://www.krone.at/2297846,Krone,,199,0,2,unternehmen bereit größten beschäftigungsschwe...,Austria,,"[unternehmen, bereit, größten, beschäftigungss...","[Immer, mehr, Firmen, verlassen, das, Silicon,...",Firmen,Valley,Silicon,Immer,das
2,07.12.2020,Top-Waffenproduzenten nahmen fast 300 Mrd. € ein,Bei zwölf der 25 größten Rüstungskonzerne hand...,https://www.krone.at/2292763,Krone,,257,0,8,zwölf größten rüstungskonzern handelt unterneh...,Austria,,"[zwölf, größten, rüstungskonzern, handelt, unt...","[Top-Waffenproduzenten, nahmen, fast, 300, Mrd...",nahmen,€,Top-Waffenproduzenten,Mrd,300
3,01.01.2021,Corona: Israel hat schon 1 Million Bürger geimpft,Eine Grafik auf der Website „Our World in Data...,https://www.krone.at/2309676,Krone,,336,0,1,grafik websit „our world data“ vergleicht vers...,Austria,,"[grafik, websit, „, our, world, data, “, vergl...","[Corona, :, Israel, hat, schon, 1, Million, Bü...",israel,:,Million,Bürger,Corona
4,15.12.2020,Firmen informieren oft nicht über Mitarbeiterd...,Während nur rund die Hälfte der Unternehmen ih...,https://www.krone.at/2298543,Krone,,142,0,3,rund hälfte unternehmen mitarbeit speicherung ...,Austria,,"[rund, hälfte, unternehmen, mitarbeit, speiche...","[Firmen, informieren, oft, nicht, über, Mitarb...",Mitarbeiterdaten,Firmen,über,betriebsvereinbarungen,ak


# Ranking using Matching Score

In [290]:
def matching_score(k, query):
    """Print the ids of the ``k`` documents that best match ``query``.

    The query is run through ``preprocess`` and tokenized. A document's score
    is the sum of its ``tf_idf`` weights over all terms that occur in the
    query. Documents without a matching term get no score and are not listed.

    Args:
        k: maximum number of document ids to print.
        query: free-text query string.

    Prints the query, its tokens and the list of document ids ordered by
    descending score. Returns nothing.
    """
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("Matching Score")
    print("\nQuery:", query)
    print("")
    print(tokens)

    # Set lookup instead of scanning the token list for every tf_idf key.
    query_terms = set(tokens)
    query_weights = {}

    for (doc_id, term), weight in tf_idf.items():
        if term in query_terms:
            query_weights[doc_id] = query_weights.get(doc_id, 0) + weight

    ranked = sorted(query_weights.items(), key=lambda item: item[1], reverse=True)

    print("")

    # Honour k; the cut-off used to be hard-coded to 10.
    l = [doc_id for doc_id, _ in ranked[:k]]

    print(l)
    

# Rank the articles for a sample (English) query using the matching score.
matching_score(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")

Matching Score

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying

['without', 'the', 'drive', 'of', 'rebeccah', 'insist', 'kate', 'lost', 'her', 'momentum', 'she', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

[345, 184, 362, 514, 175, 495, 961, 333, 8, 57]


# Ranking Cosine Similarity

In [291]:
def cosine_sim(a, b):
    """Return the cosine similarity of the vectors ``a`` and ``b``.

    If either vector has zero length the similarity is undefined; 0.0 is
    returned in that case instead of the NaN that 0/0 would produce.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # NaN would be placed last by argsort and thus look like a top hit.
        return 0.0
    return np.dot(a, b) / denominator


In [292]:
# Document-term matrix: row = document id, column = position of the term in
# total_vocab, value = TF-IDF weight.
# The column of each term is looked up in a dictionary built once; calling
# total_vocab.index() per entry scanned the whole vocabulary every time.
vocab_index = {term: column for column, term in enumerate(total_vocab)}

D = np.zeros((N, total_vocab_size))
for (doc_id, term), weight in tf_idf.items():
    column = vocab_index.get(term)
    if column is not None:
        D[doc_id, column] = weight
    # Terms that are not in total_vocab are skipped, as before.

In [293]:
def gen_vector(tokens):
    """Build the TF-IDF vector of a tokenized query.

    Args:
        tokens: list of query tokens.

    Returns:
        numpy.ndarray of length ``len(total_vocab)``. The entry at a term's
        position in ``total_vocab`` is tf * idf with
        tf = occurrences in ``tokens`` / ``len(tokens)`` and
        idf = log((N + 1) / (doc_freq(term) + 1)).
        Tokens that are not in ``total_vocab`` are ignored.
    """
    Q = np.zeros((len(total_vocab)))

    # One dictionary built per call replaces a total_vocab.index() scan per
    # distinct token.
    vocab_index = {term: column for column, term in enumerate(total_vocab)}

    counter = Counter(tokens)
    words_count = len(tokens)

    for token, count in counter.items():
        column = vocab_index.get(token)
        if column is None:
            continue

        tf = count / words_count
        idf = math.log((N + 1) / (doc_freq(token) + 1))
        Q[column] = tf * idf
    return Q

In [294]:
def cosine_similarity(k, query):
    """Rank the documents by cosine similarity to ``query``.

    The query is run through ``preprocess``, tokenized and turned into a
    TF-IDF vector with ``gen_vector``; that vector is compared with every
    row of the document-term matrix ``D`` using ``cosine_sim``.

    Args:
        k: number of document ids to report.
        query: free-text query string.

    Returns:
        numpy.ndarray with the row indices of the ``k`` most similar
        documents, best first. The same array is printed, together with the
        query and its tokens.
    """
    print("Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("\nQuery:", query)
    print("")
    print(tokens)

    query_vector = gen_vector(tokens)

    d_cosines = np.array([cosine_sim(query_vector, d) for d in D], dtype=float)
    # An undefined similarity (NaN from a zero-length vector) counts as 0:
    # argsort places NaN last, which would report such documents as the best.
    d_cosines[np.isnan(d_cosines)] = 0.0

    out = d_cosines.argsort()[-k:][::-1]

    print("")

    print(out)

    # Returned so that callers assigning the result get the ranking.
    return out

# Rank the articles for the same sample query using cosine similarity.
Q = cosine_similarity(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")

Cosine Similarity

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying

['without', 'the', 'drive', 'of', 'rebeccah', 'insist', 'kate', 'lost', 'her', 'momentum', 'she', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

[ 184  362  345  619  961  514  175  895 1003  542]
