In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

In [2]:
data = pd.read_csv("../data/news.csv", sep = ";")

In [3]:
data.head()

Unnamed: 0,Datum,Titel,Text,Link,Quelle,Autor,count_words,count_data,count_capital_words,cleaned_text,origin,Autor.1
0,23.10.2020,Steirische Unis bauen trotz Coronakrise aus,"Und das sind die wichtigsten Projekte, die akt...",https://www.krone.at/2259121,Krone,Jörg Schwaiger,356,0,4,wichtigsten projekt aktuel graz leoben laufen ...,Austria,
1,14.12.2020,Immer mehr Firmen verlassen das Silicon Valley,Dort habe das Unternehmen bereits seinen größt...,https://www.krone.at/2297846,Krone,,199,0,2,unternehmen bereit größten beschäftigungsschwe...,Austria,
2,07.12.2020,Top-Waffenproduzenten nahmen fast 300 Mrd. € ein,Bei zwölf der 25 größten Rüstungskonzerne hand...,https://www.krone.at/2292763,Krone,,257,0,8,zwölf größten rüstungskonzern handelt unterneh...,Austria,
3,01.01.2021,Corona: Israel hat schon 1 Million Bürger geimpft,Eine Grafik auf der Website „Our World in Data...,https://www.krone.at/2309676,Krone,,336,0,1,grafik websit „our world data“ vergleicht vers...,Austria,
4,15.12.2020,Firmen informieren oft nicht über Mitarbeiterd...,Während nur rund die Hälfte der Unternehmen ih...,https://www.krone.at/2298543,Krone,,142,0,3,rund hälfte unternehmen mitarbeit speicherung ...,Austria,


# Preprocessing Functions

In [4]:
def convert_lower_case(data):
    return np.char.lower(data)

In [5]:
def remove_stop_words(data):
    stop_words = stopwords.words('german')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [6]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [7]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [8]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [9]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [10]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

In [11]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/retoheller/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Tokenisisierung

In [12]:
tokens = []

for i in range(len(data)):
    if type(data["cleaned_text"][i]) == float:
        tokens.append(["no"])
    else:
        tokens.append(word_tokenize(data["cleaned_text"][i]))

In [13]:
tokens_title = []

for i in range(len(data)):
    if type(data["Titel"][i]) == float:
        tokens_title.append(["no"])
    else:
        tokens_title.append(word_tokenize(data["Titel"][i]))

### Hinzufügen der Token des Textes und des Titels zum Dataframe

In [14]:
data["tokens"] = tokens
data["tokens_title"] = tokens_title

# Installieren eines POS-Taggers für die deutsche Sprache

In [15]:
import nltk
#nltk.download('averaged_perceptron_tagger')
corp = nltk.corpus.ConllCorpusReader('.', 'tiger_release_aug07.corrected.16012013.conll09',
                                     ['ignore', 'words', 'ignore', 'ignore', 'pos'],
                                     encoding='utf-8')

In [16]:
import random

tagged_sents = list(corp.tagged_sents())
random.shuffle(tagged_sents)

# set a split size: use 90% for training, 10% for testing
split_perc = 0.1
split_size = int(len(tagged_sents) * split_perc)
train_sents, test_sents = tagged_sents[split_size:], tagged_sents[:split_size]

In [17]:
# -*- coding: utf-8 -*-
#
# Natural Language Toolkit: ClassifierBasedGermanTagger
#
# URL: <http://www.experimentallabor.de/>
#
# Copyright 2011 Philipp Nolte
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Tag German text.
"""

import re
from nltk.tag.sequential import ClassifierBasedTagger

class ClassifierBasedGermanTagger(ClassifierBasedTagger):
    """A classifier based German part-of-speech tagger. It has an accuracy of
    96.09% after being trained on 90% of the German TIGER corpus. The tagger
    extends the NLTK ClassifierBasedTagger and implements a slightly modified
    feature detector.
    """

    def feature_detector(self, tokens, index, history):
        """Implementing a slightly modified feature detector.
        @param tokens: The tokens from the sentence to tag.
        @param index: The current token index to tag.
        @param history: The previous tagged tokens.
        """

        word = tokens[index]
        if index == 0: # At the beginning of the sentence
            prevword = prevprevword = None
            prevtag = prevprevtag = None
            #word = word.lower() # Lowercase at the beginning of sentence
        elif index == 1:
            prevword = tokens[index-1] # Note: no lowercase
            prevprevword = None
            prevtag = history[index-1]
            prevprevtag = None
        else:
            prevword = tokens[index-1]
            prevprevword = tokens[index-2]
            prevtag = history[index-1]
            prevprevtag = history[index-2]

        if re.match('[0-9]+([\.,][0-9]*)?|[0-9]*[\.,][0-9]+$', word):
            # Included "," as decimal point
            shape = 'number'
        elif re.compile('\W+$', re.UNICODE).match(word):
            # Included unicode flag
            shape = 'punct'
        elif re.match('([A-ZÄÖÜ]+[a-zäöüß]*-?)+$', word):
            # Included dash for dashed words and umlauts
            shape = 'upcase'
        elif re.match('[a-zäöüß]+', word):
            # Included umlauts
            shape = 'downcase'
        elif re.compile("\w+", re.UNICODE).match(word):
            # Included unicode flag
            shape = 'mixedcase'
        else:
            shape = 'other'

        features = {
            'prevtag': prevtag,
            'prevprevtag': prevprevtag,
            'word': word,
            'word.lower': word.lower(),
            'suffix3': word.lower()[-3:],
            #'suffix2': word.lower()[-2:],
            #'suffix1': word.lower()[-1:],
            'preffix1': word[:1], # included
            'prevprevword': prevprevword,
            'prevword': prevword,
            'prevtag+word': '%s+%s' % (prevtag, word),
            'prevprevtag+word': '%s+%s' % (prevprevtag, word),
            'prevword+word': '%s+%s' % (prevword, word),
            'shape': shape
            }
        return features

In [18]:
tagger = ClassifierBasedGermanTagger(train=train_sents)

### Tokenisierung auf den Rohtexten durchführen

In [19]:
for i in range(len(data)):
    data["Text"][i] = convert_lower_case(str(data["Text"][i]))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Text"][i] = convert_lower_case(str(data["Text"][i]))


In [20]:
for i in range(len(data)):
    data["Text"][i] = remove_punctuation(str(data["Text"][i]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Text"][i] = remove_punctuation(str(data["Text"][i]))


In [21]:
for i in range(len(data)):
    data["Text"][i] = remove_apostrophe(str(data["Text"][i]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Text"][i] = remove_apostrophe(str(data["Text"][i]))


In [22]:
for i in range(len(data)):
    data["Text"][i] = convert_numbers(str(data["Text"][i]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Text"][i] = convert_numbers(str(data["Text"][i]))


In [23]:
# Test tokenisieren
tokens_uncleaned = []

for i in range(len(data)):
    if type(data["Text"][i]) == float:
        tokens_uncleaned.append(["no"])
    else:
        tokens_uncleaned.append(word_tokenize(str(data["Text"][i])))

In [24]:
data["tokens_uncleaned"] = tokens_uncleaned

In [25]:
white_list_tags = ["NN","NNP","NE","ADJA"]
matched_tokens_total = []

for i in range(len(data["tokens_uncleaned"])):
    tagged_tokens = tagger.tag(data["tokens_uncleaned"][i])
    matched_tokens = []
    for i in range(len(tagged_tokens)):
        if tagged_tokens[i][1] in white_list_tags:
            matched_tokens.append(tagged_tokens[i][0])
        else:
            pass
    
    matched_tokens_total.append(matched_tokens)
    

In [106]:
#matched_tokens_total[1]

In [26]:
data["tagged_tokens"] = matched_tokens_total

In [27]:
# Titel tokenisieren
titels_uncleaned = []

for i in range(len(data)):
    if type(data["Titel"][i]) == float:
        titels_uncleaned.append(["no"])
    else:
        titels_uncleaned.append(word_tokenize(data["Titel"][i]))

In [28]:
data["titel_uncleaned"] = titels_uncleaned

In [29]:
white_list_tags = ["NN","NNP","NE","ADJA"]
matched_titels_total = []

for i in range(len(data["titel_uncleaned"])):
    tagged_titels = tagger.tag(data["titel_uncleaned"][i])
    matched_titels = []
    for i in range(len(tagged_titels)):
        if tagged_titels[i][1] in white_list_tags:
            matched_titels.append(tagged_titels[i][0])
        else:
            pass
    
    matched_titels_total.append(matched_titels)

In [30]:
data["tagged_titels"] = matched_titels_total

In [28]:
data.to_csv("../data/news_tagged.csv",index=False,sep=";")

In [31]:
DF = {}

for i in range(len(matched_tokens_total)):
    token = matched_tokens_total[i]
    for w in token:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
            
for i in DF:
    DF[i] = len(DF[i])

### Totales Vokabular

In [32]:
total_vocab_size = len(DF)

In [33]:
total_vocab = [x for x in DF]

In [34]:
# Function for Doc Frequency
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

# TF-IDF für den Text Body

In [50]:
#data = pd.read_csv("../data/news_tagged.csv", sep = ";")

In [117]:
#matched_tokens_total = data["tagged_tokens"] 

In [35]:
total_tagged_tokens = []
for i in range(len(matched_tokens_total)):
    for j in range(len(matched_tokens_total[i])):
        total_tagged_tokens.append(matched_tokens_total[i][j])
        

In [37]:
doc = 0
N = len(data)
tf_idf = {}

for i in range(N):
    
    tokens = data["tagged_tokens"][i]
    
    counter = Counter(tokens + matched_titels_total[i])
    words_count = len(tokens + matched_titels_total[i])
    print(i)
    for token in np.unique(total_tagged_tokens):
        if words_count == 0:
            tf = 0
        else:
        
            tf = counter[token]/words_count
            df = doc_freq(token)
            idf = np.log((N+1)/(df+1))
        
            tf_idf[doc, token] = tf*idf

    doc += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

# TF-IDF für den Titel

In [38]:
doc = 0

tf_idf_title = {}

for i in range(N):
    
    tokens = data["tagged_titels"][i]
    counter = Counter(tokens + data["tagged_tokens"][i])
    words_count = len(tokens + data["tagged_tokens"][i])
    print(i)

    for token in np.unique(total_tagged_tokens):
        if words_count == 0:
            tf = 0
        else:
            tf = counter[token]/words_count
            df = doc_freq(token)
            idf = np.log((N+1)/(df+1)) #numerator is added 1 to avoid negative values
        
            tf_idf_title[doc, token] = tf*idf

    doc += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [39]:
# Alpha
alpha = 0.3

In [40]:
total_tagged_tokens


['wichtigsten',
 'projekte',
 'flaneure',
 'projekte',
 'fassadenrenovierung',
 'hauptgebäude',
 'umfassende',
 'innenhofs',
 'campus',
 'inffeldgasse',
 'technischen',
 'neubauten',
 'investitionsvolumen',
 'five',
 'gebäude',
 'eight',
 'großen',
 'grundstücksfläche',
 'raumfläche',
 'two',
 'and',
 'neuen',
 'tu',
 'zukunftsbereichen',
 'medizinische',
 'modernsten',
 'zentren',
 'medizinische',
 'forschung',
 'europa',
 'module',
 'anatomie',
 'größten',
 'universitätsbauten',
 'neue',
 'anatomie',
 'med',
 'historische',
 'pathologie',
 'neubau',
 'hörsaal',
 'five',
 'hundred',
 'seziersäle',
 'tischen',
 'historische',
 'moderne',
 'ergänzung',
 'innenhof',
 'graz',
 'universität',
 'darstellende',
 'landeshauptstadt',
 'ausbau',
 'sanierung',
 'institutsgebäudes',
 'brandhofgasse',
 'plan',
 'zuge',
 'erweiterung',
 'institutsgebäude',
 'bibliotheksgebäude',
 'zubau',
 'helle',
 'architekten',
 'neues',
 'gläserne',
 'gelbgoldenen',
 'lamellen',
 'harfensaiten',
 'bestand',
 'z

# Create TF-IDF Dictionary

In [41]:
for i in tf_idf:
    tf_idf[i] *= alpha

In [42]:
for i in tf_idf_title:
    tf_idf[i] = tf_idf_title[i]

In [131]:
tf_idf

{(0, '%'): 0.0,
 (0, "'Fehler"): 0.0,
 (0, "'Sozialen"): 0.0,
 (0, '+3'): 0.0,
 (0, ',1984'): 0.0,
 (0, '-1'): 0.0,
 (0, '-Aktionär'): 0.0,
 (0, '-Aktualisiert'): 0.0,
 (0, '-Ausfällen'): 0.0,
 (0, '-Berater'): 0.0,
 (0, '-Bericht'): 0.0,
 (0, '-Bewegung'): 0.0,
 (0, '-Bücher'): 0.0,
 (0, '-Bürgern'): 0.0,
 (0, '-CEO'): 0.0,
 (0, '-Dokumente'): 0.0,
 (0, '-Drohungen'): 0.0,
 (0, '-Emissionen'): 0.0,
 (0, '-Fragen'): 0.0,
 (0, '-Investor'): 0.0,
 (0, '-Konkurrenz'): 0.0,
 (0, '-Kratzen'): 0.0,
 (0, '-Literatur'): 0.0,
 (0, '-Medien'): 0.0,
 (0, '-Papers'): 0.0,
 (0, '-Partnerschaften'): 0.0,
 (0, '-Produkte'): 0.0,
 (0, '-Software'): 0.0,
 (0, '-Spielen'): 0.0,
 (0, '-Turnier'): 0.0,
 (0, '-Videos'): 0.0,
 (0, '-Zeitung'): 0.0,
 (0, '-analyse'): 0.0,
 (0, '-anleihen'): 0.0,
 (0, '-aufteilungen'): 0.0,
 (0, '-beschreibung'): 0.0,
 (0, '-bewohnerinnen'): 0.0,
 (0, '-entwicklung'): 0.0,
 (0, '-findern'): 0.0,
 (0, '-folgenabschätzung'): 0.0,
 (0, '-geschwindigkeit'): 0.0,
 (0, '-handel'): 

# Extract Top 5 Key Words per Article

In [43]:
values_total = []
token_total = []
docs_total = []

for i in range(len(data["cleaned_text"])):
    values=[]
    document=[]
    token = []
    for key in tf_idf:
        if key[0] == i:
            document.append(key[0])
            values.append(key[1])
            token.append(tf_idf[key])
        else:
            pass
        
    print(i)      
    df = pd.DataFrame()
    df["values"] = values
    df["token"] = token
    df["doc"] = document
    df = df.sort_values(by='token', ascending=False)
    df = df.head()
    values_total.append(df["values"].to_list())
    token_total.append(df["token"].to_list())
    docs_total.append(df["doc"].to_list())

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [46]:
docs_ = []
values_1 = []
values_2 = []
values_3 = []
values_4 = []
values_5 = []

for i in range(len(values_total)):
    if len(values_total[i]) == 5:
        values_1.append(values_total[i][0])
        values_2.append(values_total[i][1])
        values_3.append(values_total[i][2])
        values_4.append(values_total[i][3])
        values_5.append(values_total[i][4])
    else:
        values_1.append("")
        values_2.append("")
        values_3.append("")
        values_4.append("")
        values_5.append("")


### Spalten der Key Words zum Dataframe hinzufügen

In [45]:
values_total[1]

['abwanderungswelle', 'attraktiver', 'verstärkter', 'bürotürme', 'markanten']

In [47]:
data["key_word1"] = values_1
data["key_word2"] = values_2
data["key_word3"] = values_3
data["key_word4"] = values_4
data["key_word5"] = values_5

In [48]:
all_keywords = []

for i in range(len(data)):
    all_keywords.append(data["key_word1"][i])
    all_keywords.append(data["key_word2"][i])
    all_keywords.append(data["key_word2"][i])
    all_keywords.append(data["key_word4"][i])
    all_keywords.append(data["key_word5"][i])
    

In [49]:
data.to_csv("../data/news_keywords.csv",index=False,sep=";")

In [50]:
keywords_data = pd.DataFrame()
keywords_data["Keywords"] = all_keywords

In [51]:
data.to_csv("../data/news_justallkeywords.csv",index=False,sep=";")

# Ranking using Matching Score

In [290]:
def matching_score(k, query):
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("Matching Score")
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    print("")
    
    l = []
    
    for i in query_weights[:10]:
        l.append(i[0])
    
    print(l)
    

matching_score(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")

Matching Score

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying

['without', 'the', 'drive', 'of', 'rebeccah', 'insist', 'kate', 'lost', 'her', 'momentum', 'she', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

[345, 184, 362, 514, 175, 495, 961, 333, 8, 57]


# Ranking Cosine Similarity

In [291]:
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim


In [292]:
D = np.zeros((N, total_vocab_size))
for i in tf_idf:
    try:
        ind = total_vocab.index(i[1])
        D[i[0]][ind] = tf_idf[i]
    except:
        pass

In [293]:
def gen_vector(tokens):

    Q = np.zeros((len(total_vocab)))
    
    counter = Counter(tokens)
    words_count = len(tokens)

    query_weights = {}
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = math.log((N+1)/(df+1))

        try:
            ind = total_vocab.index(token)
            Q[ind] = tf*idf
        except:
            pass
    return Q

In [294]:
def cosine_similarity(k, query):
    print("Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    d_cosines = []
    
    query_vector = gen_vector(tokens)
    
    for d in D:
        d_cosines.append(cosine_sim(query_vector, d))
        
    out = np.array(d_cosines).argsort()[-k:][::-1]
    
    print("")
    
    print(out)

#     for i in out:
#         print(i, dataset[i][0])

Q = cosine_similarity(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")

Cosine Similarity

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying

['without', 'the', 'drive', 'of', 'rebeccah', 'insist', 'kate', 'lost', 'her', 'momentum', 'she', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

[ 184  362  345  619  961  514  175  895 1003  542]


In [52]:
data.head()

Unnamed: 0,Datum,Titel,Text,Link,Quelle,Autor,count_words,count_data,count_capital_words,cleaned_text,...,tokens_title,tokens_uncleaned,tagged_tokens,titel_uncleaned,tagged_titels,key_word1,key_word2,key_word3,key_word4,key_word5
0,23.10.2020,Steirische Unis bauen trotz Coronakrise aus,und das sind die wichtigsten projekte die akt...,https://www.krone.at/2259121,Krone,Jörg Schwaiger,356,0,4,wichtigsten projekt aktuel graz leoben laufen ...,...,"[Steirische, Unis, bauen, trotz, Coronakrise, ...","[und, das, sind, die, wichtigsten, projekte, d...","[wichtigsten, projekte, flaneure, projekte, fa...","[Steirische, Unis, bauen, trotz, Coronakrise, ...","[Steirische, Unis, Coronakrise]",anatomie,historische,medizinische,graz,gelbgoldenen
1,14.12.2020,Immer mehr Firmen verlassen das Silicon Valley,dort habe das unternehmen bereits seinen größ...,https://www.krone.at/2297846,Krone,,199,0,2,unternehmen bereit größten beschäftigungsschwe...,...,"[Immer, mehr, Firmen, verlassen, das, Silicon,...","[dort, habe, das, unternehmen, bereits, seinen...","[unternehmen, größten, usa, attraktiver, künft...","[Immer, mehr, Firmen, verlassen, das, Silicon,...","[Firmen, Silicon, Valley]",abwanderungswelle,attraktiver,verstärkter,bürotürme,markanten
2,07.12.2020,Top-Waffenproduzenten nahmen fast 300 Mrd. € ein,bei zwölf der twenty five größten rüstungskon...,https://www.krone.at/2292763,Krone,,257,0,8,zwölf größten rüstungskonzern handelt unterneh...,...,"[Top-Waffenproduzenten, nahmen, fast, 300, Mrd...","[bei, zwölf, der, twenty, five, größten, rüstu...","[twenty, five, größten, rüstungskonzerne, unte...","[Top-Waffenproduzenten, nahmen, fast, 300, Mrd...","[Top-Waffenproduzenten, Mrd]",ranking,twenty,top,exporten,waffenproduzenten
3,01.01.2021,Corona: Israel hat schon 1 Million Bürger geimpft,eine grafik auf der website „ our world in da...,https://www.krone.at/2309676,Krone,,336,0,1,grafik websit „our world data“ vergleicht vers...,...,"[Corona, :, Israel, hat, schon, 1, Million, Bü...","[eine, grafik, auf, der, website, „, our, worl...","[grafik, website, verschiedene, staaten, zahl,...","[Corona, :, Israel, hat, schon, 1, Million, Bü...","[Corona, Israel, Million, Bürger]",israel,anfängliche,corona,öffentliches,gesundheitsministeriums
4,15.12.2020,Firmen informieren oft nicht über Mitarbeiterd...,während nur rund die hälfte der unternehmen i...,https://www.krone.at/2298543,Krone,,142,0,3,rund hälfte unternehmen mitarbeit speicherung ...,...,"[Firmen, informieren, oft, nicht, über, Mitarb...","[während, nur, rund, die, hälfte, der, unterne...","[hälfte, unternehmen, mitarbeiter, speicherung...","[Firmen, informieren, oft, nicht, über, Mitarb...","[Firmen, Mitarbeiterdaten]",ak,betriebsvereinbarungen,einkommensungleichheiten,unternehmensdaten,mitarbeiterbezogener


In [53]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1093 entries, 0 to 1092
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Datum                1029 non-null   object
 1   Titel                1029 non-null   object
 2   Text                 1093 non-null   object
 3   Link                 1029 non-null   object
 4   Quelle               1090 non-null   object
 5   Autor                337 non-null    object
 6   count_words          1093 non-null   int64 
 7   count_data           1093 non-null   int64 
 8   count_capital_words  1093 non-null   int64 
 9   cleaned_text         1026 non-null   object
 10  origin               1093 non-null   object
 11  Autor                550 non-null    object
 12  tokens               1093 non-null   object
 13  tokens_title         1093 non-null   object
 14  tokens_uncleaned     1093 non-null   object
 15  tagged_tokens        1093 non-null   object
 16  titel_