# Kapitel 5: Feature Engineering und syntaktische Ähnlichkeit

## Setup

Es werden die Verzeichnisse festgelegt. Wenn Sie mit Google Colab arbeiten: Die erforderlichen Dateien werden kopiert und die erforderlichen Bibliotheken installiert.

In [1]:
import sys, os
ON_COLAB = 'google.colab' in sys.modules

if ON_COLAB:
    GIT_ROOT = 'https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master'
    os.system(f'wget {GIT_ROOT}/ch05/setup.py')

%run -i setup.py

You are working on a local system.
Files will be searched relative to "..".


## Python-Einstellungen laden

Allgemeine Importe, Standardwerte für die Formatierung in Matplotlib, Pandas usw.

In [2]:
%run "$BASE_DIR/settings.py"

%reload_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'png'

# 1. Blueprint: Bau eines eigenen Vectorizers

In [3]:
sentences = ["It was the best of times", 
             "it was the worst of times", 
             "it was the age of wisdom", 
             "it was the age of foolishness"]

tokenized_sentences = [[t for t in sentence.split()] for sentence in sentences]

vocabulary = set([w for s in tokenized_sentences for w in s])

import pandas as pd
pd.DataFrame([[w, i] for i,w in enumerate(vocabulary)])

Unnamed: 0,0,1
0,the,0
1,times,1
2,of,2
3,was,3
4,wisdom,4
5,It,5
6,worst,6
7,foolishness,7
8,it,8
9,age,9


## Dokumente vektorisieren
## One-hot-Vektor von Hand

In [4]:
def onehot_encode(tokenized_sentence):
    return [1 if w in tokenized_sentence else 0 for w in vocabulary]

onehot = [onehot_encode(tokenized_sentence) for tokenized_sentence in tokenized_sentences]

for (sentence, oh) in zip(sentences, onehot):
    print("%s: %s" % (oh, sentence))

[1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1]: It was the best of times
[1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0]: it was the worst of times
[1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0]: it was the age of wisdom
[1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0]: it was the age of foolishness


In [5]:
pd.DataFrame(onehot, columns=vocabulary)

Unnamed: 0,the,times,of,was,wisdom,It,worst,foolishness,it,age,best
0,1,1,1,1,0,1,0,0,0,0,1
1,1,1,1,1,0,0,1,0,1,0,0
2,1,0,1,1,1,0,0,0,1,1,0
3,1,0,1,1,0,0,0,1,1,1,0


In [6]:
sim = [onehot[0][i] & onehot[1][i] for i in range(0, len(vocabulary))]
sum(sim)

4

In [7]:
import numpy as np
np.dot(onehot[0], onehot[1])

4

In [8]:
np.dot(onehot, onehot[1])

array([4, 6, 4, 4])

## Dokumente, die dem Vokabular nicht entsprechen

In [9]:
onehot_encode("the age of wisdom is the best of times".split())

[1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1]

In [10]:
onehot_encode("John likes to watch movies. Mary likes movies too.".split())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

## Die Dokumentenbegriff-Matrix

In [11]:
onehot

[[1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1],
 [1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0],
 [1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0],
 [1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0]]

## Berechnung der Ähnlichkeiten

In [12]:
import numpy as np
np.dot(onehot, np.transpose(onehot))

array([[6, 4, 3, 3],
       [4, 6, 4, 4],
       [3, 4, 6, 5],
       [3, 4, 5, 6]])

## One-Hot-Vektorisierung in scikit learn 

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
lb = MultiLabelBinarizer()
lb.fit([vocabulary])
lb.transform(tokenized_sentences)

array([[1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1],
       [0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0]])

# Bag-of-Word Modelle

# 2. Blueprint: CountVectorizer von scikit-learn verwenden

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [15]:
more_sentences = sentences + ["John likes to watch movies. Mary likes movies too.",
                              "Mary also likes to watch football games."]
pd.DataFrame(more_sentences)

Unnamed: 0,0
0,It was the best of times
1,it was the worst of times
2,it was the age of wisdom
3,it was the age of foolishness
4,John likes to watch movies. Mary likes movies too.
5,Mary also likes to watch football games.


## Die Anpassung des Vokabulars

In [16]:
cv.fit(more_sentences)

CountVectorizer()

In [17]:
print(cv.get_feature_names())

['age', 'also', 'best', 'foolishness', 'football', 'games', 'it', 'john', 'likes', 'mary', 'movies', 'of', 'the', 'times', 'to', 'too', 'was', 'watch', 'wisdom', 'worst']


## Umwandlung der Dokumente in Vektoren

In [18]:
dt = cv.transform(more_sentences)

In [19]:
dt

<6x20 sparse matrix of type '<class 'numpy.int64'>'
	with 38 stored elements in Compressed Sparse Row format>

In [20]:
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0,0,1,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0
3,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,2,1,2,0,0,0,1,1,0,1,0,0
5,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0


In [21]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(dt[0], dt[1])

array([[0.83333333]])

In [22]:
len(more_sentences)

6

In [23]:
pd.DataFrame(cosine_similarity(dt, dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.83,0.67,0.67,0.0,0.0
1,0.83,1.0,0.67,0.67,0.0,0.0
2,0.67,0.67,1.0,0.83,0.0,0.0
3,0.67,0.67,0.83,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.52
5,0.0,0.0,0.0,0.0,0.52,1.0


# TF-IDF Modelle

## Optimierung von Dokumentverktoren mit TfidfTransformer

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)

In [25]:
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0.0,0.0,0.57,0.0,0.0,0.0,0.34,0.0,0.0,0.0,0.0,0.34,0.34,0.47,0.0,0.0,0.34,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.34,0.0,0.0,0.0,0.0,0.34,0.34,0.47,0.0,0.0,0.34,0.0,0.0,0.57
2,0.47,0.0,0.0,0.0,0.0,0.0,0.34,0.0,0.0,0.0,0.0,0.34,0.34,0.0,0.0,0.0,0.34,0.0,0.57,0.0
3,0.47,0.0,0.0,0.57,0.0,0.0,0.34,0.0,0.0,0.0,0.0,0.34,0.34,0.0,0.0,0.0,0.34,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.31,0.5,0.25,0.61,0.0,0.0,0.0,0.25,0.31,0.0,0.25,0.0,0.0
5,0.0,0.42,0.0,0.0,0.42,0.42,0.0,0.0,0.34,0.34,0.0,0.0,0.0,0.0,0.34,0.0,0.0,0.34,0.0,0.0


In [26]:
pd.DataFrame(cosine_similarity(tfidf_dt, tfidf_dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.68,0.46,0.46,0.0,0.0
1,0.68,1.0,0.46,0.46,0.0,0.0
2,0.46,0.46,1.0,0.68,0.0,0.0
3,0.46,0.46,0.68,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.43
5,0.0,0.0,0.0,0.0,0.43,1.0


## Einführung in den ABC-Datensatz

In [27]:
headlines = pd.read_csv(ABCNEWS_FILE, parse_dates=["publish_date"])
headlines.head()

Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting licence
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
dt = tfidf.fit_transform(headlines["headline_text"])

In [29]:
dt

<1103663x95878 sparse matrix of type '<class 'numpy.float64'>'
	with 7001357 stored elements in Compressed Sparse Row format>

In [30]:
dt.data.nbytes

56010856

In [31]:
%%time
cosine_similarity(dt[0:10000], dt[0:10000])

CPU times: total: 469 ms
Wall time: 483 ms


array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.16913596,
        0.16792138],
       [0.        , 0.        , 0.        , ..., 0.16913596, 1.        ,
        0.33258708],
       [0.        , 0.        , 0.        , ..., 0.16792138, 0.33258708,
        1.        ]])

# 3. Blueprint: Verringerung der Merkmalsdimensionen

## Entfernen von Stoppwörtern

In [32]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
print(len(stopwords))
tfidf = TfidfVectorizer(stop_words=stopwords)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

326


<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

## Minimierung der Häufigkeit

In [33]:
tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x58527 sparse matrix of type '<class 'numpy.float64'>'
	with 5607113 stored elements in Compressed Sparse Row format>

In [34]:
tfidf = TfidfVectorizer(stop_words=stopwords, min_df=.0001)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x6772 sparse matrix of type '<class 'numpy.float64'>'
	with 4816381 stored elements in Compressed Sparse Row format>

## Maximierung der Häufigkeit

In [35]:
tfidf = TfidfVectorizer(stop_words=stopwords, max_df=0.1)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

In [36]:
tfidf = TfidfVectorizer(max_df=0.1)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x95875 sparse matrix of type '<class 'numpy.float64'>'
	with 6532752 stored elements in Compressed Sparse Row format>

# 4. Blueprint: Hinzufügen von Kontext über N-Gramme

## N-Gramme

In [37]:
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
print(dt.shape)
print(dt.data.nbytes)
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,3), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
print(dt.shape)
print(dt.data.nbytes)

(1103663, 559961)
67325400
(1103663, 747988)
72360104


## Lemmata

In [38]:
# ACHTUNG! Diese Analyse dauert sehr lange!!!

from tqdm.auto import tqdm
import spacy
nlp = spacy.load("en_core_web_sm")
nouns_adjectives_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]
for i, row in tqdm(headlines.iterrows(), total=len(headlines)):
    doc = nlp(str(row["headline_text"]))
    headlines.at[i, "lemmas"] = " ".join([token.lemma_ for token in doc])
    headlines.at[i, "nav"] = " ".join([token.lemma_ for token in doc if token.pos_ in nouns_adjectives_verbs])

  0%|          | 0/1103663 [00:00<?, ?it/s]

In [39]:
headlines.head()

Unnamed: 0,publish_date,headline_text,lemmas,nav
0,2003-02-19,aba decides against community broadcasting licence,aba decide against community broadcasting licence,aba decide community broadcasting licence
1,2003-02-19,act fire witnesses must be aware of defamation,act fire witness must be aware of defamation,act fire witness must aware defamation
2,2003-02-19,a g calls for infrastructure protection summit,a g call for infrastructure protection summit,g call infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise,air nz staff in aust strike for pay rise,air staff aust strike pay rise
4,2003-02-19,air nz strike to affect australian travellers,air nz strike to affect australian traveller,air strike affect australian traveller


In [40]:
tfidf = TfidfVectorizer(stop_words=stopwords)
dt = tfidf.fit_transform(headlines["lemmas"].map(str))
dt

<1103663x87006 sparse matrix of type '<class 'numpy.float64'>'
	with 5581207 stored elements in Compressed Sparse Row format>

In [41]:
tfidf = TfidfVectorizer(stop_words=stopwords)
dt = tfidf.fit_transform(headlines["nav"].map(str))
dt

<1103663x84041 sparse matrix of type '<class 'numpy.float64'>'
	with 5457179 stored elements in Compressed Sparse Row format>

## Häufigste Wörter entfernen

In [42]:
top_10000 = pd.read_csv("https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt", header=None)
tfidf = TfidfVectorizer(stop_words=set(top_10000.iloc[:,0].values))
dt = tfidf.fit_transform(headlines["nav"].map(str))
dt

<1103663x75922 sparse matrix of type '<class 'numpy.float64'>'
	with 1383269 stored elements in Compressed Sparse Row format>

## Kombination von n-Grammen mit linguistischen Merkmalen und den häufigsten Wörtern

In [43]:
tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words=set(top_10000.iloc[:,0].values), min_df=2)
dt = tfidf.fit_transform(headlines["nav"].map(str))
dt

<1103663x97690 sparse matrix of type '<class 'numpy.float64'>'
	with 1556398 stored elements in Compressed Sparse Row format>

# 5. Blueprint: Suche nach dem Dokument, das dem erfundenen Dokument am ähnlichsten ist

In [44]:
tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)
dt = tfidf.fit_transform(headlines["lemmas"].map(str))
dt

<1103663x51498 sparse matrix of type '<class 'numpy.float64'>'
	with 5545699 stored elements in Compressed Sparse Row format>

In [45]:
made_up = tfidf.transform(["australia and new zealand discuss optimal apple size"])

In [46]:
sim = cosine_similarity(made_up, dt)

In [47]:
sim[0]

array([0.        , 0.        , 0.        , ..., 0.08636621, 0.        ,
       0.        ])

In [48]:
headlines.iloc[np.argsort(sim[0])[::-1][0:5]][["publish_date", "lemmas"]]

Unnamed: 0,publish_date,lemmas
633392,2011-08-17,new zealand apple import
633391,2011-08-17,new zealand apple import
896666,2014-08-12,why size matter for apple
825012,2013-10-08,diamond coach discuss australia 's win over new zealand
633559,2011-08-18,federal push to ban new zealand apple


## Syntaktische Ähnlichkeit im ABC-Datensatz

In [49]:
# es gibt "test" Schlagzeilen im Korpus
stopwords.add("test")
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2, norm='l2')
dt = tfidf.fit_transform(headlines["headline_text"])

# 6. Blueprint: Finden der ähnlichsten Schlagzeilen zu einer erfundenen Schlagzeile

In [50]:
%%time
cosine_similarity(dt[0:10000], dt[0:10000], dense_output=False)

CPU times: total: 31.2 ms
Wall time: 25 ms


<10000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 1817473 stored elements in Compressed Sparse Row format>

In [51]:
%%time
r = cosine_similarity(dt[0:10000], dt[0:10000])
r[r > 0.9999] = 0
print(np.argmax(r))

33053694
CPU times: total: 516 ms
Wall time: 519 ms


In [52]:
%%time
r = cosine_similarity(dt[0:10000], dt[0:10000], dense_output=False)
r[r > 0.9999] = 0
print(np.argmax(r))

36943305
CPU times: total: 500 ms
Wall time: 509 ms


# 7. Blueprint: Auffinden der zwei ähnlichsten Dokumente in einem großen Korpus

In [53]:
%%time
r = np.dot(dt[0:10000], np.transpose(dt[0:10000]))
r[r > 0.9999] = 0
print(np.argmax(r))

36943305
CPU times: total: 406 ms
Wall time: 410 ms


In [54]:
%%time

### Achtung!!! Dauert sehr lange!!!
batch = 10000 ###
max_sim = 0.0
max_a = None
max_b = None
for a in range(0, dt.shape[0], batch):
    for b in range(0, a+batch, batch):
        print(a, b)
        #r = np.dot(dt[a:a+batch], np.transpose(dt[b:b+batch]))
        r = cosine_similarity(dt[a:a+batch], dt[b:b+batch], dense_output=False)
        # identische Vektoren eliminieren
        # indem man ihre Ähnlichkeit auf np.nan setzt, was aussortiert wird
        r[r > 0.9999] = 0
        sim = r.max()
        if sim > max_sim:
            # argmax gibt einen einzigen Wert zurück, den wir 
            # den beiden Dimensionen zuordnen müssen            
            (max_a, max_b) = np.unravel_index(np.argmax(r), r.shape)
            # Offsets im Korpus anpassen (dies ist eine Untermatrix)
            max_a += a
            max_b += b
            max_sim = sim

0 0
10000 0
10000 10000
20000 0
20000 10000
20000 20000
30000 0
30000 10000
30000 20000
30000 30000
40000 0
40000 10000
40000 20000
40000 30000
40000 40000
50000 0
50000 10000
50000 20000
50000 30000
50000 40000
50000 50000
60000 0
60000 10000
60000 20000
60000 30000
60000 40000
60000 50000
60000 60000
70000 0
70000 10000
70000 20000
70000 30000
70000 40000
70000 50000
70000 60000
70000 70000
80000 0
80000 10000
80000 20000
80000 30000
80000 40000
80000 50000
80000 60000
80000 70000
80000 80000
90000 0
90000 10000
90000 20000
90000 30000
90000 40000
90000 50000
90000 60000
90000 70000
90000 80000
90000 90000
100000 0
100000 10000
100000 20000
100000 30000
100000 40000
100000 50000
100000 60000
100000 70000
100000 80000
100000 90000
100000 100000
110000 0
110000 10000
110000 20000
110000 30000
110000 40000
110000 50000
110000 60000
110000 70000
110000 80000
110000 90000
110000 100000
110000 110000
120000 0
120000 10000
120000 20000
120000 30000
120000 40000
120000 50000
120000 60000
120

340000 270000
340000 280000
340000 290000
340000 300000
340000 310000
340000 320000
340000 330000
340000 340000
350000 0
350000 10000
350000 20000
350000 30000
350000 40000
350000 50000
350000 60000
350000 70000
350000 80000
350000 90000
350000 100000
350000 110000
350000 120000
350000 130000
350000 140000
350000 150000
350000 160000
350000 170000
350000 180000
350000 190000
350000 200000
350000 210000
350000 220000
350000 230000
350000 240000
350000 250000
350000 260000
350000 270000
350000 280000
350000 290000
350000 300000
350000 310000
350000 320000
350000 330000
350000 340000
350000 350000
360000 0
360000 10000
360000 20000
360000 30000
360000 40000
360000 50000
360000 60000
360000 70000
360000 80000
360000 90000
360000 100000
360000 110000
360000 120000
360000 130000
360000 140000
360000 150000
360000 160000
360000 170000
360000 180000
360000 190000
360000 200000
360000 210000
360000 220000
360000 230000
360000 240000
360000 250000
360000 260000
360000 270000
360000 280000
360000

480000 480000
490000 0
490000 10000
490000 20000
490000 30000
490000 40000
490000 50000
490000 60000
490000 70000
490000 80000
490000 90000
490000 100000
490000 110000
490000 120000
490000 130000
490000 140000
490000 150000
490000 160000
490000 170000
490000 180000
490000 190000
490000 200000
490000 210000
490000 220000
490000 230000
490000 240000
490000 250000
490000 260000
490000 270000
490000 280000
490000 290000
490000 300000
490000 310000
490000 320000
490000 330000
490000 340000
490000 350000
490000 360000
490000 370000
490000 380000
490000 390000
490000 400000
490000 410000
490000 420000
490000 430000
490000 440000
490000 450000
490000 460000
490000 470000
490000 480000
490000 490000
500000 0
500000 10000
500000 20000
500000 30000
500000 40000
500000 50000
500000 60000
500000 70000
500000 80000
500000 90000
500000 100000
500000 110000
500000 120000
500000 130000
500000 140000
500000 150000
500000 160000
500000 170000
500000 180000
500000 190000
500000 200000
500000 210000
500000

590000 540000
590000 550000
590000 560000
590000 570000
590000 580000
590000 590000
600000 0
600000 10000
600000 20000
600000 30000
600000 40000
600000 50000
600000 60000
600000 70000
600000 80000
600000 90000
600000 100000
600000 110000
600000 120000
600000 130000
600000 140000
600000 150000
600000 160000
600000 170000
600000 180000
600000 190000
600000 200000
600000 210000
600000 220000
600000 230000
600000 240000
600000 250000
600000 260000
600000 270000
600000 280000
600000 290000
600000 300000
600000 310000
600000 320000
600000 330000
600000 340000
600000 350000
600000 360000
600000 370000
600000 380000
600000 390000
600000 400000
600000 410000
600000 420000
600000 430000
600000 440000
600000 450000
600000 460000
600000 470000
600000 480000
600000 490000
600000 500000
600000 510000
600000 520000
600000 530000
600000 540000
600000 550000
600000 560000
600000 570000
600000 580000
600000 590000
600000 600000
610000 0
610000 10000
610000 20000
610000 30000
610000 40000
610000 50000
61

690000 70000
690000 80000
690000 90000
690000 100000
690000 110000
690000 120000
690000 130000
690000 140000
690000 150000
690000 160000
690000 170000
690000 180000
690000 190000
690000 200000
690000 210000
690000 220000
690000 230000
690000 240000
690000 250000
690000 260000
690000 270000
690000 280000
690000 290000
690000 300000
690000 310000
690000 320000
690000 330000
690000 340000
690000 350000
690000 360000
690000 370000
690000 380000
690000 390000
690000 400000
690000 410000
690000 420000
690000 430000
690000 440000
690000 450000
690000 460000
690000 470000
690000 480000
690000 490000
690000 500000
690000 510000
690000 520000
690000 530000
690000 540000
690000 550000
690000 560000
690000 570000
690000 580000
690000 590000
690000 600000
690000 610000
690000 620000
690000 630000
690000 640000
690000 650000
690000 660000
690000 670000
690000 680000
690000 690000
700000 0
700000 10000
700000 20000
700000 30000
700000 40000
700000 50000
700000 60000
700000 70000
700000 80000
700000 9

770000 130000
770000 140000
770000 150000
770000 160000
770000 170000
770000 180000
770000 190000
770000 200000
770000 210000
770000 220000
770000 230000
770000 240000
770000 250000
770000 260000
770000 270000
770000 280000
770000 290000
770000 300000
770000 310000
770000 320000
770000 330000
770000 340000
770000 350000
770000 360000
770000 370000
770000 380000
770000 390000
770000 400000
770000 410000
770000 420000
770000 430000
770000 440000
770000 450000
770000 460000
770000 470000
770000 480000
770000 490000
770000 500000
770000 510000
770000 520000
770000 530000
770000 540000
770000 550000
770000 560000
770000 570000
770000 580000
770000 590000
770000 600000
770000 610000
770000 620000
770000 630000
770000 640000
770000 650000
770000 660000
770000 670000
770000 680000
770000 690000
770000 700000
770000 710000
770000 720000
770000 730000
770000 740000
770000 750000
770000 760000
770000 770000
780000 0
780000 10000
780000 20000
780000 30000
780000 40000
780000 50000
780000 60000
780

840000 400000
840000 410000
840000 420000
840000 430000
840000 440000
840000 450000
840000 460000
840000 470000
840000 480000
840000 490000
840000 500000
840000 510000
840000 520000
840000 530000
840000 540000
840000 550000
840000 560000
840000 570000
840000 580000
840000 590000
840000 600000
840000 610000
840000 620000
840000 630000
840000 640000
840000 650000
840000 660000
840000 670000
840000 680000
840000 690000
840000 700000
840000 710000
840000 720000
840000 730000
840000 740000
840000 750000
840000 760000
840000 770000
840000 780000
840000 790000
840000 800000
840000 810000
840000 820000
840000 830000
840000 840000
850000 0
850000 10000
850000 20000
850000 30000
850000 40000
850000 50000
850000 60000
850000 70000
850000 80000
850000 90000
850000 100000
850000 110000
850000 120000
850000 130000
850000 140000
850000 150000
850000 160000
850000 170000
850000 180000
850000 190000
850000 200000
850000 210000
850000 220000
850000 230000
850000 240000
850000 250000
850000 260000
850000

910000 190000
910000 200000
910000 210000
910000 220000
910000 230000
910000 240000
910000 250000
910000 260000
910000 270000
910000 280000
910000 290000
910000 300000
910000 310000
910000 320000
910000 330000
910000 340000
910000 350000
910000 360000
910000 370000
910000 380000
910000 390000
910000 400000
910000 410000
910000 420000
910000 430000
910000 440000
910000 450000
910000 460000
910000 470000
910000 480000
910000 490000
910000 500000
910000 510000
910000 520000
910000 530000
910000 540000
910000 550000
910000 560000
910000 570000
910000 580000
910000 590000
910000 600000
910000 610000
910000 620000
910000 630000
910000 640000
910000 650000
910000 660000
910000 670000
910000 680000
910000 690000
910000 700000
910000 710000
910000 720000
910000 730000
910000 740000
910000 750000
910000 760000
910000 770000
910000 780000
910000 790000
910000 800000
910000 810000
910000 820000
910000 830000
910000 840000
910000 850000
910000 860000
910000 870000
910000 880000
910000 890000
910000

970000 440000
970000 450000
970000 460000
970000 470000
970000 480000
970000 490000
970000 500000
970000 510000
970000 520000
970000 530000
970000 540000
970000 550000
970000 560000
970000 570000
970000 580000
970000 590000
970000 600000
970000 610000
970000 620000
970000 630000
970000 640000
970000 650000
970000 660000
970000 670000
970000 680000
970000 690000
970000 700000
970000 710000
970000 720000
970000 730000
970000 740000
970000 750000
970000 760000
970000 770000
970000 780000
970000 790000
970000 800000
970000 810000
970000 820000
970000 830000
970000 840000
970000 850000
970000 860000
970000 870000
970000 880000
970000 890000
970000 900000
970000 910000
970000 920000
970000 930000
970000 940000
970000 950000
970000 960000
970000 970000
980000 0
980000 10000
980000 20000
980000 30000
980000 40000
980000 50000
980000 60000
980000 70000
980000 80000
980000 90000
980000 100000
980000 110000
980000 120000
980000 130000
980000 140000
980000 150000
980000 160000
980000 170000
980000

1030000 110000
1030000 120000
1030000 130000
1030000 140000
1030000 150000
1030000 160000
1030000 170000
1030000 180000
1030000 190000
1030000 200000
1030000 210000
1030000 220000
1030000 230000
1030000 240000
1030000 250000
1030000 260000
1030000 270000
1030000 280000
1030000 290000
1030000 300000
1030000 310000
1030000 320000
1030000 330000
1030000 340000
1030000 350000
1030000 360000
1030000 370000
1030000 380000
1030000 390000
1030000 400000
1030000 410000
1030000 420000
1030000 430000
1030000 440000
1030000 450000
1030000 460000
1030000 470000
1030000 480000
1030000 490000
1030000 500000
1030000 510000
1030000 520000
1030000 530000
1030000 540000
1030000 550000
1030000 560000
1030000 570000
1030000 580000
1030000 590000
1030000 600000
1030000 610000
1030000 620000
1030000 630000
1030000 640000
1030000 650000
1030000 660000
1030000 670000
1030000 680000
1030000 690000
1030000 700000
1030000 710000
1030000 720000
1030000 730000
1030000 740000
1030000 750000
1030000 760000
1030000 77

1080000 320000
1080000 330000
1080000 340000
1080000 350000
1080000 360000
1080000 370000
1080000 380000
1080000 390000
1080000 400000
1080000 410000
1080000 420000
1080000 430000
1080000 440000
1080000 450000
1080000 460000
1080000 470000
1080000 480000
1080000 490000
1080000 500000
1080000 510000
1080000 520000
1080000 530000
1080000 540000
1080000 550000
1080000 560000
1080000 570000
1080000 580000
1080000 590000
1080000 600000
1080000 610000
1080000 620000
1080000 630000
1080000 640000
1080000 650000
1080000 660000
1080000 670000
1080000 680000
1080000 690000
1080000 700000
1080000 710000
1080000 720000
1080000 730000
1080000 740000
1080000 750000
1080000 760000
1080000 770000
1080000 780000
1080000 790000
1080000 800000
1080000 810000
1080000 820000
1080000 830000
1080000 840000
1080000 850000
1080000 860000
1080000 870000
1080000 880000
1080000 890000
1080000 900000
1080000 910000
1080000 920000
1080000 930000
1080000 940000
1080000 950000
1080000 960000
1080000 970000
1080000 98

In [55]:
print(max_a, max_b)

1074265 1074264


In [56]:
print(max_sim)

0.9896553639787652


In [57]:
pd.set_option('max_colwidth', -1)
headlines.iloc[[max_a, max_b]][["publish_date", "headline_text"]]

Unnamed: 0,publish_date,headline_text
1074265,2017-04-24,emmanuel macron the man who could be frances youngest president
1074264,2017-04-24,emmanuel macron: frances youngest president


# 8. Blueprint: Suche nach den am meisten verwandten Wörtern

In [58]:
tfidf_word = TfidfVectorizer(stop_words=stopwords, min_df=1000)
dt_word = tfidf_word.fit_transform(headlines["headline_text"])

In [59]:
r = cosine_similarity(dt_word.T, dt_word.T)
np.fill_diagonal(r, 0)

In [60]:
voc = tfidf_word.get_feature_names()
size = r.shape[0] # quadratisch
for index in np.argsort(r.flatten())[::-1][0:40]:
    a = int(index/size)
    b = index%size
    if a > b:  # Wiederholungen vermeiden
        print('"%s" related to "%s"' % (voc[a], voc[b]))

"sri" related to "lanka"
"hour" related to "country"
"seekers" related to "asylum"
"springs" related to "alice"
"pleads" related to "guilty"
"hill" related to "broken"
"trump" related to "donald"
"violence" related to "domestic"
"climate" related to "change"
"driving" related to "drink"
"care" related to "aged"
"gold" related to "coast"
"royal" related to "commission"
"mental" related to "health"
"wind" related to "farm"
"flu" related to "bird"
"murray" related to "darling"
"world" related to "cup"
"north" related to "korea"
"hour" related to "2014"
