In [1]:
# Syntactic Similarity
# sila Nov. 19 2022
# 
# Based on >>Blueprints for Text Analysis Using Python<<
# Jens Albrecht, Sidharth Ramachandran, Christian Winkler
# Chapter 5

In [3]:
# Simple count in a vector

In [4]:
import pandas as pd

# Four toy sentences; note the capitalised "It" in the first one
sentences = [
    "It was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness",
]

# Whitespace tokenization: one list of tokens per sentence
tokenized_sentences = [text.split() for text in sentences]

# Every distinct token; case-sensitive, so "It" and "it" are separate entries
vocabulary = {token for token_list in tokenized_sentences for token in token_list}

# Show each word with the position it gets when iterating over the vocabulary
[[word, position] for position, word in enumerate(vocabulary)]

[['the', 0],
 ['of', 1],
 ['It', 2],
 ['worst', 3],
 ['it', 4],
 ['foolishness', 5],
 ['times', 6],
 ['age', 7],
 ['wisdom', 8],
 ['was', 9],
 ['best', 10]]

In [5]:
# One hot by hand

In [14]:
def onehot_encode(tokenized_sentence, vocab=None):
    """Return a binary presence vector for one tokenized sentence.

    Parameters
    ----------
    tokenized_sentence : iterable of str
        Tokens of a single sentence, e.g. the result of ``sentence.split()``.
    vocab : iterable of str, optional
        Words that define the vector positions, taken in iteration order.
        When omitted, the notebook-level ``vocabulary`` is used.

    Returns
    -------
    list of int
        One entry per vocabulary word: 1 if the word occurs in the sentence,
        0 otherwise. Tokens that are not in the vocabulary are ignored, and a
        repeated token still yields 1 (presence, not a count).
    """
    if vocab is None:
        vocab = vocabulary
    # Materialise the tokens once: membership tests become constant-time, and a
    # one-shot iterator is no longer used up by the first lookup.
    present = set(tokenized_sentence)
    return [1 if w in present else 0 for w in vocab]

# Encode every sentence of the toy corpus
onehot = list(map(onehot_encode, tokenized_sentences))

# Show the tokens of the first sentence, the vocabulary, and the resulting vector
tokens = "It was the best of times".split()
print(tokens, vocabulary, onehot_encode(tokens), sep="\n")

['It', 'was', 'the', 'best', 'of', 'times']
{'the', 'of', 'It', 'worst', 'it', 'foolishness', 'times', 'age', 'wisdom', 'was', 'best'}
[1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1]


In [15]:
# Print each one-hot vector followed by the sentence it encodes
for sentence, oh in zip(sentences, onehot):
    print(f"{oh}: {sentence}")

[1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1]: It was the best of times
[1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0]: it was the worst of times
[1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0]: it was the age of wisdom
[1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0]: it was the age of foolishness


In [16]:
# Compare two vectors: for binary one-hot vectors the dot product is the number
# of vocabulary words the two sentences have in common
import numpy as np
np.dot(onehot[0], onehot[1])

4

In [17]:
# Dot product of every one-hot vector with vector 1 in a single call
np.dot(onehot, onehot[1])

array([4, 6, 4, 4])

In [19]:
# As expected vector 1 is most similar to itself

In [21]:
# Vector for a sentence containing an out-of-vocabulary word ("is"):
# the vocabulary has no position for it, so it leaves no trace in the result
onehot_encode("the age of wisdom is the best of times".split())

[1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1]

In [22]:
# Same bag-of-words idea, now with scikit-learn's CountVectorizer (default settings)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [23]:
# The four toy sentences plus two more about a different topic
more_sentences = [
    *sentences,
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games.",
]
pd.DataFrame(more_sentences)

Unnamed: 0,0
0,It was the best of times
1,it was the worst of times
2,it was the age of wisdom
3,it was the age of foolishness
4,John likes to watch movies. Mary likes movies ...
5,Mary also likes to watch football games.


In [24]:
# Fit the vectorizer on the extended sentence list
cv.fit(more_sentences)

CountVectorizer()

In [25]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and is what the TfidfVectorizer cell
# further down already uses. tolist() keeps the printed form a plain Python list.
print(cv.get_feature_names_out().tolist())

['age', 'also', 'best', 'foolishness', 'football', 'games', 'it', 'john', 'likes', 'mary', 'movies', 'of', 'the', 'times', 'to', 'too', 'was', 'watch', 'wisdom', 'worst']




In [29]:
# Document-term matrix for the same sentences, using the fitted vectorizer
dt = cv.transform(more_sentences)
#print(dt)

In [30]:
# Document-term matrix as a table: one row per sentence, one column per feature name.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names_out())



Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0,0,1,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0
3,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,2,1,2,0,0,0,1,1,0,1,0,0
5,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0


In [31]:
# Cosine similarity between the count vectors of sentences 0 and 1
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(dt[0], dt[1])

array([[0.83333333]])

In [32]:
# Cosine similarity between the count vectors of sentences 2 and 3
cosine_similarity(dt[2], dt[3])

array([[0.83333333]])

In [33]:
# Cosine similarity between the count vectors of sentences 1 and 3
cosine_similarity(dt[1], dt[3])

array([[0.66666667]])

In [37]:
# Using TF/IDF

In [38]:
# Turn the raw counts in dt into TF-IDF weights
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)

In [39]:
# TF-IDF weights as a table: one row per sentence, one column per feature name.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names_out())



Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0.0,0.0,0.56978,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.56978
2,0.467228,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.56978,0.0
3,0.467228,0.0,0.0,0.56978,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305609,0.501208,0.250604,0.611219,0.0,0.0,0.0,0.250604,0.305609,0.0,0.250604,0.0,0.0
5,0.0,0.419233,0.0,0.0,0.419233,0.419233,0.0,0.0,0.343777,0.343777,0.0,0.0,0.0,0.0,0.343777,0.0,0.0,0.343777,0.0,0.0


In [40]:
# Pairwise cosine similarity between all six TF-IDF rows, shown as a table
pd.DataFrame(cosine_similarity(tfidf_dt, tfidf_dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.675351,0.457049,0.457049,0.0,0.0
1,0.675351,1.0,0.457049,0.457049,0.0,0.0
2,0.457049,0.457049,1.0,0.675351,0.0,0.0
3,0.457049,0.457049,0.675351,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.43076
5,0.0,0.0,0.0,0.0,0.43076,1.0


In [41]:
# The cells below read abcnews-date-text.csv from the working directory.
# Only ask for an upload when the file is not there yet: re-running the notebook
# then does not prompt again, and a local (non-Colab) run with the file in place
# does not fail on the Colab-only import. Delete the file to force a new upload.
import os

if os.path.exists('abcnews-date-text.csv'):
    uploaded = {}  # nothing was uploaded in this run
else:
    from google.colab import files
    uploaded = files.upload()

Saving abcnews-date-text.csv to abcnews-date-text.csv


In [42]:
# Continue once abcnews-date-text.csv has been uploaded to Colab
# (or is present in the working directory on a local machine)

In [43]:
# List the working directory to check that the CSV is there
!ls

abcnews-date-text.csv  sample_data


In [44]:
# Load the headlines; publish_date is parsed as a date column
headlines = pd.read_csv('abcnews-date-text.csv', parse_dates=["publish_date"])
headlines.head()

Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [45]:
# Last rows of the data set
headlines.tail()

Unnamed: 0,publish_date,headline_text
1082163,2017-06-30,when is it ok to compliment a womans smile a g...
1082164,2017-06-30,white house defends trumps tweet
1082165,2017-06-30,winter closes in on tasmania as snow ice falls
1082166,2017-06-30,womens world cup australia wins despite atapat...
1082167,2017-06-30,youtube stunt death foreshadowed by tweet


In [46]:
# TF-IDF document-term matrix for all headlines, built in one step.
# NOTE: this rebinds `tfidf` and `dt`, which the toy example above used for the
# TfidfTransformer and the six-sentence count matrix.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
dt = tfidf.fit_transform(headlines["headline_text"])

In [51]:
# Feature names learned from the headlines (numpy abbreviates the long array when printed)
print(tfidf.get_feature_names_out())

['000' '000app' '002' ... 'zyngier' 'zz' 'zzz']


In [52]:
# Matrix dimensions: (number of headlines, number of features)
print(dt.shape)

(1082168, 95999)


In [53]:
# Bytes used by the stored values in dt.data
# NOTE(review): dt is presumably a SciPy sparse matrix, so its index arrays are not counted here
print(dt.data.nbytes)

54644968


In [54]:
%%time
# Pairwise cosine similarity among the first 10,000 headlines only
cosine_similarity(dt[0:10000], dt[0:10000])

CPU times: user 272 ms, sys: 491 ms, total: 764 ms
Wall time: 770 ms


array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.16871665,
        0.16767302],
       [0.        , 0.        , 0.        , ..., 0.16871665, 1.        ,
        0.33175557],
       [0.        , 0.        , 0.        , ..., 0.16767302, 0.33175557,
        1.        ]])

In [55]:
# Finding document most similar to made-up document

In [60]:
# NOTE(review): this repeats the fit_transform already done when the TfidfVectorizer
# was created, on the same column; it can be skipped if that cell has just been run.
dt = tfidf.fit_transform(headlines["headline_text"])

In [61]:
# Vectorize a made-up headline with the already fitted vectorizer (transform only, no re-fit)
made_up = tfidf.transform(["australia and new zealand discuss optimal apple size"])

In [62]:
# Similarity of the made-up headline to every headline; sim has a single row
sim = cosine_similarity(made_up, dt)

In [63]:
# The similarity scores, one per headline (numpy abbreviates the long array when printed)
print(sim[0])

[0.         0.         0.         ... 0.         0.05652862 0.        ]


In [65]:
# The five headlines with the highest similarity, best match first
(
    headlines
    .loc[:, ["publish_date", "headline_text"]]
    .iloc[np.argsort(sim[0])[::-1][:5]]
)

Unnamed: 0,publish_date,headline_text
633411,2011-08-17,new zealand apple imports
633410,2011-08-17,new zealand apple import
896722,2014-08-12,why size matters for apple
633412,2011-08-17,new zealand apple industry hurting
873937,2014-05-09,call for australia and new zealand to give job


In [76]:
# Removing stop words, using only the top 10,000 words (from the Google index),
# working on lemmas only, etc. will probably improve performance.
# See chapter 5 in the Blueprints book for details.

In [78]:
# Another example of finding syntactic similarity with this code:

In [79]:
# Vectorize a second made-up headline with the same fitted vectorizer
new_made_up = tfidf.transform(["Trump tweet from White house"])

In [80]:
# Similarity of the second made-up headline to every headline (overwrites the previous `sim`)
sim = cosine_similarity(new_made_up, dt)

In [81]:
# The ten headlines with the highest similarity, best match first
(
    headlines
    .loc[:, ["publish_date", "headline_text"]]
    .iloc[np.argsort(sim[0])[::-1][:10]]
)

Unnamed: 0,publish_date,headline_text
1082164,2017-06-30,white house defends trumps tweet
1052426,2016-11-10,can donald trump redecorate the white house
712816,2012-08-01,to tweet or not to tweet
1078911,2017-06-02,meanwhile; back at the white house
1052759,2016-11-11,obama and trump meet at the white house
1054873,2016-11-25,who is part of donald trump white house team
703365,2012-06-19,tweet tweet tweeting
1069653,2017-03-18,trump and merkel meet at white house
960635,2015-06-17,donald trump announces run for the white house
960915,2015-06-18,donald trump joins race for the white house
