In [61]:
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk import FreqDist
from nltk.tokenize import word_tokenize


In [9]:
documents = [(list(movie_reviews.sents(fileid)), category) \
for category in movie_reviews.categories() \
for fileid in movie_reviews.fileids(category)]

In [51]:
corpus = []
for (docuemnt, category) in documents:
    sentence = ''
    for tokens in docuemnt:
        sentence += (' '.join(tokens))
        # break
    corpus += [sentence]
    

In [55]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names_out()
print(X.shape)
print(features)

(2000, 39659)
['00' '000' '0009f' ... 'zwigoff' 'zycie' 'zzzzzzz']


In [23]:
X_embedded = TSNE(n_components=2, learning_rate='auto',
init='random', perplexity=3).fit_transform(X)
X_embedded.shape

(2000, 2)

In [25]:
cos_sim = cosine_similarity(np.transpose(X))
print(cos_sim.shape)

(39659, 39659)


In [84]:
print(np.where(features == 'good'))
print(np.where(features == 'great'))
print(np.where(features == 'zzzzzzz'))


(array([15003]),)
(array([15246]),)
(array([39658]),)


In [43]:
cos_sim[15003, 15246] #(good vs great)

np.float64(0.37984209242708195)

In [33]:
cos_sim[15003, 20915] #(good vs lousy)


np.float64(0.06776733437228842)

In [35]:
cos_sim[15426, 20915] #(great vs lousy)

np.float64(0.0)

In [63]:
token_freq = {}
for doc in corpus:
    tokens = word_tokenize(doc)
    for token in tokens:
        token_freq[token] = token_freq.get(token, 0) + 1
    

token_freq = FreqDist(token_freq)
        
    

In [86]:
token_freq.most_common(100)

[(',', 77717),
 ('the', 69223),
 ('a', 37296),
 ('and', 34164),
 ('of', 33882),
 ('to', 31688),
 ("'", 30502),
 ('is', 25105),
 ('in', 20352),
 ('s', 18385),
 ('``', 17536),
 ('-', 15559),
 ('that', 15459),
 ('it', 13273),
 (')', 11781),
 ('(', 11664),
 ('as', 10520),
 ('with', 10450),
 ('for', 9518),
 ('film', 9512),
 ('his', 9052),
 ('this', 8095),
 ('.the', 7300),
 ('he', 7291),
 ('on', 7134),
 ('but', 6942),
 ('are', 6931),
 ('i', 6899),
 ('t', 6370),
 ('be', 6146),
 ('by', 6100),
 ('movie', 5768),
 ('who', 5651),
 ('an', 5637),
 ('not', 5457),
 ('one', 5395),
 ('.', 5022),
 ('was', 4923),
 ('you', 4921),
 ('have', 4885),
 ('from', 4877),
 ('has', 4714),
 ('at', 4628),
 ('her', 4347),
 ('they', 4127),
 ('all', 4120),
 ('?', 3771),
 ('out', 3622),
 ('like', 3547),
 ('about', 3513),
 ('up', 3397),
 ('so', 3295),
 ('more', 3295),
 ('which', 3110),
 ('or', 3071),
 (':', 3042),
 ('can', 3003),
 ('their', 2986),
 ('what', 2899),
 ('.it', 2828),
 ('just', 2798),
 ('some', 2790),
 ('him', 

In [92]:
first = np.where(features == "movie")[0][0]
second = np.where(features == "story")[0][0]

print(first)
print(second)

23113
33714


In [94]:
cos_sim[first, second] #(movie vs story)

np.float64(0.3822586697336221)

In [104]:
first = np.where(features == "they")[0][0]
second = np.where(features == "are")[0][0]

print(first)
print(second)

35351
2217


In [106]:
cos_sim[first, second] #(see vs watch)

np.float64(0.6862500932464547)

In [108]:
np.max(cos_sim)

np.float64(1.000000000000004)

In [110]:
np.max(cos_sim[cos_sim<1])

np.float64(0.9999999999999999)

In [122]:
desired_similarity = np.max(cos_sim[cos_sim < 0.99])
desired_similarity

np.float64(0.9899612569064791)

In [114]:
locations = np.where(cos_sim == desired_similarity)

In [116]:
locations

(array([  231,  3135,  4792,  5250,  7857,  9447, 14326, 19665, 19665,
        19665, 19665, 19665, 19665, 19665, 19665, 19665, 19665, 20684,
        21868, 32417]),
 array([19665, 19665, 19665, 19665, 19665, 19665, 19665,   231,  3135,
         4792,  5250,  7857,  9447, 14326, 20684, 21868, 32417, 19665,
        19665, 19665]))

In [128]:
cos_sim[3135, 19665]

np.float64(0.9899612569064791)

In [130]:
features[3135], features[19665]

('baptiste', 'korben')

In [138]:
token_freq['korben'], token_freq['baptiste'], token_freq['2023']

(7, 1, 1)

In [140]:
print(len(token_freq))

44558
