In [1]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim
from gensim.models import FastText
from tabulate import tabulate

In [2]:
nltk.download('wordnet', "/kaggle/working/nltk_data/")
nltk.download('omw-1.4', "/kaggle/working/nltk_data/")
! unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora
! unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora
nltk.data.path.append("/kaggle/working/nltk_data/")

[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data/...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data/...
Archive:  /kaggle/working/nltk_data/corpora/wordnet.zip
   creating: /kaggle/working/nltk_data/corpora/wordnet/
  inflating: /kaggle/working/nltk_data/corpora/wordnet/lexnames  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/data.verb  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/index.adv  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/adv.exc  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/index.verb  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/data.adj  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/index.adj  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/LICENSE  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/citation.bib  
  inflating: /kaggle/working/nltk_data/c

In [3]:
dict = pd.read_json("/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json",lines=True)  

In [4]:
texts_len = len(list(dict['text']))
print(texts_len )

908915


In [5]:
texts = list(dict['text'])
sentences=texts[:10000]

In [6]:
def preprocess_text(text):
    
    # Cleaning data from symbols or characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Lowercasing
    text = text.lower()
    
    # Tokenization
    words = word_tokenize(text)
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Removing punctuation and non-alphanumeric characters
    words = [re.sub(r'[^\w\s]', '', word) for word in words]
    
    # Stop words removal
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    
    # Unique words
    unique_words = set(filtered_words)
    
    return words

In [7]:
processed_sentences = []
for text in sentences:
    processed_sentences.append(preprocess_text(text))

In [8]:
print(processed_sentences[:10])

[['avenger', 'time', 'with', 'the', 'lady'], ['they', 'have', 'lot', 'of', 'good', 'desert', 'and', 'tasty', 'cuban', 'sandwich'], ['it', 'open', 'even', 'when', 'you', 'think', 'it', 'isnt'], ['very', 'decent', 'fried', 'chicken'], ['appetizer', 'platter', 'special', 'for', 'lunch'], ['chili', 'cup', 'single', 'cheeseburger', 'with', 'onion', 'pickle', 'and', 'relish', 'vanilla', 'cocacolaso', 'far'], ['saturday', 'dec', 'th', 'ride', 'patcos', 'silver', 'sleigh', 'w', 'santa', 'his', 'elf', 'on', 'a', 'decorated', 'train', 'into', 'center', 'city', 'train', 'leave', 'from', 'lindenwold', 'at', 'am', 'am', 'pm', 'and', 'make', 'all', 'stop', 'great', 'for', 'kid'], ['this', 'is', 'probably', 'the', 'best', 'place', 'in', 'the', 'cool', 'spring', 'area', 'to', 'watch', 'a', 'game', 'and', 'eat'], ['taco'], ['starbucks', 'substitute', 'in', 'boring', 'downtown', 'tampa', 'ugh', 'never', 'again']]


In [9]:
# Train FastText model
fastText = FastText(sentences=processed_sentences,vector_size=60, window=5, min_count=1, workers=4, sg=1,epochs=600)

In [10]:
word_embedding = fastText.wv['avenger']
print(word_embedding)

[ 0.985952   -0.48553807  0.690496    0.7139022  -0.41041648  1.2770162
 -0.10673486 -1.0328811  -0.12324575  1.7380464   0.3884869   1.2143704
  0.97207385 -0.5887424  -0.6210069   1.5537021  -0.21318029 -0.02804412
  0.10879805  0.62352145 -1.3533564   0.5759854   1.7204332  -0.7468448
  0.80386734  0.51349473  1.3687959   0.32635432 -0.69173235 -0.60936147
  0.2949248   0.2951804   0.48718238 -1.8125699   0.3388758  -0.4054269
 -1.0899638  -0.5166557   0.85261196  0.48995134  0.17882189 -0.4056914
  0.4342933  -1.3543568   0.17057644 -0.5642149   1.6906837   1.2856896
 -0.54113895 -1.1481752   0.21151543 -0.03168796  0.9412034   0.09158234
  0.5228011   0.5586444   1.1447388   1.0042685   1.758529   -0.987737  ]


In [4]:
random_words = ["Banana", "Elephant", "Chair", "Mountain", "Guitar", "Sunshine", "Telescope", 
         "Pillow", "Whisper", "Ocean", "Butterfly", "Keyboard", "Firefly", "Umbrella", 
         "Chocolate", "Bicycle", "Rainbow", "Diamond", "Universe", "Pillowcase", "Dragon", 
         "Marshmallow", "Adventure", "Backpack", "Symphony", "Candle", "Pineapple", 
         "Jellyfish", "Tornado", "Moonlight"]


In [12]:
for word in random_words:
    # Print top10 similar words 
    print(f"top10 similar words to '{word}':")
    similar_words = fastText.wv.most_similar(word, topn=10)
    for sim_word, sim_score in similar_words:
        print(f"{sim_word}: {sim_score:.4f}")
    print('--------------------------')

top10 similar words to 'Banana':
banana: 0.9217
nannananannnannmananaa: 0.8752
gotham: 0.7056
goood: 0.6911
foster: 0.6803
matzo: 0.6704
jenna: 0.6665
freddo: 0.6599
omgthe: 0.6530
saladno: 0.6501
--------------------------
top10 similar words to 'Elephant':
elephant: 0.9684
assistance: 0.6712
mistaking: 0.6712
unhelpful: 0.6645
stomache: 0.6367
linen: 0.6339
wawas: 0.6302
stomach: 0.6293
assistant: 0.6272
resistance: 0.6178
--------------------------
top10 similar words to 'Chair':
chair: 0.7593
marble: 0.6798
appeared: 0.6692
hair: 0.6677
chairman: 0.6670
chickfila: 0.6663
cartsnacks: 0.6643
bootcamp: 0.6608
woven: 0.6535
grimy: 0.6530
--------------------------
top10 similar words to 'Mountain':
fountain: 0.9218
mountain: 0.8587
plantain: 0.6865
selfservice: 0.6329
marble: 0.6302
admission: 0.6224
content: 0.6213
tumbler: 0.6112
broadcast: 0.6065
hemingway: 0.6001
--------------------------
top10 similar words to 'Guitar':
guitar: 0.9230
guitarist: 0.7893
tar: 0.6709
browsing: 0.663

In [14]:
for words in random_words:
    # Print top10 dissimilar words 
    print(f"top10 dissimilar words to '{words}':")
    dissimilar_words = fastText.wv.most_similar(negative=[words], topn=10)
    for dissim_word, dissim_score in dissimilar_words:
        print(f"{dissim_word}: {dissim_score:.4f}")
    print('----------------------------------')

top10 dissimilar words to 'Banana':
venue: 0.0398
equipment: 0.0360
cheap: 0.0275
view: 0.0131
couple: 0.0096
more: -0.0029
bag: -0.0270
low: -0.0272
truck: -0.0309
finding: -0.0358
----------------------------------
top10 dissimilar words to 'Elephant':
local: 0.0107
sake: -0.0149
mark: -0.0331
winning: -0.0416
on: -0.0443
outstanding: -0.0452
both: -0.0460
squeeze: -0.0575
peach: -0.0577
thursday: -0.0584
----------------------------------
top10 dissimilar words to 'Chair':
yum: 0.0133
yes: -0.0217
after: -0.0372
having: -0.0491
trying: -0.0546
might: -0.0610
midnight: -0.0669
should: -0.0671
die: -0.0685
mayo: -0.0746
----------------------------------
top10 dissimilar words to 'Mountain':
platter: 0.0153
cheese: 0.0067
sweet: 0.0006
burger: -0.0057
mojo: -0.0110
craving: -0.0235
were: -0.0255
that: -0.0260
joke: -0.0266
sandwich: -0.0355
----------------------------------
top10 dissimilar words to 'Guitar':
since: -0.0382
leg: -0.0587
universe: -0.0635
university: -0.0694
last: -0.

In [2]:
import fasttext
from huggingface_hub import hf_hub_download

#load pretrained facebook model for English words 'only'
model_path = hf_hub_download(repo_id="facebook/fasttext-et-vectors", filename="model.bin")
model = fasttext.load_model(model_path)

model.bin:   0%|          | 0.00/7.24G [00:00<?, ?B/s]



In [8]:
for word in random_words:
    # Print top10 similar words 
    print(f"top10 similar words to '{word}':")
    similar_words = model.get_nearest_neighbors(word, k=10)
    for sim_word, sim_score in similar_words:
        print(f"{sim_score}: {sim_word}")
    print('--------------------------')

top10 similar words to 'Banana':
Melt-Banana: 0.7825111746788025
Bananas: 0.7369200587272644
Manana: 0.7117103338241577
Banan: 0.689479649066925
Bananasilk: 0.6837583184242249
banana: 0.6514955163002014
Bananal: 0.6430939435958862
Bealanana: 0.6292361617088318
Tanana: 0.6283255219459534
manana: 0.6279790997505188
--------------------------
top10 similar words to 'Elephant':
elephant: 0.8018065094947815
Elephants: 0.7315071821212769
Elephunk: 0.6580228805541992
elephants: 0.6488975286483765
Elephantine: 0.6359829902648926
Elephandt: 0.6354495286941528
Eléphant: 0.6244930028915405
Elephas: 0.6066911816596985
elephantopus: 0.5800499320030212
430x360x92: 0.5627616047859192
--------------------------
top10 similar words to 'Chair':
Chairá: 0.7701334953308105
Chaira: 0.7142198085784912
Chairi: 0.678917646408081
Chairon: 0.6779542565345764
Chairs: 0.6754738092422485
Armchair: 0.6708164811134338
ChaiYau: 0.6594820618629456
mohair: 0.6559171080589294
Chaix: 0.6532657742500305
wheelchair: 0.6509

In [11]:
for words in random_words:
    # Print top10 dissimilar words 
    print(f"top10 dissimilar words to '{words}':")
    similar_words = model.get_nearest_neighbors(words, k=len(model.words))
    # Sort the similar_words list by similarity score
    similar_words_sorted = sorted(similar_words, key=lambda x: x[1], reverse=True)
    for sim_word, sim_score in similar_words[-10:]:
        print(f"{sim_score}: {sim_word}")
    print('----------------------------------')

top10 dissimilar words to 'Banana':
TPÃœ: -0.3627517521381378
: -0.36320987343788147
Тип: -0.3648880124092102
̲: -0.3671763837337494
окт: -0.3703448176383972
rω2: -0.37510403990745544
ÕKE: -0.37857621908187866
s.k.p.: -0.38327234983444214
Т.Э.: -0.39532211422920227
Р.: -0.3973846137523651
----------------------------------
top10 dissimilar words to 'Elephant':
NNSV: -0.3399503231048584
464СС: -0.3418683707714081
6-А: -0.34243568778038025
m2m: -0.3435748815536499
地官: -0.3436238467693329
ODUs: -0.3437238335609436
RRis: -0.3442862033843994
EVC: -0.34511634707450867
DCTV: -0.3505041003227234
KMi: -0.3691246807575226
----------------------------------
top10 dissimilar words to 'Chair':
Aru-: -0.31418243050575256
AgZn: -0.31466126441955566
XMM15: -0.31498265266418457
POTS: -0.3160350024700165
Šerr: -0.31936177611351013
NiH: -0.32311105728149414
442.: -0.32430708408355713
rähka: -0.32784029841423035
Kyne: -0.3318037688732147
ojode: -0.3370569944381714
----------------------------------
top10