## Importing Libraries

In [11]:
import pandas as pd
import re
from nltk.corpus import stopwords
from gensim.models.fasttext import FastText # build and train Fast Text model
from gensim.models import Word2Vec # to Save and Load Word2Vec models
from gensim.models.fasttext import load_facebook_model
from textblob import Word
import spacy
import numpy as np
import nltk
import string
from nltk.stem.wordnet import WordNetLemmatizer

## Preprocessing

In [5]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
def textPreprocessing(text):

  # Noramlization
  text = text.lower()
  text = re.sub(r'\n', '', text) # Remove new lines (\n)
  # Remove Punctuation
  translator = str.maketrans('', '', string.punctuation)
  text = text.translate(translator)

  # Tokenization
  text = text.split()
  # Removing Stop Words
  useless_words = nltk.corpus.stopwords.words("english")
  useless_words = useless_words + ['«', '»','©']
  text = [word for word in text if not word in useless_words]

  # # Cleaning
  # text = list(set(text))
  # Removing Numbers and words with Numbers
  text_filtered = [re.sub(r'\w*\d\w*', '', w) for w in text]
  text_filtered = [re.sub(r'\d', '', w) for w in text_filtered]
  text_filtered = [re.sub(r'\s+[a-zA-Z]\s+', '', w) for w in text_filtered ]
  
  
  # Removing empty strings
  text_filtered = [x for x in text_filtered if x != '']

  # Lemmatization
  nlp = spacy.load("en_core_web_sm")
  text_lem=[]
  for wordq in text_filtered:
    s=nlp(wordq)
    se = [token.lemma_ for token in s][0]
#     word = Word(se)
#     ss=word.lemmatize("v")
    # print(f"{wordq} ---> {word} ------> ",ss )
    text_lem.append(se)
  text_lem = [word for word in text_lem if len(word) > 2]
  # Get Unique Words
  text_lem = list(set(text_lem))

  return text_lem

## Load Dataset

In [7]:
yelp_datafile = pd.read_json("/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json", lines=True)

In [8]:
text=np.array(yelp_datafile['text'][0:1000])

In [9]:
listText=list(text)

## Apply Preprocessing

In [10]:
Preprocessed_Text=[textPreprocessing(texts) for texts in listText]

## FastText

In [12]:
def train_Fasttext(word_tokens,embedding_size,window_size,min_word,down_sampling,Save_model_filename):
    
  fast_Text_model = FastText(word_tokens,
                                 vector_size=embedding_size, # Dimensionality of the word vectors. ,
                                 window=window_size,
                                 min_count=min_word, # The model ignores all words with total frequency lower than this.
                                 sample=down_sampling, # threshold which higher-frequency words are randomly down sampled
                                 workers = 4, # Num threads to train the model (faster training with multicore comp.)
                                 sg=1, # Training algorithm: skip-gram if sg=1, otherwise CBOW.
                                 epochs=100) # Number of iterations (epochs) over the corpus
    
  fast_Text_model.save(Save_model_filename) # Save fastText gensim model

In [13]:
embedding_size = 300
window_size = 5
min_word = 5
down_sampling = 1e-2
train_Fasttext(Preprocessed_Text,embedding_size,window_size,min_word,down_sampling,'/kaggle/working/model')

In [16]:
fast_Text_model = Word2Vec.load('/kaggle/working/model')

In [17]:
wwords=fast_Text_model.wv.index_to_key

In [25]:
import random

testno=20

random_numbers = random.sample(range(len(wwords)), testno)

# print(random_numbers)



In [58]:
random_numbers=[76, 78, 167, 95, 165, 149, 101, 89, 80, 246, 34, 142, 247, 252, 38, 154, 49, 133, 57, 135]

[76,
 78,
 167,
 95,
 165,
 149,
 101,
 89,
 80,
 246,
 34,
 142,
 247,
 252,
 38,
 154,
 49,
 133,
 57,
 135]

In [26]:
simopp=[]
simoppwords=[]
for i in random_numbers:
    ww=wwords[i]
    similarwords=fast_Text_model.wv.most_similar(ww, topn=10)
    oppositewords=fast_Text_model.wv.most_similar(negative=ww, topn=10)
    print('similar words of',ww)
    simopp.append(ww)
    a=[]
    b=[]
    for similar,percent in similarwords:
        print('(',similar,') similarity:',percent)
        a.append(similar)
    print('opposite words of',ww)
    for opp,percent in oppositewords:
        print('(',opp,') similarity:',percent)
        b.append(opp)
    simoppwords.append(a)
    simoppwords.append(b)
    print('....................................................')

similar words of onion
( pickle ) similarity: 0.678815484046936
( enjoy ) similarity: 0.6416696906089783
( beef ) similarity: 0.6304275393486023
( option ) similarity: 0.6120758652687073
( cheese ) similarity: 0.5918620228767395
( portion ) similarity: 0.5595524311065674
( chili ) similarity: 0.5392857193946838
( must ) similarity: 0.5369822978973389
( wednesday ) similarity: 0.5224050879478455
( choice ) similarity: 0.5183122754096985
opposite words of onion
( sushi ) similarity: 0.0032931293826550245
( nice ) similarity: -0.0016629263991490006
( place ) similarity: -0.016874173656105995
( sure ) similarity: -0.020855389535427094
( wait ) similarity: -0.02393491379916668
( yet ) similarity: -0.03249121084809303
( office ) similarity: -0.036814603954553604
( that ) similarity: -0.04380433261394501
( get ) similarity: -0.05201029032468796
( seat ) similarity: -0.055999789386987686
....................................................
similar words of home
( town ) similarity: 0.555990040

In [28]:
count=0
for i in range(len(simopp)):
    print('Word: ',simopp[i])
    print('Similar:',simoppwords[count])
    count+=1
    print('Dissimilar:',simoppwords[count])
    count+=1

Word:  onion
Similar: ['pickle', 'enjoy', 'beef', 'option', 'cheese', 'portion', 'chili', 'must', 'wednesday', 'choice']
Dissimilar: ['sushi', 'nice', 'place', 'sure', 'wait', 'yet', 'office', 'that', 'get', 'seat']
Word:  home
Similar: ['town', 'authentic', 'pack', 'eat', 'dine', 'lot', 'come', 'selection', 'restaurant', 'style']
Dissimilar: ['fry', 'offer', 'work', 'white', 'class', 'cream', 'shrimp', 'yum', 'busy', 'ice']
Word:  park
Similar: ['parking', 'cuban', 'plus', 'bean', 'option', 'next', 'patio', 'kid', 'dog', 'item']
Dissimilar: ['beer', 'yummy', 'monday', 'server', 'end', 'bad', 'look', 'thank', 'taste', 'fish']
Word:  wait
Similar: ['ahead', 'less', 'minute', 'sure', 'seat', 'huge', 'long', 'use', 'quick', 'else']
Dissimilar: ['cake', 'cold', 'store', 'spring', 'roll', 'fast', 'onion', 'much', 'try', 'start']
Word:  brew
Similar: ['cuban', 'bean', 'breakfast', 'option', 'italian', 'bread', 'large', 'sandwich', 'family', 'coffee']
Dissimilar: ['would', 'could', 'show', 's

## Fast-Text Pretrained Model

In [29]:
pretrained_fastText_en = load_facebook_model('/kaggle/input/cc-en-300-bin/cc.en.300.bin')


In [30]:
simopp=[]
simoppwords=[]
for i in random_numbers:
    ww=wwords[i]
    similarwords=pretrained_fastText_en.wv.most_similar(ww, topn=10)
    oppositewords=pretrained_fastText_en.wv.most_similar(negative=ww, topn=10)
    print('similar words of',ww)
    simopp.append(ww)
    a=[]
    b=[]
    for similar,percent in similarwords:
        print('(',similar,') similarity:',percent)
        a.append(similar)
    print('opposite words of',ww)
    for opp,percent in oppositewords:
        print('(',opp,') similarity:',percent)
        b.append(opp)
    simoppwords.append(a)
    simoppwords.append(b)
    print('....................................................')

similar words of onion
( onions ) similarity: 0.8156759738922119
( onion. ) similarity: 0.7122210264205933
( garlic ) similarity: 0.69205641746521
( onion- ) similarity: 0.6882799863815308
( leek ) similarity: 0.6523008942604065
( onions. ) similarity: 0.6427907943725586
( shallots ) similarity: 0.6304621696472168
( onions- ) similarity: 0.6238100528717041
( scallion ) similarity: 0.6161937713623047
( bellpepper ) similarity: 0.6146571636199951
opposite words of onion
( Wingspan ) similarity: 0.18997180461883545
( Positivo ) similarity: 0.1795070767402649
( Qubo ) similarity: 0.17929282784461975
( Projeto ) similarity: 0.17653656005859375
( Pelotas ) similarity: 0.1764206886291504
( Compatibility ) similarity: 0.17552024126052856
( Natal ) similarity: 0.17005963623523712
( NATAL ) similarity: 0.16673892736434937
( Novo ) similarity: 0.16502691805362701
( NOTINHERITED ) similarity: 0.1614864021539688
....................................................
similar words of home
( house ) si

In [31]:
count=0
for i in range(len(simopp)):
    print('Word: ',simopp[i])
    print('Similar: ',simoppwords[count])
    count+=1
    print('Dissimilar: ',simoppwords[count])
    count+=1

Word:  onion
Similar:  ['onions', 'onion.', 'garlic', 'onion-', 'leek', 'onions.', 'shallots', 'onions-', 'scallion', 'bellpepper']
Dissimilar:  ['Wingspan', 'Positivo', 'Qubo', 'Projeto', 'Pelotas', 'Compatibility', 'Natal', 'NATAL', 'Novo', 'NOTINHERITED']
Word:  home
Similar:  ['house', 'home.The', 'home-', 'home.When', 'home.This', 'homee', 'home.So', 'home.It', 'home.While', 'home.']
Dissimilar:  ['Ningauble', 'Cerebellum', 'SoWhy', 'Ymblanter', 'Masem', 'Stifle', 'Bmf', 'Alsee', 'Anmccaff', 'Sandstein']
Word:  park
Similar:  ['parks', 'park.The', 'park.It', 'park.This', 'park.But', 'park.As', 'park.I', 'park.', 'park.A', 'park.So']
Dissimilar:  ['Zealotry', 'TORR', 'GuideTerms', 'A2X', 'Amatus', 'Veracious', 'Eviscerates', 'Steth', 'JohnC5', 'OFN']
Word:  wait
Similar:  ['waiting', 'wait.So', 'waited', 'WAIT', 'wait.', 'wait.And', 'wait.But', 'wait.You', 'wait.I', 'paitently']
Dissimilar:  ['Coarsening', 'ENTITIES', 'ProductsBed', 'ATMOSPHERIC', 'Convex', 'Halloy', 'wardrobes.Bed

In [32]:
pretrained_fastText_en.build_vocab(wwords, update = True)

In [33]:
pretrained_fastText_en.train(wwords, total_examples=pretrained_fastText_en.corpus_count, epochs=100) 

(2461, 135800)

In [34]:
for i in random_numbers:
    ww=wwords[i]
    similarwords=pretrained_fastText_en.wv.most_similar(ww, topn=10)
    oppositewords=pretrained_fastText_en.wv.most_similar(negative=ww, topn=10)
    print('similar words of',ww)
    for similar,percent in similarwords:
        print('(',similar,') similarity:',percent)
    print('opposite words of',ww)
    for opp,percent in oppositewords:
        print('(',opp,') similarity:',percent)
    print('....................................................')

similar words of onion
( onions ) similarity: 0.8156759738922119
( onion. ) similarity: 0.7122210264205933
( garlic ) similarity: 0.69205641746521
( onion- ) similarity: 0.6882799863815308
( leek ) similarity: 0.6523008942604065
( onions. ) similarity: 0.6427907943725586
( shallots ) similarity: 0.6304621696472168
( onions- ) similarity: 0.6238100528717041
( scallion ) similarity: 0.6161937713623047
( bellpepper ) similarity: 0.6146571636199951
opposite words of onion
( Wingspan ) similarity: 0.18997180461883545
( Positivo ) similarity: 0.1795070767402649
( Qubo ) similarity: 0.17929282784461975
( Projeto ) similarity: 0.17653656005859375
( Pelotas ) similarity: 0.1764206886291504
( Compatibility ) similarity: 0.17552024126052856
( Natal ) similarity: 0.17005963623523712
( NATAL ) similarity: 0.16673892736434937
( Novo ) similarity: 0.16502691805362701
( NOTINHERITED ) similarity: 0.1614864021539688
....................................................
similar words of home
( house ) si