In [2]:
# Installing reportlab which is used for creating PDFs
! pip install reportlab

Collecting reportlab
  Downloading reportlab-4.2.0-py3-none-any.whl.metadata (1.4 kB)
Collecting chardet (from reportlab)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading reportlab-4.2.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: chardet, reportlab
Successfully installed chardet-5.2.0 reportlab-4.2.0


In [3]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from gensim.models.fasttext import FastText # build and train Fast Text model
from gensim.models import Word2Vec # to Load the saved model
from gensim.models.fasttext import load_facebook_model
from tabulate import tabulate
import random
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet

# Downloading the pre-trained model from a website
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
! gunzip "cc.en.300.bin.gz"
nltk.download('wordnet', "/kaggle/working/nltk_data/")
nltk.download('omw-1.4', "/kaggle/working/nltk_data/")
! unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora
! unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora
# Adding this path to nltk so it can observe the files of the packages in it
nltk.data.path.append("/kaggle/working/nltk_data/")

from nltk.corpus import stopwords

# Download English stopwords
nltk.download('stopwords')

# Load English stopwords
english_stopwords = set(stopwords.words('english'))

--2024-04-22 09:19:25--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.14, 3.163.189.108, 3.163.189.51, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: 'cc.en.300.bin.gz'


2024-04-22 09:20:30 (65.9 MB/s) - 'cc.en.300.bin.gz' saved [4503593528/4503593528]

[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data/...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data/...
Archive:  /kaggle/working/nltk_data/corpora/wordnet.zip
   creating: /kaggle/working/nltk_data/corpora/wordnet/
  inflating: /kaggle/working/nltk_data/corpora/wordnet/lexnames  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/data.verb  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/index.ad

# Handling Yelp dataset

In [4]:
# Handling the Yelp Dataset
data_file_path = "/kaggle/input/yelp_academic_dataset_tip.json"
data_file_name = "yelp_academic_dataset_tip.json"
yelp_datafile = pd.read_json(data_file_path, lines=True)
print('List of all columns')
print(list(yelp_datafile))

List of all columns
['user_id', 'business_id', 'text', 'date', 'compliment_count']


# Taking subset for Gensim model

In [5]:

# Subset data for gensim fastText model
all_sentences = list(yelp_datafile['text']) # select "text" column only
part_of_sentences = all_sentences[0:3000] # select the first 3000 sample lines
print("\nSamples of Sentences\n [{}]".format(part_of_sentences[0:10]))


Samples of Sentences
 [['Avengers time with the ladies.', 'They have lots of good deserts and tasty cuban sandwiches', "It's open even when you think it isn't", 'Very decent fried chicken', 'Appetizers.. platter special for lunch', 'Chili Cup + Single Cheeseburger with onion, pickle, and relish + Vanilla Coca-Cola...so far.', "Saturday, Dec 7th 2013, ride Patco's Silver Sleigh w/ Santa & his elves on a decorated train into Center City. Trains leave from Lindenwold at 10am, 11:15am, & 12:30pm, and make all stops. Great for kids!", 'This is probably the best place in the cool Springs area to watch a game and eat', 'Tacos', 'Starbucks substitute in boring downtown Tampa. Ugh. Never again!']]


# Preprocessing the subset we took

In [6]:
lemmatizer = WordNetLemmatizer()
def process_text(document):
    
    document = re.sub(r'[^a-zA-Z0-9\s]', '', document)# Remove non-alphanumeric characters
    
    document = re.sub(r'\w\d\w', '', document)# Removing words that have numbers in them
    
    document = re.sub(r'[0-9]+', '', document) # Removing digits
    
    document = re.sub(r'\s+', ' ', document, flags=re.I) # Remove extra white space from text

    document = re.sub(r'\W', ' ', str(document)) # Remove all the special characters from text

    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document) # Remove all single characters from text
    
    document = document.lower() # Converting to Lowercase
    
    # Word tokenization 
    tokens = document.split()
    
    # Applying lemmatization
    lemma_txt = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Removing stopping words
    lemma_no_stop_txt = [word for word in lemma_txt if word not in english_stopwords]
    
    # Drop words less than 3 characters
    tokens = [word for word in tokens if len(word) > 3]
    
    # Getting unique words
    indices = np.unique(tokens, return_index=True)[1]
    cleaned_unique = np.array(tokens)[indices].tolist()
    
    return cleaned_unique
cleaned_reviews = [ process_text(document) for document in part_of_sentences]

In [7]:
# Print the first 15 processed reviews
for idx,document in enumerate(cleaned_reviews[0:15]):
    print(f"Review {idx+1}: {document}")

Review 1: ['avengers', 'ladies', 'time', 'with']
Review 2: ['cuban', 'deserts', 'good', 'have', 'lots', 'sandwiches', 'tasty', 'they']
Review 3: ['even', 'isnt', 'open', 'think', 'when']
Review 4: ['chicken', 'decent', 'fried', 'very']
Review 5: ['appetizers', 'lunch', 'platter', 'special']
Review 6: ['cheeseburger', 'chili', 'cocacolaso', 'onion', 'pickle', 'relish', 'single', 'vanilla', 'with']
Review 7: ['center', 'city', 'decorated', 'elves', 'from', 'great', 'into', 'kids', 'leave', 'lindenwold', 'make', 'patcos', 'ride', 'santa', 'saturday', 'silver', 'sleigh', 'stops', 'train', 'trains']
Review 8: ['area', 'best', 'cool', 'game', 'place', 'probably', 'springs', 'this', 'watch']
Review 9: ['tacos']
Review 10: ['again', 'boring', 'downtown', 'never', 'starbucks', 'substitute', 'tampa']
Review 11: ['order', 'soup', 'tortilla']
Review 12: ['back', 'coming', 'definitely', 'good', 'very', 'will']
Review 13: ['hotlight', 'must', 'stop']
Review 14: ['lets', 'yankees']
Review 15: ['basic

# Train our FastText model

In [8]:
def train_Fasttext(sentences,embedding_size,window_size,min_word,down_sampling,epochs,Save_model_filename):
    fast_Text_model = FastText(sentences,
    vector_size=embedding_size, # Dimensionality of the word vectors. ,
    window=window_size,
    min_count=min_word, # The model ignores all words with total frequency lower than this.
    sample=down_sampling, # threshold which higher-frequency words are randomly down sampled
    workers = 4, # Num threads to train the model (faster training with multicore comp.)
    sg=1, # Training algorithm: skip-gram if sg=1, otherwise CBOW.
    epochs=epochs) # Number of iterations (epochs) over the corpus

    fast_Text_model.save(Save_model_filename) # Save fastText gensim model

In [9]:
# selected values for Training parameters
embedding_size = 500
window_size = 4
min_word = 4
down_sampling = 1e-2
epochs=200

train_Fasttext(cleaned_reviews,embedding_size,window_size,min_word,down_sampling,epochs,"Custom_FastText")

In [10]:
# Load saved gensim fastText model
fast_Text_model = Word2Vec.load("/kaggle/working/Custom_FastText")

# Load Pre-trained model

In [11]:
# Load pretrained fastText word embeddings
pretrained_fastText_en = load_facebook_model('/kaggle/working/cc.en.300.bin')

# Collect word in our model vocabulary

In [12]:
words = list(fast_Text_model.wv.key_to_index)  # Collect words from the model's vocabulary

In [13]:
print(words)



# Printing some random words results to see models performance

In [14]:
import random

# Define a function to find top similar and dissimilar words
def find_top_n(word, word_list, model, n=10):
    similarities = model.wv.most_similar(word, topn=n)
    return similarities

# Loop to process words
for word in random.sample(words, 20):  # Take 20 random words from 'words' list
    print(f"Analyzing word: {word}\n")  # Print the word being analyzed

    # Custom Model
    similar_custom = find_top_n(word, words, fast_Text_model)
    dissimilar_custom = find_top_n(word, words, fast_Text_model, n=1000)[-10:]  # Get last 10 dissimilar
    
    # Pretrained Model
    similar_pretrained = find_top_n(word, words, pretrained_fastText_en)
    dissimilar_pretrained = find_top_n(word, words, pretrained_fastText_en, n=1000)[-10:]  # Get last 10 dissimilar
    
    # Print the output
    print("Top 10 similar words (custom model):")
    for similar_word, similarity in similar_custom:
        print(f"{similar_word}: {similarity:.4f}")
    
    print("\nTop 10 opposite words (custom model):")
    for opposite_word, similarity in dissimilar_custom:
        print(f"{opposite_word}: {similarity:.4f}")

    print("\nTop 10 similar words (pre-trained model):")
    for similar_word, similarity in similar_pretrained:
        print(f"{similar_word}: {similarity:.4f}")

    print("\nTop 10 opposite words (pre-trained model):")
    for opposite_word, similarity in dissimilar_pretrained:
        print(f"{opposite_word}: {similarity:.4f}")
    
    print("\n" + "-"*40 + "\n")  # Separator for readability


Analyzing word: behind

Top 10 similar words (custom model):
beignets: 0.5661
dude: 0.5193
cashier: 0.5035
alcohol: 0.4847
beware: 0.4794
mind: 0.4752
find: 0.4688
youll: 0.4672
brand: 0.4671
bathroom: 0.4584

Top 10 opposite words (custom model):
order: 0.0264
variety: 0.0250
lunch: 0.0181
such: 0.0106
with: 0.0106
fresh: 0.0025
together: 0.0001
sliders: -0.0033
yogurt: -0.0053
specials: -0.0079

Top 10 similar words (pre-trained model):
behing: 0.7815
beind: 0.6468
Behind: 0.6307
behin: 0.6221
behnd: 0.6189
behid: 0.6137
behindthe: 0.6007
behind.The: 0.5871
BEHIND: 0.5320
behi: 0.5294

Top 10 opposite words (pre-trained model):
swept: 0.2916
clearing: 0.2916
linger: 0.2916
house.Just: 0.2915
down.There: 0.2915
down.They: 0.2915
away.With: 0.2915
arout: 0.2915
forward.At: 0.2915
stymied: 0.2915

----------------------------------------

Analyzing word: even

Top 10 similar words (custom model):
event: 0.5197
evening: 0.4565
heaven: 0.3800
cajun: 0.3649
closing: 0.3604
advance: 0.3543


# Writting the results in pdf

In [15]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
import random

# Create a PDF document
pdf = SimpleDocTemplate("word_analysis_results.pdf", pagesize=letter)
styles = getSampleStyleSheet()

# Define a function to find top similar and dissimilar words
def find_top_n(word, word_list, model, n=10):
    similarities = model.wv.most_similar(word, topn=n)
    return similarities

# Define a function to write results to the PDF
def write_to_pdf(pdf, content_list):
    # Create paragraphs for each content item
    content = [Paragraph(item, styles["Normal"]) for item in content_list]
    pdf.build(content)

# List to store analysis results
analysis_results = []

# Loop to process words
for word in random.sample(words, 20):  # Take 20 random words from 'words' list
    analysis_results.append(f"Analyzing word: {word}\n")  # Add the word being analyzed

    # Custom Model
    similar_custom = find_top_n(word, words, fast_Text_model)
    dissimilar_custom = find_top_n(word, words, fast_Text_model, n=1000)[-10:]  # Get last 10 dissimilar

    # Pretrained Model
    similar_pretrained = find_top_n(word, words, pretrained_fastText_en)
    dissimilar_pretrained = find_top_n(word, words, pretrained_fastText_en, n=1000)[-10:]  # Get last 10 dissimilar

    # Add analysis results to the list
    analysis_results.append("Top 10 similar words (custom model):")
    for similar_word, similarity in similar_custom:
        analysis_results.append(f"{similar_word}: {similarity:.4f}")

    analysis_results.append("\nTop 10 opposite words (custom model):")
    for opposite_word, similarity in dissimilar_custom:
        analysis_results.append(f"{opposite_word}: {similarity:.4f}")

    analysis_results.append("\nTop 10 similar words (pre-trained model):")
    for similar_word, similarity in similar_pretrained:
        analysis_results.append(f"{similar_word}: {similarity:.4f}")

    analysis_results.append("\nTop 10 opposite words (pre-trained model):")
    for opposite_word, similarity in dissimilar_pretrained:
        analysis_results.append(f"{opposite_word}: {similarity:.4f}")

    analysis_results.append("\n" + "-"*40 + "\n")  # Separator for readability




# Conclusion of two models

In [16]:
# Add conclusion
conclusion = [
    "\n\nConclusion:",
    "In this analysis, we explored the similarity and dissimilarity of words using both a custom FastText model and a pre-trained FastText model.",
    "We found that at some words the pretrained model works better than the custom model as it really give very accurate results but at some other words the pretrained model just gives different forms of the same given word but the custom model gives diffrent words most of them are close to thegiven word meaning .",
    "Overall, the results indicate that two models works pretty good but its word dependent if the word is rare or not in the pre-trained model's vocabulary, it may not perform well. "
]

# Append conclusion to analysis_results
analysis_results += conclusion

# Write the analysis results to the PDF
write_to_pdf(pdf, analysis_results)

# Different way for writing in pdf using new different words


In [17]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

def write_to_pdf(pdf_file, analysis_results_custom, analysis_results_pretrained):
    c = canvas.Canvas(pdf_file, pagesize=letter)
    y = 750  # Starting y position
    x_custom = 50  # Starting x position for custom model
    x_pretrained = 350  # Starting x position for pretrained model
    line_spacing = 15  # Spacing between lines
    page_height = 800  # Height of the page
    bottom_margin = 50  # Bottom margin

    def check_new_page():
        nonlocal y
        if y < bottom_margin:
            c.showPage()
            y = page_height

    for custom_line, pretrained_line in zip(analysis_results_custom, analysis_results_pretrained):
        c.drawString(x_custom, y, custom_line)
        c.drawString(x_pretrained, y, pretrained_line)
        y -= line_spacing  # Adjust y position for next line
        check_new_page()

    c.save()

# Usage
pdf_file = "analysis_results.pdf"
# Define a function to find top similar and dissimilar words
def find_top_n(word, word_list, model, n=10):
    similarities = model.wv.most_similar(word, topn=n)
    return similarities


# List to store analysis results
analysis_results_custom = []
analysis_results_pretrained = []

# Loop to process words
for word in random.sample(words, 20):  # Take 20 random words from 'words' list
    analysis_results_custom.append(f"Analyzing word: {word}\n")  # Add the word being analyzed
    analysis_results_pretrained.append(f"Analyzing word: {word}\n")

    # Custom Model
    similar_custom = find_top_n(word, words, fast_Text_model)
    dissimilar_custom = find_top_n(word, words, fast_Text_model, n=1000)[-10:]  # Get last 10 dissimilar

    # Pretrained Model
    similar_pretrained = find_top_n(word, words, pretrained_fastText_en)
    dissimilar_pretrained = find_top_n(word, words, pretrained_fastText_en, n=1000)[-10:]  # Get last 10 dissimilar

    # Add analysis results to the list
    analysis_results_custom.append("Top 10 similar words (custom model):")
    for similar_word, similarity in similar_custom:
        analysis_results_custom.append(f"{similar_word}: {similarity:.4f}")

    analysis_results_custom.append("\nTop 10 opposite words (custom model):")
    for opposite_word, similarity in dissimilar_custom:
        analysis_results_custom.append(f"{opposite_word}: {similarity:.4f}")

    analysis_results_pretrained.append("\nTop 10 similar words (pre-trained model):")
    for similar_word, similarity in similar_pretrained:
        analysis_results_pretrained.append(f"{similar_word}: {similarity:.4f}")

    analysis_results_pretrained.append("\nTop 10 opposite words (pre-trained model):")
    for opposite_word, similarity in dissimilar_pretrained:
        analysis_results_pretrained.append(f"{opposite_word}: {similarity:.4f}")

    analysis_results_pretrained.append("\n" + "-"*40 + "\n")  # Separator for readability
    analysis_results_custom.append("\n" + "-"*40 + "\n")

write_to_pdf(pdf_file, analysis_results_custom, analysis_results_pretrained)


# trying unseen random words 

In [22]:
import random


def find_top_n(word, word_list, model, n=10):
    similarities = model.wv.most_similar(word, topn=n)
    return similarities

test_words = ["hi", "university", "college", "prepare","convert","process"]

# Loop to process words
for word in test_words:  # Take unseen words from test_words
    print(f"Analyzing word: {word}\n")  # Print the word being analyzed

    # Custom Model
    similar_custom = find_top_n(word, test_words, fast_Text_model)
    dissimilar_custom = find_top_n(word, test_words, fast_Text_model, n=1000)[-10:]  # Get last 10 dissimilar
    
    # Pretrained Model
    similar_pretrained = find_top_n(word, test_words, pretrained_fastText_en)
    dissimilar_pretrained = find_top_n(word, test_words, pretrained_fastText_en, n=1000)[-10:]  # Get last 10 dissimilar
    
    # Print the output
    print("Top 10 similar words (custom model):")
    for similar_word, similarity in similar_custom:
        print(f"{similar_word}: {similarity:.4f}")
    
    print("\nTop 10 opposite words (custom model):")
    for opposite_word, similarity in dissimilar_custom:
        print(f"{opposite_word}: {similarity:.4f}")
    
    print("\nTop 10 similar words (pre-trained model):")
    for similar_word, similarity in similar_pretrained:
        print(f"{similar_word}: {similarity:.4f}")
    
    print("\nTop 10 opposite words (pre-trained model):")
    for opposite_word, similarity in dissimilar_pretrained:
        print(f"{opposite_word}: {similarity:.4f}")
    
    print("\n" + "-"*40 + "\n")  # Separator for readability




Analyzing word: hi

Top 10 similar words (custom model):
highly: 0.7362
sushi: 0.7321
high: 0.7231
recommended: 0.4403
split: 0.4176
beware: 0.4139
website: 0.3999
recommend: 0.3968
priced: 0.3934
thursday: 0.3799

Top 10 opposite words (custom model):
building: -0.0106
money: -0.0162
most: -0.0195
sign: -0.0256
located: -0.0295
pickup: -0.0307
stout: -0.0315
without: -0.0329
move: -0.0422
next: -0.0438

Top 10 similar words (pre-trained model):
hi.: 0.7208
hello: 0.7038
Hi: 0.6973
hello.: 0.6432
hi-: 0.6391
hiii: 0.6334
hiiiii: 0.6148
Hi.: 0.6143
Hi-: 0.6103
hellow: 0.6086

Top 10 opposite words (pre-trained model):
iwr: 0.4103
uuh: 0.4102
iiy: 0.4102
jL: 0.4101
loL: 0.4101
wh: 0.4100
mso: 0.4100
idont: 0.4100
amanda.: 0.4099
heI: 0.4099

----------------------------------------

Analyzing word: university

Top 10 similar words (custom model):
city: 0.6591
quality: 0.6130
unfortunately: 0.5982
seemed: 0.5834
variety: 0.5806
trust: 0.5771
unique: 0.5666
wanted: 0.5605
asian: 0.5599
tru