In [1]:
import json
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Initialize WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization using NLTK
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
     # Remove words with 3 characters or fewer
    tokens = [word for word in tokens if len(word) > 3]
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [4]:
# Open the file containing the JSON data
print("Opening file...")
with open("/content/yelp_academic_dataset_tip.json", "r") as file:
    print("File opened successfully.")

    preprocessed_corpus = []

    # Read each line and process it as JSON
    print("Processing JSON data...")
    for line in file:
        try:
            # Parse the JSON data from the current line
            json_data = json.loads(line)
            # Extract the "text" attribute from the JSON object
            text_attribute = json_data.get("text")
            # Preprocess the text
            preprocessed_text = preprocess_text(text_attribute)
            # Append the preprocessed text to the corpus
            preprocessed_corpus.append(preprocessed_text)
        except json.JSONDecodeError:
            print("Error: Failed to parse JSON data in line:", line)
    print("JSON data processed successfully.")

print("Number of preprocessed text attributes in the corpus:", len(preprocessed_corpus))


Opening file...
File opened successfully.
Processing JSON data...
Error: Failed to parse JSON data in line: "business_id":"mlaj6BRLOVtlEMRTZhUVvw","text":"You can never go wrong with the roasted veggie sandwich.","date":"2010-11-23 17:33:55","compliment_count":0}

Error: Failed to parse JSON data in line: {"user_id":"a93yzcakAnlq0hwpEL_MGg","business_id":"
JSON data processed successfully.
Number of preprocessed text attributes in the corpus: 138008


In [5]:
import json
from gensim.models.fasttext import FastText
from gensim.utils import simple_preprocess

# Tokenize the preprocessed text
tokenized_corpus = [simple_preprocess(text) for text in preprocessed_corpus]

# Set parameters for the FastText model
vector_size = 100
window_size = 5
min_count = 3
sg = 3

# Create and train the FastText model
print("Training FastText model...")
model = FastText(
    sentences=tokenized_corpus,
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
    sg=sg
)
print("FastText model trained successfully.")
model.save("fasttext_model")

Training FastText model...
FastText model trained successfully.


In [6]:
# Load the trained FastText model
print("Loading trained FastText model...")
model = FastText.load("fasttext_model")
print("Model loaded successfully.")



Loading trained FastText model...
Model loaded successfully.


In [7]:
similar_words = model.wv.most_similar("love", topn=10)
# Print the similar words
print("Words similar to 'love':")
for word, similarity in similar_words:
    print(f"{word}: {similarity}")


Words similar to 'love':
lovelove: 0.9332840442657471
lovey: 0.8930991291999817
lovin: 0.8246676325798035
lover: 0.7998213768005371
loved: 0.7877277135848999
awesome: 0.7859285473823547
clove: 0.784860372543335
glove: 0.7801545262336731
fantabulous: 0.7760823965072632
amazinggg: 0.7753641605377197


In [8]:
# Function to find nearest words to a given list of words
def find_nearest_words(words_list, topn=10):
    nearest_words_dict = {}
    for word in words_list:
        try:
            # Find nearest words
            nearest_words = model.wv.most_similar(word, topn=topn)
            nearest_words_dict[word] = nearest_words
        except KeyError:
            nearest_words_dict[word] = f"'{word}' not in vocabulary."
    return nearest_words_dict

input_words = ["stop", "play", "delicious", "eat", "game", "self", "phone", "grow", "happy", "sad",
               "fun", "interesting", "boring", "exciting", "tired", "sleepy", "energetic", "lazy",
               "book", "movie", "music", "dance", "sing", "art"]
nearest_words_dict = find_nearest_words(input_words)

# Print the nearest words and their similarity scores for each input word
for word, nearest_words in nearest_words_dict.items():
    print(f"Nearest words to '{word}':")
    for nearest_word, similarity_score in nearest_words:
        print(f"Word: {nearest_word}, Similarity: {similarity_score}")
    print()


Loading trained FastText model...
Model loaded successfully.
Nearest words to 'stop':
Word: pokestop, Similarity: 0.7930882573127747
Word: stopped, Similarity: 0.7513830661773682
Word: stoudts, Similarity: 0.7495892643928528
Word: stove, Similarity: 0.743057131767273
Word: staple, Similarity: 0.741254985332489
Word: stollen, Similarity: 0.7346358299255371
Word: avis, Similarity: 0.7276335954666138
Word: ihop, Similarity: 0.7270922064781189
Word: grab, Similarity: 0.7164693474769592
Word: lucas, Similarity: 0.7155646681785583

Nearest words to 'play':
Word: playdoh, Similarity: 0.9583803415298462
Word: playin, Similarity: 0.9358537197113037
Word: playoff, Similarity: 0.9121178388595581
Word: playing, Similarity: 0.8847506642341614
Word: playground, Similarity: 0.8666591644287109
Word: playtime, Similarity: 0.8618835210800171
Word: player, Similarity: 0.861743152141571
Word: video, Similarity: 0.8402920961380005
Word: played, Similarity: 0.8283042311668396
Word: juke, Similarity: 0.82830

In [12]:
def find_furthest_words(words_list, topn=10):
    furthest_words_dict = {}
    for word in words_list:
        try:
            # the word vector
            word_vector = model.wv[word]
            # furthest words
            furthest_words = model.wv.similar_by_vector(-word_vector, topn=topn)
            furthest_words_dict[word] = furthest_words
        except KeyError:
            furthest_words_dict[word] = f"'{word}' not in vocabulary."
    return furthest_words_dict

input_words = ["stop", "play", "delicious", "eat", "game", "self", "phone", "grow", "happy", "sad",
               "fun", "interesting", "boring", "exciting", "tired", "sleepy", "energetic", "lazy",
               "book", "movie", "music", "dance", "sing", "art"]
furthest_words_dict = find_furthest_words(input_words)

# Print the furthest words and their similarity scores for each input word
for word, furthest_words in furthest_words_dict.items():
    print(f"Furthest words from '{word}':")
    for furthest_word, similarity_score in furthest_words:
        print(f"Word: {furthest_word}, Similarity: {similarity_score}")
    print()

Furthest words from 'stop':
Word: onion, Similarity: -0.26433679461479187
Word: cant, Similarity: -0.2758829891681671
Word: vegetarian, Similarity: -0.29461660981178284
Word: hair, Similarity: -0.2957654297351837
Word: doesnt, Similarity: -0.2967579960823059
Word: toni, Similarity: -0.29765549302101135
Word: tender, Similarity: -0.3015691637992859
Word: manager, Similarity: -0.31387823820114136
Word: rice, Similarity: -0.3158303499221802
Word: burger, Similarity: -0.31654056906700134

Furthest words from 'play':
Word: tasted, Similarity: -0.17939837276935577
Word: toasted, Similarity: -0.20852871239185333
Word: aged, Similarity: -0.21130476891994476
Word: crusted, Similarity: -0.21937043964862823
Word: worst, Similarity: -0.22429533302783966
Word: addicted, Similarity: -0.22733747959136963
Word: rated, Similarity: -0.23155486583709717
Word: ever, Similarity: -0.23361249268054962
Word: messed, Similarity: -0.2352067232131958
Word: addict, Similarity: -0.23688194155693054

Furthest words