In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np
from joblib import dump, load
from gensim.models import Word2Vec

# Load the sampled reviews and keep the contents of the 'text' column as a list.
reviews_df = pd.read_json('reviews_sampled.json')
reviews = reviews_df['text'].tolist()
# Read the candidate tags. The encoding is spelled out because JSON is UTF-8,
# whereas open() would otherwise fall back to the platform's locale encoding.
with open('Tags.json', 'r', encoding='utf-8') as file:
    tags = json.load(file)['tags']

def preprocess_text(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

# Normalise every review with preprocess_text before feature extraction.
preprocessed_reviews = list(map(preprocess_text, reviews))

# TF-IDF features, with scikit-learn's built-in English stop-word list.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_reviews)

# K-means
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(tfidf_matrix)
# Persist the fitted K-means model and the vectorizer it was trained with.
dump(kmeans, 'kmeans_model.joblib')
dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

# Word2Vec model training and save
# Bucket the reviews by the cluster label K-means assigned to each of them.
# NOTE(review): the buckets hold the raw reviews, not preprocessed_reviews, so
# the Word2Vec vocabulary keeps the original casing — confirm this is intended.
clustered_reviews = [[] for _ in range(num_clusters)]
for position, cluster_id in enumerate(kmeans.labels_):
    clustered_reviews[cluster_id].append(reviews[position])

# Train one Word2Vec model per cluster on whitespace-tokenised reviews.
models = []
for cluster_id, cluster_texts in enumerate(clustered_reviews):
    sentences = [document.split() for document in cluster_texts]
    cluster_model = Word2Vec(sentences=sentences, vector_size=100, window=15, min_count=1, workers=4)
    cluster_model.save(f"word2vec_model_cluster_{cluster_id}.model")
    models.append(cluster_model)




In [4]:
# This is an example of how to use the saved models.
# Everything the cell uses is imported here (including json and numpy), so it
# does not depend on names left over from the training cell.
import json

import numpy as np
from joblib import load
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

# Load K-means and the TF-IDF vectorizer saved alongside it.
# NOTE: joblib.load unpickles the file, which can run arbitrary code — only
# load model files that come from a trusted source.
kmeans = load('kmeans_model.joblib')
tfidf_vectorizer = load('tfidf_vectorizer.joblib')
# Load one Word2Vec model per cluster. The count is read from the fitted
# K-means model so every predicted label has a matching entry in the list.
models = [Word2Vec.load(f'word2vec_model_cluster_{i}.model') for i in range(kmeans.n_clusters)]

def preprocess_text(text):
    """Lowercase ``text`` and return the result."""
    normalized = text.lower()
    return normalized

def find_closest_tags(text, model, tags, topn=3):
    """Rank candidate tags by their mean Word2Vec similarity to the words of ``text``.

    The text is normalised with ``preprocess_text`` and split on whitespace.
    Words and tags that are missing from the model vocabulary are ignored.
    Each remaining tag is scored with the mean of
    ``model.wv.similarity(word, tag)`` over the remaining words.

    Args:
        text: The text to tag.
        model: Object exposing ``wv.key_to_index`` and ``wv.similarity``
            (the example cell passes a gensim ``Word2Vec`` model).
        tags: Iterable of candidate tag strings.
        topn: Maximum number of ``(tag, score)`` pairs to return.

    Returns:
        A list of ``(tag, score)`` tuples sorted by descending score, at most
        ``topn`` long. The list is empty when no word of ``text`` or no tag is
        in the vocabulary.
    """
    vocabulary = model.wv.key_to_index
    valid_words = [word for word in preprocess_text(text).split() if word in vocabulary]
    if not valid_words:
        return []

    # NOTE(review): tags are looked up verbatim while the text is lowercased
    # above — confirm the vocabulary's casing matches both.
    # Only pairwise similarities are needed, so the tag vectors themselves are
    # never fetched from the model.
    similarities = [
        (tag, np.mean([model.wv.similarity(word, tag) for word in valid_words]))
        for tag in tags
        if tag in vocabulary
    ]

    # Negating the score sorts highest-first and keeps input order for ties.
    return sorted(similarities, key=lambda pair: -pair[1])[:topn]

def predict_tags(text, kmeans, models, tfidf_vectorizer, tags, topn=3):
    """Predict tags for ``text`` with the Word2Vec model of its K-means cluster.

    The text is normalised with ``preprocess_text`` and vectorised with
    ``tfidf_vectorizer``. ``kmeans`` assigns it a cluster label, which selects
    the entry of ``models`` handed to ``find_closest_tags``.

    Returns:
        Whatever ``find_closest_tags`` returns for the normalised text, the
        selected model, ``tags`` and ``topn``.
    """
    normalized = preprocess_text(text)

    # transform() takes a collection of documents, hence the one-item list;
    # predict() answers with one label per document, so take the first.
    features = tfidf_vectorizer.transform([normalized])
    cluster_id = kmeans.predict(features)[0]

    return find_closest_tags(normalized, models[cluster_id], tags, topn)

# Read the candidate tags. The encoding is spelled out because JSON is UTF-8,
# whereas open() would otherwise fall back to the platform's locale encoding.
with open('Tags.json', 'r', encoding='utf-8') as file:
    tags = json.load(file)['tags']

# Sample review used to demonstrate the prediction pipeline end to end.
new_text = "This is a delicious and quaint Korean eatery located at the end jasper Ave, with lots of parking behind the building. I was greeted immediately at the door by more than one employee. Overall, the service was excellent. Our waiter was friendly and attentive but gave us just enough to space. The restaurant is medium sized but with a very nice set up and a bar in the back. It looks very clean with carefully thought out decor."
predicted_tags = predict_tags(new_text, kmeans, models, tfidf_vectorizer, tags)
print(predicted_tags)





[('Kid-friendly', 0.16334255), ('Tasty', 0.1629617), ('Burnt', 0.15900302)]
