In [1]:
pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [2]:
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import re
import pandas as pd

In [5]:
# ----------------------
# Dataset
# ----------------------
# Load the hotel reviews from a CSV expected in the current working directory.
# Later cells read its "Review" column (tokenized as text) and its "Rating"
# column (compared against 3 to build the binary label).
data = pd.read_csv("tripadvisor_hotel_reviews.csv")

In [7]:
# ----------------------
# Preprocessing
# ----------------------
def tokenize(text):
    """Lower-case ``text`` and split it into purely alphabetic tokens.

    Every character other than ``a-z`` and whitespace is deleted (not replaced
    by a space), so words joined only by punctuation merge into one token,
    e.g. ``"stop-now"`` becomes ``"stopnow"`` and ``"don't"`` becomes ``"dont"``.

    Parameters
    ----------
    text : str
        Raw review text. Non-string values (``None``, ``NaN`` from an empty
        CSV cell, ...) are treated as an empty review.

    Returns
    -------
    list of str
        The whitespace-separated tokens; empty if nothing alphabetic remains.
    """
    # Missing cells arrive as float NaN / None and have no .lower(); treat them
    # as an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return []
    cleaned = re.sub(r"[^a-z\s]", "", text.lower())
    return cleaned.split()

# Binary sentiment target: a rating <= 3 becomes 0 (negative), anything else 1
# (positive). np.where labels the whole column in one vectorized pass instead
# of calling a Python lambda once per row; the resulting labels are identical.
data["Rating_binary"] = np.where(data["Rating"] <= 3, 0, 1)
# One token list per review (features) and the matching label array (target).
X = data["Review"].apply(tokenize)
y = data["Rating_binary"].to_numpy()

In [8]:
# ----------------------
# Sentence vector
# ----------------------
def sentence_vector(tokens, model):
    """Average the word vectors of the in-vocabulary tokens of one sentence.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of the sentence; tokens missing from ``model.wv`` are skipped.
    model : object
        Trained embedding model exposing ``wv`` (supports ``in`` and ``[]``
        lookup and has a ``vectors`` array) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``model.vector_size``: the element-wise mean of
        the known tokens' vectors, or all zeros when no token is known.
    """
    keyed = model.wv
    known = [keyed[tok] for tok in tokens if tok in keyed]
    if not known:
        # Use the embedding dtype for the fallback so that every sentence
        # vector has the same dtype regardless of which branch produced it
        # (np.zeros alone would default to float64).
        return np.zeros(model.vector_size, dtype=keyed.vectors.dtype)
    return np.mean(known, axis=0)

In [27]:
# Hold out 20% of the reviews for testing; stratify=y keeps the label
# proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Skip-gram (sg=1) embeddings are learned from the training reviews only, so
# no test text influences the vectors.
# NOTE(review): with workers=4 the training is presumably not bit-for-bit
# reproducible between runs, so downstream metrics can vary slightly --
# confirm against the gensim Word2Vec documentation.
model = Word2Vec(
    sentences=X_train,
    vector_size=100,
    window=5,
    min_count=2,
    sg=1,
    workers=4,
    epochs=5,
)

# One averaged embedding per review, for each split.
X_train_vecs, X_test_vecs = (
    np.array([sentence_vector(review, model) for review in split])
    for split in (X_train, X_test)
)

In [28]:
# ----------------------
# Simple sentiment prototypes
# ----------------------
# Class prototypes: the centroid of the training review vectors carrying
# label 1 (positive) and label 0 (negative), in that order.
pos_vec, neg_vec = (
    X_train_vecs[y_train == label].mean(axis=0) for label in (1, 0)
)

In [29]:
def cosine(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        1-D vectors of equal length.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` when either vector has
        zero norm (the similarity is undefined there, and dividing would
        produce NaN).
    """
    # Compute each norm once and reuse it for both the guard and the division.
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return np.dot(a, b) / (norm_a * norm_b)

def predict(v):
    """Classify a sentence vector by its nearest sentiment prototype.

    Returns 1 when ``v`` is strictly more cosine-similar to ``pos_vec`` than
    to ``neg_vec``; otherwise (including ties) returns 0.
    """
    closer_to_positive = cosine(v, pos_vec) > cosine(v, neg_vec)
    return int(closer_to_positive)

In [32]:
# Predict a label for every held-out review vector, then report the metrics.
y_pred = np.array(list(map(predict, X_test_vecs)))

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7728714320565991
              precision    recall  f1-score   support

           0       0.55      0.77      0.64      1080
           1       0.90      0.77      0.83      3019

    accuracy                           0.77      4099
   macro avg       0.73      0.77      0.74      4099
weighted avg       0.81      0.77      0.78      4099



In [41]:
def predict_sentence(sentence):
    """Return ``"positive"`` or ``"negative"`` for a raw text ``sentence``.

    The sentence is tokenized, averaged into a single vector with the trained
    ``model``, and classified with ``predict``.
    """
    label = predict(sentence_vector(tokenize(sentence), model))
    if label == 1:
        return "positive"
    return "negative"

# Spot-check the classifier on two hand-written reviews, one per sentiment.
for example in (
    "i loved this hotel, i would definitely go here again",
    "there were rats in the closet, would not reccommend",
):
    print(predict_sentence(example))

positive
negative
