In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import string
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
restaurant_data = pd.read_pickle("/content/drive/MyDrive/data/final_data_restaurant_yelp.gzip")
reviews_data = pd.read_pickle("/content/drive/MyDrive/data/final_data_review_yelp.gzip")
user_data = pd.read_pickle("/content/drive/MyDrive/data/final_data_user_yelp.gzip")

In [5]:
business_CA_list = restaurant_data[restaurant_data['state'] == "CA"]['business_id'].tolist()
business_CA = restaurant_data[restaurant_data['state'] == "CA"]
reviews_CA = reviews_data[reviews_data['business_id'].isin(business_CA_list)]

In [6]:
business_CA.head(2)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
85,IDtLPgUrqorrpqSLdfMhZQ,Helena Avenue Bakery,"131 Anacapa St, Ste C",Santa Barbara,CA,93101,34.414445,-119.690672,4.0,389,1,"{'RestaurantsTakeOut': 'True', 'NoiseLevel': '...","Food, Restaurants, Salad, Coffee & Tea, Breakf...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-14:0', '..."
141,SZU9c8V2GuREDN5KgyHFJw,Santa Barbara Shellfish Company,230 Stearns Wharf,Santa Barbara,CA,93101,34.408715,-119.685019,4.0,2404,1,"{'OutdoorSeating': 'True', 'RestaurantsAttire'...","Live/Raw Food, Restaurants, Seafood, Beer Bar,...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ..."


In [7]:
reviews_CA.head(2)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
9,pUycOfUwM8vqX7KjRRhUEA,59MxRhNVhU9MYndMkz0wtw,gebiRewfieSdtt17PTW6Zg,3,0,0,0,Had a party of 6 here for hibachi. Our waitres...,2016-07-25 07:31:06
23,eCiWBf1CJ0Zdv1uVarEhhw,OhECKhQEexFypOMY6kypRw,vC2qm1y3Au5czBtbhc-DNw,4,0,0,0,"Yes, this is the only sushi place in town. How...",2013-09-04 03:48:20


In [8]:
train_df, test_df = train_test_split(reviews_CA, train_size=0.8)

In [9]:
len(train_df)

169398

In [10]:
def text_preprocess(review):
    punctuation = set(string.punctuation)
    stemmer = PorterStemmer()
    stop_W = stopwords.words("english")
    r = ''.join([c for c in review.lower() if (not c in punctuation)])
    word_list = []
    for w in r.split():
        w = stemmer.stem(w)
        if (w not in stop_W) and w.isalpha():
            word_list.append(w)
    return word_list

In [11]:
review_list = []
star_list = []
for _, i in train_df.iterrows():
    review_list.append(text_preprocess(i['text']))
    star_list.append(i['stars'])

print(len(review_list))

169398


In [12]:
star_list = []
for _, i in train_df.iterrows():
    star_list.append(i['stars'])

print(star_list[:5])

[2, 5, 3, 5, 3]


In [13]:
print(len(review_list[0]))
print(review_list[0])

85
['unfortun', 'friend', 'good', 'experi', 'long', 'weekend', 'high', 'hope', 'open', 'wa', 'line', 'say', 'outsid', 'patio', 'dog', 'good', 'expens', 'get', 'food', 'small', 'term', 'portion', 'realli', 'got', 'us', 'wa', 'one', 'first', 'tabl', 'get', 'order', 'meal', 'came', 'minut', 'anoth', 'roll', 'minut', 'later', 'first', 'meal', 'done', 'poor', 'last', 'friend', 'didnt', 'get', 'food', 'wait', 'anoth', 'minut', 'minut', 'sinc', 'order', 'roll', 'noth', 'waiter', 'didnt', 'seem', 'know', 'roll', 'wa', 'never', 'came', 'back', 'tabl', 'ask', 'check', 'order', 'manag', 'came', 'apolog', 'said', 'wait', 'go', 'mcdonald', 'next', 'door', 'paid', 'meal', 'meal', 'wa', 'mia', 'terribl', 'servic', 'price']


In [14]:
model = Word2Vec(review_list,
                 min_count=5,             # Words/items with fewer instances are discarded
                 vector_size=10,          # Model dimensionality
                 window=3,                # Window size
                 sg=1,
                 workers=6,
                 epochs=5,
                 compute_loss=True)       # Skip-gram model

In [15]:
x = model.wv[review_list[0]]
print(x)
print(x.shape)

[[ 1.31697997e-01 -5.22153795e-01  7.48979509e-01  2.52484947e-01
   3.50085258e-01 -2.60772228e-01  8.62513363e-01  2.77909070e-01
  -3.49543661e-01 -3.03441048e-01]
 [-1.70440674e-01 -3.12092096e-01  5.70476890e-01 -6.68464839e-01
   8.31451893e-01  5.35831213e-01  7.62406290e-01  3.62655759e-01
  -5.37964404e-01 -1.61028519e-01]
 [ 2.33327240e-01 -5.55725217e-01  3.92843872e-01 -3.93013775e-01
   1.63206145e-01 -6.31449342e-01  8.69699597e-01  8.15394700e-01
  -6.00811779e-01  1.18356552e-02]
 [-5.13341576e-02 -7.55101562e-01  8.11546028e-01  4.10146356e-01
   2.84268647e-01 -2.28736207e-01  1.54044640e+00 -2.21156090e-01
  -6.84725583e-01  2.75308043e-01]
 [-2.74378628e-01 -3.07352066e-01  4.75385696e-01  3.92901421e-01
   9.71293867e-01  3.30812484e-02  1.24129236e+00  8.41333091e-01
  -1.66732952e-01 -7.37943590e-01]
 [-3.42767805e-01  1.03433192e-01  7.47184098e-01 -4.78618562e-01
   1.08440614e+00  3.46898228e-01  1.80640221e+00 -4.35633212e-02
  -1.43320546e-01 -3.96107703e-01

In [38]:
# compresses the 47 rows (1 for each word in the review) to a single row vector. Number of columns indicate
# the number of features each row has
x.mean(axis=0)

array([ 0.38967112, -0.23730126,  0.37253663,  0.1108247 ,  0.33165058,
        0.11702972,  0.88914424,  0.5227909 , -0.3996278 , -0.34039146],
      dtype=float32)

In [52]:
# NOT WORKING
# X = [model.wv[review].mean(axis=0) for review in review_list]
# X = [np.mean([model.wv[word] for word in review if word in model.wv], axis=0) for review in review_list]
# Y = star_list

# print(X[:3])
# print(len(X))
# print(len(Y))

[array([ 0.38967112, -0.23730126,  0.37253663,  0.1108247 ,  0.33165058,
        0.11702972,  0.88914424,  0.5227909 , -0.3996278 , -0.34039146],
      dtype=float32), array([ 0.5674972 , -0.29906848,  0.54576904,  0.19047713,  0.32916245,
       -0.1254886 ,  0.823287  ,  0.2171976 , -0.51600915, -0.48377493],
      dtype=float32), array([ 0.4741739 , -0.29110128,  0.5457828 ,  0.06557795,  0.22898719,
        0.0981546 ,  0.44594666,  0.77935565, -0.5303215 , -0.46374884],
      dtype=float32)]
169368
169398


In [18]:
X = []
Y = []

for i in range(len(review_list)):
    if any(word in model.wv for word in review_list[i]):
        X.append(np.mean([model.wv[word] for word in review_list[i] if word in model.wv], axis=0))
        Y.append(star_list[i])

print(len(X))
print(len(Y))

169366
169366


In [19]:
X = np.array(X)
Y = np.array(Y)
print(X.shape)
print(Y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

(169366, 10)
(169366,)


### Linear regression

In [58]:
# Train linear regression model
regressor = LinearRegression()
regressor.fit(X_train_scaled, y_train)

# Make predictions
predictions = regressor.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 1.0050305838202302


In [68]:
print(f'Mean Squared Error: {mean_absolute_error(y_test, predictions)}')

Mean Squared Error: 0.8075148890937255


In [69]:
print(f'Mean Squared Error after rounding: {mean_squared_error(y_test, np.round(predictions))}')

Mean Squared Error after rounding: 1.0904233335301412


In [66]:
print(predictions)
print(y_test)

[3.6879315 5.076661  3.975831  ... 4.092809  4.6691947 3.849443 ]
[5 3 4 ... 5 5 5]


### Logistic regression

In [24]:
# Train logistic regression model
regressor = LogisticRegression()
regressor.fit(X_train_scaled, y_train)

# Make predictions
predictions = regressor.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5890063175296688


In [26]:
X_train_scaled.shape

(135492, 10)

### ANN

In [38]:
from keras.models import Sequential
from keras.layers import Dense

classifier = Sequential()

classifier.add(Dense(units = 16, activation = 'relu', input_dim = 10))
classifier.add(Dense(units = 8, activation = 'relu'))
classifier.add(Dense(units = 5, activation = 'softmax'))

In [39]:
classifier.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])

In [42]:
y_train = [i-1 for i in y_train]
y_test = [i-1 for i in y_test]
print(y_train)

[0, 1, 4, 2, 4, 0, 4, 1, 3, 3, 2, 4, 3, 0, 4, 3, 4, 2, 2, 3, 0, 4, 3, 1, 4, 3, 0, 4, 2, 4, 3, 3, 3, 4, 4, 4, 4, 0, 1, 4, 3, 4, 3, 4, 1, 2, 4, 0, 4, 3, 3, 4, 0, 0, 4, 4, 1, 1, 3, 4, 4, 2, 4, 3, 4, 4, 4, 0, 3, 3, 2, 4, 1, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 2, 4, 4, 4, 0, 3, 4, 2, 4, 4, 3, 4, 0, 1, 4, 4, 3, 4, 4, 4, 4, 4, 1, 4, 4, 4, 2, 4, 4, 3, 0, 2, 0, 2, 1, 4, 1, 4, 4, 4, 3, 1, 4, 3, 4, 4, 0, 0, 4, 1, 4, 4, 4, 3, 0, 2, 4, 4, 4, 3, 4, 4, 4, 3, 4, 2, 4, 4, 2, 4, 4, 4, 4, 1, 4, 2, 4, 1, 3, 1, 3, 0, 0, 3, 3, 3, 4, 4, 4, 1, 4, 3, 2, 3, 3, 4, 3, 1, 0, 3, 4, 3, 3, 2, 4, 0, 4, 0, 3, 4, 4, 1, 3, 1, 4, 4, 4, 4, 0, 3, 3, 0, 0, 4, 0, 4, 2, 4, 4, 3, 4, 4, 0, 1, 4, 4, 3, 4, 4, 1, 3, 0, 3, 4, 1, 4, 3, 4, 0, 4, 3, 4, 3, 4, 1, 3, 4, 4, 4, 4, 4, 0, 4, 4, 2, 4, 3, 4, 2, 4, 1, 4, 3, 3, 4, 4, 4, 4, 2, 0, 3, 4, 1, 4, 3, 4, 1, 3, 4, 3, 4, 3, 2, 4, 4, 2, 2, 0, 4, 3, 4, 4, 1, 0, 3, 2, 2, 3, 4, 4, 1, 0, 4, 0, 4, 4, 2, 0, 4, 2, 3, 2, 3, 0, 3, 4, 4, 4, 4, 3, 3, 4, 1, 3, 0, 3, 4, 2, 0, 4, 4, 3, 0, 4, 3, 3, 4, 4, 1, 0, 

In [44]:
print(type(y_train))

<class 'list'>


In [47]:
y_train = np.array(y_train)
y_test = np.array(y_test)
print(type(y_train))

<class 'numpy.ndarray'>


In [48]:
classifier.fit(X_train_scaled, y_train, batch_size = 25, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x794a9aea8f10>

In [52]:
y_pred_prob = classifier.predict(X_test_scaled)

y_pred_classes = y_pred_prob.argmax(axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5963275668654425


In [55]:
y_pred = [i+1 for i in y_pred]
y_test = [i+1 for i in y_test]

In [56]:
print(y_pred)
print(y_test)

[4, 5, 5, 5, 5, 4, 1, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 5, 1, 5, 5, 1, 1, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 4, 5, 2, 4, 5, 1, 4, 5, 4, 5, 4, 5, 4, 5, 5, 4, 5, 1, 5, 5, 5, 5, 5, 4, 4, 4, 3, 5, 5, 4, 5, 5, 5, 5, 5, 5, 2, 5, 3, 1, 5, 5, 5, 4, 5, 3, 1, 5, 5, 1, 5, 5, 1, 5, 4, 4, 5, 5, 5, 5, 1, 5, 5, 5, 1, 5, 3, 3, 1, 5, 4, 4, 1, 5, 2, 1, 5, 5, 5, 4, 5, 5, 5, 1, 4, 3, 5, 5, 2, 1, 5, 5, 5, 5, 2, 4, 5, 5, 4, 5, 5, 1, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 2, 5, 5, 1, 5, 5, 5, 4, 5, 1, 5, 5, 4, 5, 1, 4, 3, 5, 5, 4, 1, 5, 5, 5, 2, 5, 5, 4, 5, 5, 5, 5, 5, 5, 1, 2, 5, 4, 5, 5, 1, 5, 5, 5, 1, 5, 5, 5, 5, 4, 1, 5, 3, 5, 5, 5, 2, 2, 5, 5, 5, 5, 5, 4, 5, 3, 5, 5, 5, 5, 5, 5, 4, 5, 5, 4, 5, 5, 5, 3, 5, 2, 5, 1, 5, 5, 4, 5, 1, 5, 5, 5, 1, 1, 5, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 1, 3, 5, 5, 3, 1, 5, 5, 5, 5, 4, 5, 5, 1, 1, 5, 1, 5, 4, 5, 4, 5, 4, 5, 4, 5, 1, 4, 5, 5, 5, 4, 2, 5, 5, 5, 2, 3, 5, 5, 5, 5, 5, 5, 5, 4, 2, 3, 2, 5, 4, 4, 5, 5, 1, 5, 5, 5, 4, 5, 5, 4, 5, 2, 5, 