In [15]:
import pandas as pd

df = pd.read_csv("yelp.csv")
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [16]:
#only preservve the stars and text columns
df = df[['stars', 'text']]
df.head()

Unnamed: 0,stars,text
0,5,My wife took me here on my birthday for breakf...
1,5,I have no idea why some people give bad review...
2,4,love the gyro plate. Rice is so good and I als...
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
4,5,General Manager Scott Petello is a good egg!!!...


In [17]:
#switch the stars to binary (1 for positive, 0 for negative sentiment)
df['stars'] = df['stars'].apply(lambda x: 1 if x >= 4 else 0)
df.head()

Unnamed: 0,stars,text
0,1,My wife took me here on my birthday for breakf...
1,1,I have no idea why some people give bad review...
2,1,love the gyro plate. Rice is so good and I als...
3,1,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
4,1,General Manager Scott Petello is a good egg!!!...


In [18]:
#use separator to split the text into words
df['text'] = df['text'].str.split()
df.head()

Unnamed: 0,stars,text
0,1,"[My, wife, took, me, here, on, my, birthday, f..."
1,1,"[I, have, no, idea, why, some, people, give, b..."
2,1,"[love, the, gyro, plate., Rice, is, so, good, ..."
3,1,"[Rosie,, Dakota,, and, I, LOVE, Chaparral, Dog..."
4,1,"[General, Manager, Scott, Petello, is, a, good..."


In [19]:
#remove stop words in the text
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])
df.head()

Unnamed: 0,stars,text
0,1,"[My, wife, took, birthday, breakfast, excellen..."
1,1,"[I, idea, people, give, bad, reviews, place., ..."
2,1,"[love, gyro, plate., Rice, good, I, also, dig,..."
3,1,"[Rosie,, Dakota,, I, LOVE, Chaparral, Dog, Par..."
4,1,"[General, Manager, Scott, Petello, good, egg!!..."


In [23]:
# Ensure all rows in 'text' are strings
df['text'] = df['text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
df.head()

Unnamed: 0,stars,text
0,1,My wife took birthday breakfast excellent. The...
1,1,I idea people give bad reviews place. It goes ...
2,1,love gyro plate. Rice good I also dig candy se...
3,1,"Rosie, Dakota, I LOVE Chaparral Dog Park!!! It..."
4,1,General Manager Scott Petello good egg!!! Not ...


In [24]:
#Text mining preprocessing, converting text into vectors, implement tf-idf (sklearn.feature_extraction.text.TfidfVectorizer)

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

#Apply tf-idf to the "Text" column
tfidf_matrix = tfidf.fit_transform(df['text']).toarray()  # Convert sparse to dense
tfidf_matrix


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
#split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df['stars'], test_size=0.2, random_state=42)
#split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [None]:
#build the CNN model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.layers import Dropout

#add dropout layer to avoid overfitting
model = Sequential()
model.add(Embedding(input_dim=tfidf_matrix.shape[1], output_dim=128, input_length=tfidf_matrix.shape[1]))
model.add(Dropout(0.2))
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

Epoch 1/10

KeyboardInterrupt: 

In [None]:
#show CNN model structure
print(model.summary())

In [None]:
#train the CNN model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

In [None]:
#plot the trainging process for the CNN model and show accuracy and loss
import matplotlib.pyplot as plt
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
#plot the trainging process for the CNN model and show accuracy and loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
#build the LSTM model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=tfidf_matrix.shape[1], output_dim=128, input_length=tfidf_matrix.shape[1]))
model_lstm.add(SpatialDropout1D(0.7))
model_lstm.add(LSTM(64), dropout=0.2, recurrent_dropout=0.2)
model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_lstm.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

In [None]:
#show LSTM model structure
print(model_lstm.summary())

In [None]:
#train the LSTM model
history_lstm = model_lstm.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

In [None]:
#plot the trainging process for the LSTM model and show accuracy and loss
import matplotlib.pyplot as plt
plt.plot(history_lstm.history_lstm['accuracy'])
plt.plot(history_lstm.history_lstm['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
#plot the trainging process for the LSTM model and show accuracy and loss
plt.plot(history_lstm.history_lstm['loss'])
plt.plot(history_lstm.history_lstm['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()