In [41]:
import pandas as pd
import nltk
import spacy
import numpy as np

In [42]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from gensim.models import Word2Vec

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [44]:
csv_file_path = "Restaurant_Reviews.csv"
df = pd.read_csv(csv_file_path)

In [45]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,Yes
1,Crust is not good.,No
2,Not tasty and the texture was just nasty.,No
3,Stopped by during the late May bank holiday of...,Yes
4,The selection on the menu was great and so wer...,Yes


In [46]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Mina\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [47]:
def tokenize_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    stop_words.discard("not")
    return [word for word in tokens if word.isalnum() and word not in stop_words]

In [48]:
df["Review"] = df["Review"].apply(tokenize_text)
df.head()

Unnamed: 0,Review,Liked
0,"[wow, loved, place]",Yes
1,"[crust, not, good]",No
2,"[not, tasty, texture, nasty]",No
3,"[stopped, late, may, bank, holiday, rick, stev...",Yes
4,"[selection, menu, great, prices]",Yes


In [49]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [50]:
def lemmatize_dataframe(df):
    # Load spaCy model
    nlp = spacy.load("en_core_web_sm")

    # Initialize an empty list to store lemmatized words
    lemmatized_words = []

    # Process each row of the DataFrame
    for row in df["Review"]:
        # Initialize an empty list to store lemmatized words for the current row
        lemmatized_row = []
        # Concatenate all words from the current row into a single sentence
        sentence = ' '.join(row)
        # Process the sentence using spaCy
        doc = nlp(sentence)
        # Iterate over tokens and append lemmatized words to the list
        for token in doc:
            lemmatized_row.append(token.lemma_)
        # Append lemmatized words for the current row to the list of lemmatized words
        lemmatized_words.append(lemmatized_row)

    # Store lemmatized words back into the DataFrame
    df['Review'] = lemmatized_words

In [51]:
lemmatize_dataframe(df)

In [52]:
df.head()

Unnamed: 0,Review,Liked
0,"[wow, love, place]",Yes
1,"[crust, not, good]",No
2,"[not, tasty, texture, nasty]",No
3,"[stop, late, may, bank, holiday, rick, steve, ...",Yes
4,"[selection, menu, great, price]",Yes


In [53]:
# hena hanrag3 el tokens tani eli et3amalaha separate fel preprocessing l string kamel tani
df['Processed_Review'] = df['Review'].apply(lambda x: ' '.join(x))  # x tokens concatenate to string with spaces in between

In [54]:
# Ha3ml initialize lel vectorizer eli bi convert el text documents into a matrix of token counts.
vectorizer = CountVectorizer()

In [55]:
# fit_transform btdrs el vocabulary men df['Processed_Review']
# w bt7awelha le matrix: kol row = document, kol column = kelma. El value = 3dad zhour el kelma fel document.
X_bow = vectorizer.fit_transform(df['Processed_Review'])

In [56]:
# Word Embedding with Word2Vec

# Create a list of token lists for Word2Vec training
token_lists = df['Review'].tolist()

In [57]:
# vector_size = The dimensionality of the word vectors.
# window = Bnaftared 3adad el kalmat eli mawgoda fel sentence 3ashn tb2a dem el context bta3na
# min_count = 2a2al 3adad marra eli kelma mawgoda fel kelma 3ashn t3mlha vector
# workers = 3ashn y3ml parallel processing fel training (faster training time)
model_w2v = Word2Vec(sentences=token_lists, vector_size=100, window=5, min_count=1, workers=4)

In [58]:
# Function 3ashan a3ml average vector lel document bta3t el words eli mawgoda
def document_vector(word_list, model):
    # remove out-of-vocabulary words
    word_list = [word for word in word_list if word in model.wv.index_to_key]
    if len(word_list) == 0:
        # harag3 vector mn 0 3ashn mafish kelma sa7 mawgoda fel model
        return np.zeros(model.vector_size)
    else:
        # harag3 el average vector bta3t el words eli mawgoda fel word_list
        return np.mean(model.wv[word_list], axis=0)


In [59]:
# Apply the function to each row of the DataFrame
df['Document_Vector_filter'] = df['Review'].apply(lambda x: document_vector(x, model_w2v))

In [60]:
# Convert the list of vectors into a 2D array
X_w2v = np.array(df['Document_Vector_filter'].tolist())

In [61]:
# Split data into train and test sets
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X_bow, df['Liked'], test_size=0.2, random_state=48)

In [62]:
# Train SVM classifier with Bag-of-Words features
svm_bow = SVC(kernel='linear')
svm_bow.fit(X_train_bow, y_train)

SVC(kernel='linear')

In [63]:
# Make predictions on test data
y_pred_bow = svm_bow.predict(X_test_bow)
accuracy_bow = accuracy_score(y_test, y_pred_bow)

In [64]:
# el accuracy for the Bag Word
print('\n')
print("Bag Of Words Accuracy: ", accuracy_bow * 100, "%")
print("Bag Of Words Classification Report: ")
print(classification_report(y_test, y_pred_bow))



Bag Of Words Accuracy:  93.69369369369369 %
Bag Of Words Classification Report: 
              precision    recall  f1-score   support

          No       0.92      0.96      0.94       224
         Yes       0.95      0.92      0.94       220

    accuracy                           0.94       444
   macro avg       0.94      0.94      0.94       444
weighted avg       0.94      0.94      0.94       444



In [65]:
# Apply the SVM  on W2V "Word 2 vectors "

# Train Logistic Regression classifier with Word2Vec embeddings
logreg_w2v = LogisticRegression(max_iter=500)
logreg_w2v.fit(X_train_bow, y_train)

y_pred_logreg_w2v = logreg_w2v.predict(X_test_bow)
accuracy_logreg_w2v = accuracy_score(y_test, y_pred_logreg_w2v)

In [66]:
print('\n')
print("Logistic Regression with Word2Vec Accuracy: ", accuracy_logreg_w2v * 100, "%")
print("Logistic Regression with Word2Vec Classification Report: ")
print(classification_report(y_test, y_pred_logreg_w2v))



Logistic Regression with Word2Vec Accuracy:  91.66666666666666 %
Logistic Regression with Word2Vec Classification Report: 
              precision    recall  f1-score   support

          No       0.90      0.94      0.92       224
         Yes       0.93      0.90      0.91       220

    accuracy                           0.92       444
   macro avg       0.92      0.92      0.92       444
weighted avg       0.92      0.92      0.92       444



In [67]:
# initialize NLTK sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

In [68]:
def get_sentiment(text):

    scores = analyzer.polarity_scores(text)

    sentiment = 1 if scores['pos'] > 0 else 0

    return sentiment

In [69]:
# apply get_sentiment function
df['scores'] = df['Processed_Review'].apply(analyzer.polarity_scores)
df['sentiment'] = df['Processed_Review'].apply(get_sentiment)
df['Liked'] = df['Liked'].map({'Yes': 1, 'No': 0})

In [70]:
print('\n')
print("Confusion Matirx : ")
print(confusion_matrix(df['Liked'], df['sentiment']))



Confusion Matirx : 
[[906 213]
 [245 856]]


In [71]:
print('\n')
print("Classification Report : ")
print(classification_report(df['Liked'], df['sentiment']))



Classification Report : 
              precision    recall  f1-score   support

           0       0.79      0.81      0.80      1119
           1       0.80      0.78      0.79      1101

    accuracy                           0.79      2220
   macro avg       0.79      0.79      0.79      2220
weighted avg       0.79      0.79      0.79      2220

