## Sentiment Analysis




In [1]:
!python -m spacy download en_core_web_lg
!pip install vaderSentiment

2024-01-29 00:33:45.814679: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-29 00:33:45.814779: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-29 00:33:45.816583: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-29 00:33:45.828463: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-lg==3.6.0
  Downloading https:

# Data import and Libraries

In [9]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [7]:
import pandas as pd
# Reading the data
data_path = '/content/drive/MyDrive/restaurant_reviews_az.csv'
df = pd.read_csv(data_path)

# Importing the first 10000 rows
df = df.head(10000)

# Displaying a summary of the input data
df.head()


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,Sentiment
0,IVS7do_HBzroiCiymNdxDg,fdFgZQQYQJeEAshH4lxSfQ,sGy67CpJctjeCWClWqonjA,3,1,1,0,"OK, the hype about having Hatch chili in your ...",1/27/2020 22:59,1
1,QP2pSzSqpJTMWOCuUuyXkQ,JBLWSXBTKFvJYYiM-FnCOQ,3w7NRntdQ9h0KwDsksIt5Q,5,1,1,1,Pandemic pit stop to have an ice cream.... onl...,4/19/2020 5:33,1
2,oK0cGYStgDOusZKz9B1qug,2_9fKnXChUjC5xArfF8BLg,OMnPtRGmbY8qH_wIILfYKA,5,1,0,0,I was lucky enough to go to the soft opening a...,2/29/2020 19:43,1
3,E_ABvFCNVLbfOgRg3Pv1KQ,9MExTQ76GSKhxSWnTS901g,V9XlikTxq0My4gE8LULsjw,5,0,0,0,I've gone to claim Jumpers all over the US and...,3/14/2020 21:47,1
4,Rd222CrrnXkXukR2iWj69g,LPxuausjvDN88uPr-Q4cQA,CA5BOxKRDPGJgdUQ8OUOpw,4,1,0,0,"If you haven't been to Maynard's kitchen, it'...",1/17/2020 20:32,1


In [10]:
df.shape

(10000, 10)

# Lexicon based approach along with VaderSentiment

In [12]:
import spacy  # Load the Spacy library
nlp = spacy.load("en_core_web_lg", disable=['parser', 'tagger', 'ner'])

# Define a function for text normalization
def normalize_text(review, convert_to_lowercase, exclude_stopwords):
    if convert_to_lowercase:
        review = review.lower()
    doc = nlp(review)
    lemmatized_tokens = [token.lemma_ for token in doc if not exclude_stopwords or (exclude_stopwords and not token.is_stop)]
    return " ".join(lemmatized_tokens)

# Apply the normalization function to the 'text' column and create a new 'processed' column
df['processed'] = df['text'].apply(normalize_text, convert_to_lowercase=True, exclude_stopwords=True)




In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the SentimentIntensityAnalyzer
sentiment_analyzer = SentimentIntensityAnalyzer()

# Apply the sentiment analysis to the 'text' column and create a new 'sentiment_new' column
df['sentiment_new'] = df['text'].apply(lambda review: sentiment_analyzer.polarity_scores(review))


# Preparing the traning and test sets on I/o 1 data for ML classifications

In [14]:
X=df['text']
y=df['Sentiment']

In [15]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing and Bag of Words Vectorization using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Tokenize using a regular expression pattern
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')

# Create a CountVectorizer with specific settings
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1), tokenizer=tokenizer.tokenize, max_features=1000)

# Transform the training data
X_train_vect = cv.fit_transform(X_train)
print(X_train_vect.shape)

# Transform the testing data
X_test_vect = cv.transform(X_test)
print(X_test_vect.shape)




(8000, 1000)
(2000, 1000)


In [16]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()

In [17]:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(MNB, X_train_vect, y_train, cv=5)

In [19]:
# variables to store term statistics
comments_count = 0
unique_words = set() # Utilizing a set-type variable to track unique words and avoid duplicates
tokens_per_comment_count = [] # Using a list-type variable to compute corpus-level statistics (e.g., average, maximum, minimum, median, etc.)
tokens_per_comment_without_stop_words_count = []
total_tokens_count = 0 # Across the corpus
unique_users = set() # Utilizing a set-type variable to count unique users
dates_list = [] # Tracking the number of comments over time (e.g., by day, week, etc.)
total_vote_count = 0
unique_submissions = set() # Utilizing a set-type variable to count unique submissions

In [20]:
for index, row in df.iterrows():
    text = row["text"]
    doc = nlp(text)
    comments_count += 1

    # Statistics regarding words
    num_of_tokens = len(doc)
    total_tokens_count += num_of_tokens
    token_count_without_stop_words = 0

    for token in doc:
        if token.is_stop is True:
            pass
        else:
            unique_words.add(str(token).lower())
            token_count_without_stop_words += 1

    tokens_per_comment_count.append(num_of_tokens)
    tokens_per_comment_without_stop_words_count.append(token_count_without_stop_words)

    # Statistics regarding users
    user_id = row["user_id"]
    unique_users.add(user_id.lower())

    # # Statistics regarding date
    # date = row["date"]
    # dates_list.append(date)

    # Statistics regarding reviews
    review_id = row["review_id"]
    unique_submissions.add(review_id)




In [21]:
# Describing the Statistics
import numpy as np

# Number of comments
print("Number of comments:", comments_count)

# Number of unique words
print("Number of unique words:", len(unique_words))

# Total number of words in the corpus
print("Total number of words in the corpus:", total_tokens_count)

# Average number of words in comments
print("Average number of words in comments:", np.mean(np.asarray(tokens_per_comment_count)))

# Average number of words in comments without stop words
print("Average number of words in comments without stop words:", np.mean(np.asarray(tokens_per_comment_without_stop_words_count)))

# Maximum number of words in comments
print("Maximum number of words in comments:", np.max(np.asarray(tokens_per_comment_count)))

# Maximum number of words in comments without stop words
print("Maximum number of words in comments without stop words:", np.max(np.asarray(tokens_per_comment_without_stop_words_count)))

# Minimum number of words in comments
print("Minimum number of words in comments:", np.min(np.asarray(tokens_per_comment_count)))

# Minimum number of words in comments without stop words
print("Minimum number of words in comments without stop words:", np.min(np.asarray(tokens_per_comment_without_stop_words_count)))

# Median number of words in comments
print("Median number of words in comments:", np.median(np.asarray(tokens_per_comment_count)))

# Median number of words in comments without stop words
print("Median number of words in comments without stop words:", np.median(np.asarray(tokens_per_comment_without_stop_words_count)))

# Number of unique users
print("Number of unique users:", len(unique_users))

# Number of submissions
print("Number of submissions:", len(unique_submissions))


Number of comments: 10000
Number of unique words: 18907
Total number of words in the corpus: 1011533
Average number of words in comments: 101.1533
Average number of words in comments without stop words: 51.67
Maximum number of words in comments: 1108
Maximum number of words in comments without stop words: 604
Minimum number of words in comments: 4
Minimum number of words in comments without stop words: 4
Median number of words in comments: 72.0
Median number of words in comments without stop words: 37.0
Number of unique users: 6830
Number of submissions: 9875


# Naive Bayes

In [22]:
# Train the classifier and assess its performance on the test set
from sklearn.metrics import classification_report

# Fit the Multinomial Naive Bayes classifier on the training data
MNB.fit(X_train_vect, y_train)

# Predict labels for the test set
predicted_labels = MNB.predict(X_test_vect)

# Evaluate classifier performance and print the classification report
classification_results = classification_report(y_test, predicted_labels, target_names=['0', '1'])
print(classification_results)


              precision    recall  f1-score   support

           0       0.79      0.82      0.81       552
           1       0.93      0.92      0.92      1448

    accuracy                           0.89      2000
   macro avg       0.86      0.87      0.87      2000
weighted avg       0.89      0.89      0.89      2000



# SVM

In [29]:
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train_vect, y_train)



In [31]:
from sklearn import metrics
predicted = clf.predict(X_test_vect)
performance = metrics.classification_report(y_test,predicted, target_names= ['0', '1'])
print(performance)


              precision    recall  f1-score   support

           0       0.84      0.79      0.81       552
           1       0.92      0.94      0.93      1448

    accuracy                           0.90      2000
   macro avg       0.88      0.87      0.87      2000
weighted avg       0.90      0.90      0.90      2000



# TF-IDF

In [32]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
text_review = df['text']  # store all reviews here so that we can fit the vectorizer with this dataset
Y = df['Sentiment']
print(text_review)
print(Y)

0       OK, the hype about having Hatch chili in your ...
1       Pandemic pit stop to have an ice cream.... onl...
2       I was lucky enough to go to the soft opening a...
3       I've gone to claim Jumpers all over the US and...
4       If you haven't been  to Maynard's kitchen, it'...
                              ...                        
9995    Found this tepanyaki restaurant a few years ag...
9996    This was our first time in Tucson. We decided ...
9997    We had a birthday dinner reservation at 8:00 w...
9998    Took my parents to dinner here for my Dads bir...
9999    Sad place this sbarro at the outlet mall. They...
Name: text, Length: 10000, dtype: object
0       1
1       1
2       1
3       1
4       1
       ..
9995    0
9996    1
9997    0
9998    1
9999    0
Name: Sentiment, Length: 10000, dtype: int64


In [33]:
vectorizer = TfidfVectorizer(max_features=1000) # the value of max_features is dependent on how you design your model
#vectorizer = TfidfVectorizer()
vectorizer.fit(text_review) # fit the vectorizer with the entire dataset
# once we fitted the vectorizer, we use it for converting raw text (review) into vectors
vectorized_text = vectorizer.transform(text_review)

print(vectorized_text.shape)
print(type(vectorized_text))
print(len(Y))
print(len(text_review))

(10000, 1000)
<class 'scipy.sparse._csr.csr_matrix'>
10000
10000


In [34]:
X_train_vect, Y_train = vectorized_text.toarray(), np.asarray(Y) # convert both input (vectorized_review) and output (y) into numpy variables
print(Y_train)

[1 1 1 ... 0 1 0]


#Logistic Regressions

In [35]:
#Training the model
from sklearn.linear_model import LogisticRegression
LG = LogisticRegression()
print(X_train_vect.shape)
LG.fit(X_train_vect, Y_train)

(10000, 1000)


In [36]:
predicted = LG.predict(X_test_vect)
performance = metrics.classification_report(y_test,predicted, target_names= ['0', '1'])
print(performance)

              precision    recall  f1-score   support

           0       0.37      0.63      0.46       552
           1       0.81      0.58      0.68      1448

    accuracy                           0.60      2000
   macro avg       0.59      0.61      0.57      2000
weighted avg       0.68      0.60      0.62      2000



# Comparison with the VaderSentiment

In [38]:
vader_predicted_labels = []

# Iterate through the test set and predict labels using VADER sentiment analysis
for text in X_test:
    sentiment_scores = sentiment.polarity_scores(text)
    if sentiment_scores['compound'] > 0:
        vader_predicted_labels.append(1)
    else:
        vader_predicted_labels.append(0)

# Assess the classification performance using VADER sentiment analysis and print the report
vader_performance = metrics.classification_report(y_test, vader_predicted_labels, target_names=['0', '1'])
print(vader_performance)


NameError: name 'sentiment' is not defined

In [39]:
v_pr in X_test:
  sent= sentiment.polarity_scores(text)
  if sent['compound']>0:
    v_predicted.append(1)
  else:
    v_predicted.append(0)
v_performance = metrics.classification_report(y_test,v_predicted, target_names= ['0', '1'])
print(v_performance)edicted = []
for text

NameError: name 'sentiment' is not defined

In [40]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Create an instance of SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()


In [41]:
v_predicted = []
for text in X_test:
  sent= sentiment.polarity_scores(text)
  if sent['compound']>0:
    v_predicted.append(1)
  else:
    v_predicted.append(0)
v_performance = metrics.classification_report(y_test,v_predicted, target_names= ['0', '1'])
print(v_performance)

              precision    recall  f1-score   support

           0       0.91      0.53      0.67       552
           1       0.85      0.98      0.91      1448

    accuracy                           0.86      2000
   macro avg       0.88      0.75      0.79      2000
weighted avg       0.86      0.86      0.84      2000



Based on the accuracy scores obtained, it is clear that machine learning models like Naive Bayes and Support Vector Machine (SVM) surpass the lexicon-based sentiment analysis approach in terms of accuracy.

The accuracy score for lexicon-based sentiment analysis was 0.86, while the Naive Bayes model achieved a higher accuracy score of 0.89. The SVM model performed even better with an accuracy score of 0.90. This indicates that these machine learning models excel in capturing the subtleties of language and context, resulting in more precise sentiment predictions.

# Logistic resgression on 2nd input

2nd Input:
(1) "Although the service is satisfactory, the location is challenging to locate. The sanitation is subpar, characterized by outdated facilities. Additionally, the taste of the served food is overwhelmingly fishy, making it challenging for us to consume."
(2) "This restaurant is undoubtedly a favorite of mine and my family. I was particularly impressed during my recent visit. The establishment is well-maintained, and the waiting time for food is under 10 minutes. Moreover, the food itself is incredibly delicious!"
(3) "I value the amiable staff. The food is decent, although not exceptional. The service, while not swift, is somewhat acceptable. It's a dependable choice for a regular meal, but it doesn't offer anything extraordinary."

In [43]:
new_data = ["The service is good, but location is hard to find. Sanitation is not very good with old facilities. Food served tasted extremely fishy, making us difficult to even finish it.",
        "The restaurant is definitely one of my favorites and of my family as well. I was especially impressed with my visit a few days ago. The place is clean, and you just need to wait for fewer than 10 minutes to get food served. And of course, the food is absolutely delicious!",
            "I appreciated the friendly staff. The food was good, not amazing. The service was not prompt but almost acceptable. A reliable spot for a regular meal, but nothing extraordinary."]
vectorized_new_data = vectorizer.transform(new_data)
new_x = vectorized_new_data.toarray()

# Get the probability estimates for each class
predict_scores = LG.predict_proba(new_x)
print("Prediction Scores:\n", predict_scores)

Prediction Scores:
 [[0.73202418 0.26797582]
 [0.04818284 0.95181716]
 [0.46949513 0.53050487]]


# Classification of reviews

In [44]:
def classify_sentiments(prediction_scores, pos_threshold, neg_threshold):
    sentiment_labels = []
    for score in prediction_scores:
        if score[1] > pos_threshold:
            sentiment_labels.append("Positive")
        elif score[1] < neg_threshold:
            sentiment_labels.append("Negative")
        else:
            sentiment_labels.append("Neutral")
    return sentiment_labels

# Thresholds for sentiment classification
positive_threshold = 0.7
negative_threshold = 0.3

# Classify sentiments using the defined thresholds
predicted_sentiments = classify_sentiments(predict_scores, positive_threshold, negative_threshold)

# Display the classified sentiments
for i, sentiment_label in enumerate(predicted_sentiments):
    print("Review", i+1, ":", sentiment_label)


Review 1 : Negative
Review 2 : Positive
Review 3 : Neutral


If the likelihood of positive sentiment exceeds the positive threshold, the review will be categorized as positive. Conversely, if the likelihood of negative sentiment falls below the negative threshold, the review will be labeled as negative. Otherwise, it will be designated as neutral.

I took ChatGPT's help to resolve the errors

In [45]:
pip install nbconvert

