# Natural Language Processing - Sentiment analysis of reviews

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are read as ordinary text.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)

In [3]:
# Show the loaded DataFrame as this cell's output.
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


## Cleaning the texts

- This code snippet performs the text preprocessing: replacing non-alphabetic characters with spaces, converting text to lowercase, tokenizing into individual words, removing stopwords, and stemming.

In [4]:
import re  # regular expressions, used to strip non-letter characters
import nltk  # Natural Language Toolkit
nltk.download('stopwords')  # make sure the stopword corpus is available locally
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Words that carry sentiment and therefore must survive stopword filtering.
# NOTE(review): the regex below replaces the apostrophe with a space, so
# "won't" reaches the filter as "won" and "t" and never matches this entry --
# confirm whether those tokens should be kept as well.
sentiment_words = {'not', 'no', 'but', "won't"}

# Loop-invariant setup: one stemmer and one stopword collection for all reviews.
# Filtering (instead of list.remove) does not fail if a word is missing from
# the installed stopword list.
ps = PorterStemmer()
all_stopwords = [word for word in stopwords.words('english') if word not in sentiment_words]
stopword_set = set(all_stopwords)  # set gives constant-time membership tests

corpus = []  # one cleaned, space-joined string per review, in dataset order
# Iterate over every row actually present rather than a hard-coded count, so
# the corpus always lines up with the label column.
for raw_review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # anything that is not a letter becomes a space
    review = review.lower().split()  # lowercase, then tokenize on whitespace
    review = [ps.stem(word) for word in review if word not in stopword_set]  # drop stopwords, stem the rest
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pritigupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Print the whole list of cleaned review strings to inspect the preprocessing result.
print(corpus)

['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would not go back', 'cashier no care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock no sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place not worth time let alon vega', 'not like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid cou

## Creating the Bag of Words model

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer whose vocabulary is capped at 1500 features.
cv = CountVectorizer(max_features=1500)

# Learn the vocabulary from the cleaned reviews, then encode those same
# reviews as a dense matrix of token counts.
cv.fit(corpus)
X = cv.transform(corpus).toarray()

# The target variable is the last column of the dataset.
y = dataset.iloc[:, -1].values


In [6]:
import joblib
# Persist the fitted vectorizer so the same vocabulary can be reloaded later
# when making predictions from the saved model.
joblib.dump(cv, 'vectorizer_reviews.joblib')

['vectorizer_reviews.joblib']

## Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

## Training different classification models on the Training set

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Candidate classifiers as (display name, estimator) pairs.
# The random forest is seeded so that repeated runs report the same scores.
models = [
    ('Logistic Regression', LogisticRegression(C=1.)),
    ('Naive Bayes', MultinomialNB()),
    ('Support Vector Machine', SVC(C=1., kernel='rbf')),
    ('Random Forest', RandomForestClassifier(random_state=0)),
]

# Hold-out check: fit each candidate on the training split and report its
# accuracy on the test split.
for model_name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model_name} accuracy: {accuracy}')

# Model selection: compare every candidate by its mean 10-fold cross-validation
# accuracy on the training split and keep the name of the winner.
# best_model holds a name (string) throughout; ties keep the earlier candidate.
best_model = models[0][0]
best_accuracy = 0

for model_name, model in models:
    accuracy_scores = cross_val_score(model, X_train, y_train, cv=10)
    mean_accuracy = np.mean(accuracy_scores)
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_model = model_name

print(f'Best model: {best_model}')
print(f'Accuracy with k-fold cross-validation: {best_accuracy}')


Logistic Regression accuracy: 0.78
Naive Bayes accuracy: 0.8
Support Vector Machine accuracy: 0.775
Random Forest accuracy: 0.77
Best model: Logistic Regression
Accuracy with k-fold cross-validation: 0.80375


In [10]:
# Logistic regression had the highest mean 10-fold cross-validation accuracy in
# the comparison above, so a fresh instance is fitted on the training split.

classifier = LogisticRegression(C = 1.0)
classifier.fit(X_train, y_train)

## Predicting if a single review is positive or negative



### Positive review case

Use our model to predict if the following review:

"Had a lovely time."

is positive or negative.

In [11]:
new_review = 'Had a lovely time'

# Clean the text exactly as the training corpus was cleaned: letters only,
# lowercase, whitespace tokens, stopwords removed, remaining words stemmed.
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower().split()
ps = PorterStemmer()
# Keep the same sentiment-bearing words that the training corpus kept; if the
# stopword lists differ, words such as "no" or "but" vanish at prediction time.
kept_words = {'not', 'no', 'but', "won't"}
all_stopwords = [word for word in stopwords.words('english') if word not in kept_words]
stopword_set = set(all_stopwords)
new_review = [ps.stem(word) for word in new_review if word not in stopword_set]
new_review = ' '.join(new_review)

# Encode with the already-fitted vectorizer (transform only, never re-fit).
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)

# predict returns one label per input row; read the single entry explicitly.
if new_y_pred[0] == 0:
    print('negative sentiment')
else:
    print('positive sentiment')

positive sentiment


### Negative review case

Use our model to predict if the following review:

"The food was stale"

is positive or negative.

In [12]:
new_review = 'The food was stale'

# Clean the text exactly as the training corpus was cleaned: letters only,
# lowercase, whitespace tokens, stopwords removed, remaining words stemmed.
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower().split()
ps = PorterStemmer()
# Keep the same sentiment-bearing words that the training corpus kept; if the
# stopword lists differ, words such as "no" or "but" vanish at prediction time.
kept_words = {'not', 'no', 'but', "won't"}
all_stopwords = [word for word in stopwords.words('english') if word not in kept_words]
stopword_set = set(all_stopwords)
new_review = [ps.stem(word) for word in new_review if word not in stopword_set]
new_review = ' '.join(new_review)

# Encode with the already-fitted vectorizer (transform only, never re-fit).
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)

# predict returns one label per input row; read the single entry explicitly.
if new_y_pred[0] == 0:
    print('negative sentiment')
else:
    print('positive sentiment')

negative sentiment


## Saving the model and then loading it back to make a prediction

In [13]:
import joblib

# Serialize the fitted logistic-regression classifier to disk. joblib.dump
# returns the list of file names it wrote, which the cell displays.
joblib.dump(classifier, 'logistic_regression_NLPreviews.joblib')

['logistic_regression_NLPreviews.joblib']

In [1]:
# Reload the saved classifier and the saved vectorizer from disk.
# NOTE: joblib.load is pickle-based and can execute arbitrary code, so only
# load files that come from a trusted source.
import joblib
loaded_model = joblib.load('logistic_regression_NLPreviews.joblib')
loaded_vectorizer = joblib.load('vectorizer_reviews.joblib')

In [2]:
# Make predictions using the loaded model
# Imports are local to this cell because it runs after a kernel restart.
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

raw_text = 'The food could have been better.'

# The vectorizer's vocabulary was learned from cleaned, stemmed reviews, so new
# text has to go through the same cleaning before it is vectorized; otherwise
# unstemmed words simply fail to match any vocabulary entry.
kept_words = {'not', 'no', 'but', "won't"}  # same words the training corpus kept
stopword_set = set(stopwords.words('english')) - kept_words
stemmer = PorterStemmer()
tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
cleaned_text = ' '.join(stemmer.stem(token) for token in tokens if token not in stopword_set)

new_data = loaded_vectorizer.transform([cleaned_text])
predictions = loaded_model.predict(new_data)

# predict returns one label per input row; read the single entry explicitly.
if predictions[0] == 0:
    print('negative sentiment')
else:
    print('positive sentiment')


negative sentiment
