In [1]:
import pandas as pd
import re
import nltk
import pickle

In [3]:
# Load the restaurant-review dataset. The file is tab-separated, hence sep='\t'.
# The path is relative to the notebook's working directory.
df_train = pd.read_csv('../../datasets/Restaurant_Reviews.tsv', sep='\t')

# Preview the first five rows to confirm the expected columns
# ('Review' text and the 0/1 'Liked' label) were parsed.
df_train.head()


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
# Load NLTK's English stopword list: very common words (e.g. "and", "the", "is")
# that are usually dropped before text classification because they carry little signal.
#
# The list lives in a separately downloaded NLTK corpus. On a machine where it has
# not been installed yet the lookup raises LookupError, so fetch it on demand and
# retry; this keeps the notebook runnable from a fresh environment.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

In [7]:
# Size of the stopword list before any customisation below.
len(stopwords)

179

In [9]:
# Keep 'not' and 'but' as real tokens instead of treating them as stopwords:
#   - 'not' flips sentiment ("not good"),
#   - 'but' signals contrast ("good but expensive").
#
# The list is rebuilt rather than edited with list.remove(), because remove()
# raises ValueError once the word is already gone. Rebuilding makes this cell
# safe to run any number of times.
stopwords = [word for word in stopwords if word not in ('not', 'but')]

In [11]:
# Create the Porter stemmer used by preprocess() to collapse inflected forms
# onto a common stem, so that e.g. "loved" and "love" become the same feature.
stemmer = nltk.stem.PorterStemmer()
# Quick sanity check of the stemmer on a single word.
stemmer.stem('loved')

'love'

In [13]:
def preprocess(r1, stop_words=None, stem=None):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps: lowercase the text, replace every character outside a-z with a
    space, split on whitespace, drop stopwords, and stem what remains.

    Parameters
    ----------
    r1 : str
        Raw review text. Any non-string value (for example None, or the float
        NaN that pandas uses for a missing cell) is treated as an empty review
        instead of raising AttributeError on ``.lower()``.
    stop_words : iterable of str, optional
        Words to discard. Defaults to the notebook-level ``stopwords`` list.
    stem : callable, optional
        Function mapping a token to its stem. Defaults to ``stemmer.stem``.

    Returns
    -------
    str
        The surviving stems joined by single spaces ('' when nothing survives).

    NOTE(review): apostrophes are replaced by spaces, so a contraction such as
    "wasn't" is split into "wasn" and "t" before stopword filtering. Check
    whether the stopword list drops those fragments (and the negation with them).
    """
    if not isinstance(r1, str):
        return ''

    # Fall back to the notebook globals only when the caller supplies nothing,
    # so existing calls such as Series.apply(preprocess) behave as before.
    if stop_words is None:
        stop_words = stopwords
    if stem is None:
        stem = stemmer.stem

    # A set gives constant-time membership tests instead of scanning the
    # stopword list once per token.
    excluded = frozenset(stop_words)

    # Lowercase first so the a-z character class also keeps former capitals.
    letters_only = re.sub('[^a-z]', ' ', r1.lower())

    stems = [stem(token) for token in letters_only.split() if token not in excluded]
    return ' '.join(stems)

In [15]:
# Spot-check the pipeline on the first review of the dataset:
# punctuation is stripped, 'this' is dropped as a stopword and 'loved' is stemmed.
preprocess('Wow... Loved this place.')

'wow love place'

In [17]:
# Run preprocess() over every entry of the 'Review' column.
# The result is a new Series; df_train itself is left untouched.
preprocessed_reviews = df_train['Review'].apply(preprocess)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [21]:
# Bag-of-Words model. ngram_range=(1,2) makes both single words (unigrams) and
# pairs of consecutive words (bigrams) into features, so a phrase such as
# "not good" gets a column of its own.
vectorizer = CountVectorizer(ngram_range=(1,2))  # alternative feature extractor: TfidfVectorizer

# Learn the vocabulary from the cleaned reviews and encode them in one step.
# The result is a sparse matrix: one row per review, one column per unigram/bigram.
bow_table = vectorizer.fit_transform(preprocessed_reviews)

# Materialise the sparse matrix as a dense NumPy array for the steps below.
# Every zero entry is now stored explicitly, so memory grows with rows x columns.
X_train = bow_table.toarray()

In [23]:
# (number_of_reviews, number_of_features): one row per review and
# one column per unigram/bigram in the learned vocabulary.
X_train.shape

(1000, 5755)

In [25]:
# Target labels for supervised training: the 'Liked' column,
# where 1 marks a positive review and 0 a negative one.
y_train = df_train['Liked']

In [29]:
# Logistic regression is the classifier used to predict 'Liked' from the
# Bag-of-Words features.

from sklearn.linear_model import LogisticRegression

# All hyperparameters are left at the library defaults.
rmodel = LogisticRegression()

# Train on the full feature matrix and its labels. No rows are held out here,
# so every later evaluation on X_train is an in-sample measurement.
# fit() is the last expression, so the fitted estimator is displayed as the cell output.
rmodel.fit(X_train, y_train)

In [31]:
# Accuracy on the same data the model was trained on: the fraction of training
# reviews whose predicted label matches y_train.
# This is an in-sample figure; it says how well the model fits the training set,
# not how it will do on unseen reviews.
rmodel.score(X_train, y_train)

0.994

In [33]:
# Classification metrics beyond plain accuracy.
from sklearn.metrics import precision_score, recall_score, f1_score

# Predicted labels for the training reviews (in-sample predictions).
y_pred = rmodel.predict(X_train)

# Precision: of the reviews predicted positive, the share that really are positive.
precision = precision_score(y_train, y_pred)

# Recall: of the reviews that really are positive, the share the model found.
recall = recall_score(y_train, y_pred)

# F1: harmonic mean of precision and recall.
f1 = f1_score(y_train, y_pred)

# Report the three metrics, one per line.
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Precision: 0.9979838709677419
Recall: 0.99
F1 Score: 0.9939759036144579


In [35]:
# Hand-written sample reviews used to try the trained model on new text.
# They mix clearly positive, clearly negative and mixed-sentiment feedback
# about food and service; none of them carries a ground-truth label.
x = [
    'Food was tasty and service was fast...',
    'The food was very delicious! Loved your service too',
    'Very good test.... I love it....',
    'Waiting time is very high',
    'Unhappy with your service…',
    'Wonderful food! Absolutely amazing.',
    'Authentic dishes, cozy ambiance and excellent service make it must visit',
    'pathetic service but Nice food.',
    'The food and service quality was excellent'
]


In [37]:
# Wrap the sample reviews in a Series and clean them with the same preprocess()
# function that was applied to the training reviews.
s1 = pd.Series(x).apply(preprocess)

# Encode the cleaned samples with the vectorizer fitted on the training data.
# transform() (not fit_transform()) reuses the learned vocabulary, so the columns
# line up with X_train; .toarray() then turns the sparse result into a dense array.
X_test = vectorizer.transform(s1).toarray()

# Predicted label per sample review: 1 = liked, 0 = not liked.
rmodel.predict(X_test)

array([1, 1, 1, 0, 0, 1, 1, 0, 1], dtype=int64)

In [39]:
# Persist the fitted artefacts so predictions can be made outside this notebook.
#
# The classifier alone is not enough: its input columns are defined by the
# vocabulary learned by `vectorizer`, so new text must go through the same
# fitted vectorizer. Save it next to the model, in its own file, so that the
# existing 'rmodel.pkl' keeps holding just the classifier.
#
# NOTE: only unpickle these files from a trusted source; loading a pickle can
# execute arbitrary code.
with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

with open('rmodel.pkl', 'wb') as file:
    pickle.dump(rmodel, file)