Name: Vamshi Doddi
SRN: PES1PG23CA042

In [1]:
import pandas as pd

# Lexicon-Based and Machine Learning-Based Sentiment Analysis

<h3>Preprocessing and Lexicon-Based Sentiment Analysis</h3>

In [23]:
# 1.	Load the restaurant review dataset and explore the data:
df = pd.read_csv("C:/Users/vamsh/Downloads/a1_RestaurantReviews_HistoricDump.tsv", delimiter = "\t")

In [24]:
df.head(), df.shape

(                                              Review  Liked
 0                           Wow... Loved this place.      1
 1                                 Crust is not good.      0
 2          Not tasty and the texture was just nasty.      0
 3  Stopped by during the late May bank holiday of...      1
 4  The selection on the menu was great and so wer...      1,
 (900, 2))

In [25]:
# 	Count the number of reviews for each sentiment category (0 for negative and 1 for positive).
print("Reviews: ", df['Liked'].value_counts())

Reviews:  Liked
1    496
0    404
Name: count, dtype: int64


In [26]:
# 	Check for missing values in the dataset and handle them appropriately. 
df.isna().sum()

Review    0
Liked     0
dtype: int64

In [27]:
# No null values so no treatment needed 

In [28]:
# 2.	Implement a preprocessing function to clean the restaurant reviews:
# 	Remove stop words and punctuation

In [29]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vamsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
stop_words = set(stopwords.words('english'))
import string
punctuations = string.punctuation

In [34]:
import re
def clean_text(text):
    data = re.sub(f"[{re.escape(punctuations)}]", "", text) #removing punctuations
    words = text.split()
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)
    
    

In [35]:
df['cleaned_review'] = df['Review'].apply(clean_text)

In [36]:
df.head()

Unnamed: 0,Review,Liked,cleaned_review
0,Wow... Loved this place.,1,wow... loved place.
1,Crust is not good.,0,crust good.
2,Not tasty and the texture was just nasty.,0,tasty texture nasty.
3,Stopped by during the late May bank holiday of...,1,stopped late may bank holiday rick steve recom...
4,The selection on the menu was great and so wer...,1,selection menu great prices.


In [37]:
# 	Apply lemmatization to the text.Use this function to preprocess the reviews and store the cleaned text in a new column 
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vamsh\AppData\Roaming\nltk_data...


In [42]:
def apply_lemms(text):
    words = text.split()
    lemmafied_words = [lemmatizer.lemmatize(word) for word in words ]
    return " ".join(lemmafied_words)

In [43]:
df["lemmatized_reviews"] = df["cleaned_review"].apply(apply_lemms)

In [44]:
df.head()

Unnamed: 0,Review,Liked,cleaned_review,lemmatized_reviews
0,Wow... Loved this place.,1,wow... loved place.,wow... loved place.
1,Crust is not good.,0,crust good.,crust good.
2,Not tasty and the texture was just nasty.,0,tasty texture nasty.,tasty texture nasty.
3,Stopped by during the late May bank holiday of...,1,stopped late may bank holiday rick steve recom...,stopped late may bank holiday rick steve recom...
4,The selection on the menu was great and so wer...,1,selection menu great prices.,selection menu great prices.


In [45]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
# Initialize VADER SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Step 3: Compute sentiment scores and add them as new columns
def get_sentiment_scores(text):
    scores = sia.polarity_scores(text)
    return scores['neg'], scores['neu'], scores['pos'], scores['compound']

df[['negative_score', 'neutral_score', 'positive_score', 'compound_score']] = df['Review'].apply(get_sentiment_scores).apply(pd.Series)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\vamsh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [46]:
# Step 4: Classify reviews into sentiment categories based on compound score
def classify_sentiment(compound):
    if compound >= 0.7:
        return "Very Positive"
    elif 0.3 <= compound < 0.7:
        return "Positive"
    elif -0.3 < compound < 0.3:
        return "Neutral"
    elif -0.7 <= compound <= -0.3:
        return "Negative"
    else:
        return "Very Negative"


In [47]:

df['sentiment_category'] = df['compound_score'].apply(classify_sentiment)


In [48]:
# Step 5: Calculate the percentage of reviews in each sentiment category
sentiment_percentage = df['sentiment_category'].value_counts(normalize=True) * 100


In [49]:
# Step 6: Compare sentiment for a custom review
custom_review = "The service was terrible, but the food was amazing!"
custom_scores = sia.polarity_scores(custom_review)
custom_sentiment_category = classify_sentiment(custom_scores['compound'])


In [50]:
# Display the DataFrame, sentiment percentage, and custom review results
print("DataFrame with Sentiment Scores and Categories:")
print(df)
print("\nPercentage of Reviews in Each Sentiment Category:")
print(sentiment_percentage)
print(f"\nCustom Review Analysis:\nReview: {custom_review}\nScores: {custom_scores}\nCategory: {custom_sentiment_category}")

DataFrame with Sentiment Scores and Categories:
                                                Review  Liked  \
0                             Wow... Loved this place.      1   
1                                   Crust is not good.      0   
2            Not tasty and the texture was just nasty.      0   
3    Stopped by during the late May bank holiday of...      1   
4    The selection on the menu was great and so wer...      1   
..                                                 ...    ...   
895  I want to first say our server was great and w...      1   
896                     The pizza selections are good.      1   
897              I had strawberry tea, which was good.      1   
898  Highly unprofessional and rude to a loyal patron!      0   
899                       Overall, a great experience.      1   

                                        cleaned_review  \
0                                  wow... loved place.   
1                                          crust good. 

<h3>Machine Learning-Based Sentiment Analysis</h3>

In [52]:
import spacy
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [53]:
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

In [65]:
# Step 1: Data Loading and Exploration
# Load the dataset
file_path = "C:/Users/vamsh/Downloads/a1_RestaurantReviews_HistoricDump.tsv"
df1 = pd.read_csv(file_path, delimiter = "\t" )

# Display the first five rows
print("First 5 rows of the dataset:")
print(df1.head())

First 5 rows of the dataset:
                                              Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1


In [66]:
# Count the number of positive and negative sentiment labels
sentiment_counts = df1['Liked'].value_counts()
print("\nSentiment Counts:")
print(sentiment_counts)


Sentiment Counts:
Liked
1    496
0    404
Name: count, dtype: int64


In [67]:

# Step 2: Handling Missing Values
# Check for missing values
missing_values = df1.isnull().sum()
print("\nMissing Values:")
print(missing_values)



Missing Values:
Review    0
Liked     0
dtype: int64


In [68]:
# Step 3: Text Preprocessing
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation



In [69]:
# Preprocessing function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(f"[{re.escape(punctuation)}]", "", text)
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    # Lemmatization using spaCy
    doc = nlp(text)
    lemmatized_words = [token.lemma_ for token in doc if token.text not in stop_words and not token.is_punct]
    return " ".join(lemmatized_words)

In [70]:
# Apply preprocessing to the Review column
df1['preprocessed'] = df1['Review'].apply(preprocess_text)

In [71]:
# Display the first few rows of the dataset with preprocessed text
print("\nDataset with Preprocessed Text:")
print(df1.head())


Dataset with Preprocessed Text:
                                              Review  Liked  \
0                           Wow... Loved this place.      1   
1                                 Crust is not good.      0   
2          Not tasty and the texture was just nasty.      0   
3  Stopped by during the late May bank holiday of...      1   
4  The selection on the menu was great and so wer...      1   

                                        preprocessed  
0                                     wow love place  
1                                         crust good  
2                                tasty texture nasty  
3  stop late may bank holiday rick steve recommen...  
4                         selection menu great price  


In [82]:
# Step 4: Convert Text to Numerical Feature Vectors using TfidfVectorizer
X = df1['preprocessed']
y = df1['Liked']

# Step 5: Machine Learning Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Convert text to TF-IDF features
    ('svc', SVC(kernel='linear', random_state=42))  # Use SVC with a linear kernel
])

In [83]:

# Step 6: Model Training
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [84]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

In [85]:
# Step 7: Prediction and Evaluation
# Predict sentiment labels on the training dataset
y_train_pred = pipeline.predict(X_train)

In [86]:
# Predict sentiment labels on the testing dataset
y_test_pred = pipeline.predict(X_test)

In [87]:
# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)


In [88]:
# Print accuracy 
print("\nTraining Accuracy:", train_accuracy * 100,"%")
print("Testing Accuracy:", test_accuracy * 100 , "%")


Training Accuracy: 97.22222222222221 %
Testing Accuracy: 77.22222222222223 %


In [89]:
#Pront classification report
print("Classification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.74      0.78      0.76        82
           1       0.81      0.77      0.79        98

    accuracy                           0.77       180
   macro avg       0.77      0.77      0.77       180
weighted avg       0.77      0.77      0.77       180

