In [74]:
import numpy as np
import pandas as pd

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
df=pd.read_csv('StarbucksCustomerReviewsData.csv')

In [76]:
df.head()

Unnamed: 0,name,location,Date,Rating,Review,Image_Links
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...,['No Images']
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...,['No Images']
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...,['https://media.consumeraffairs.com/files/cach...
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...,['No Images']
4,Tenessa,"Gresham, OR","Reviewed Jan. 22, 2023",5.0,I’m on this kick of drinking 5 cups of warm wa...,['https://media.consumeraffairs.com/files/cach...


In [77]:
df=df[['Review','Rating']]
df.head()

Unnamed: 0,Review,Rating
0,Amber and LaDonna at the Starbucks on Southwes...,5.0
1,** at the Starbucks by the fire station on 436...,5.0
2,I just wanted to go out of my way to recognize...,5.0
3,Me and my friend were at Starbucks and my card...,5.0
4,I’m on this kick of drinking 5 cups of warm wa...,5.0


In [78]:
df.shape

(850, 2)

In [79]:
df.isnull().sum()

Review      0
Rating    145
dtype: int64

In [80]:
df['Rating'].value_counts()

Rating
1.0    451
2.0     99
5.0     83
4.0     39
3.0     33
Name: count, dtype: int64

In [81]:
df['Rating']=df['Rating'].fillna(df['Rating'].mode()[0])

In [82]:
df['Rating'].value_counts()

Rating
1.0    596
2.0     99
5.0     83
4.0     39
3.0     33
Name: count, dtype: int64

In [83]:
# Positive Review 1 and Negative Review is 0
df['Rating']=df['Rating'].apply(lambda x:0 if x==1 else 1)
df['Rating'].value_counts()

Rating
0    596
1    254
Name: count, dtype: int64

In [84]:
def clean_text_column(df, text_column):
    """
    Cleans and preprocesses a text column in a Pandas DataFrame in the following order:
    1. Convert text to lowercase.
    2. Remove URLs.
    3. Remove HTML tags.
    4. Remove numbers.
    5. Remove special characters.
    6. Remove stopwords.
    7. Remove extra spaces.

    Parameters:
    df (pd.DataFrame): DataFrame containing the text data.
    text_column (str): The name of the column containing the text to clean.

    Returns:
    pd.DataFrame: DataFrame with the cleaned text column.
    """

    # 1. Convert text to lowercase
    df[text_column] = df[text_column].apply(lambda x: str(x).lower())

    # 2. Remove URLs
    df[text_column] = df[text_column].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://[\w_-]+(?:/[\w./-]*)?', '', x))

    # 3. Remove HTML tags (convert to string if necessary to avoid warnings)
    df[text_column] = df[text_column].apply(lambda x: BeautifulSoup(str(x), 'html.parser').get_text())

    # 4. Remove numbers
    df[text_column] = df[text_column].apply(lambda x: re.sub(r'\d+', '', x))

    # 5. Remove special characters (keeps alphabets, numbers, and hyphens)
    df[text_column] = df[text_column].apply(lambda x: re.sub('[^a-zA-Z0-9-]+', ' ', x))

    # 6. Remove stopwords
    stop_words = set(stopwords.words('english'))
    df[text_column] = df[text_column].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))

    # 7. Remove extra spaces
    df[text_column] = df[text_column].apply(lambda x: " ".join(x.split()))

    return df

In [85]:
clean_text_column(df,'Review')
df.head()

  df[text_column] = df[text_column].apply(lambda x: BeautifulSoup(str(x), 'html.parser').get_text())


Unnamed: 0,Review,Rating
0,amber ladonna starbucks southwest parkway alwa...,1
1,starbucks fire station altamonte springs fl ma...,1
2,wanted go way recognize starbucks employee bil...,1
3,friend starbucks card work thankful worker pai...,1
4,kick drinking cups warm water work instacart r...,1


In [86]:
lemmatizer=WordNetLemmatizer()

In [87]:
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [88]:
df['Review']=df['Review'].apply(lambda x:lemmatize_words(x))

In [89]:
df.head()

Unnamed: 0,Review,Rating
0,amber ladonna starbucks southwest parkway alwa...,1
1,starbucks fire station altamonte spring fl mad...,1
2,wanted go way recognize starbucks employee bil...,1
3,friend starbucks card work thankful worker pai...,1
4,kick drinking cup warm water work instacart ri...,1


In [90]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df['Review'],df['Rating'],
                                              test_size=0.20)

In [91]:
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer()
X_train_bow=bow.fit_transform(X_train).toarray()
X_test_bow=bow.transform(X_test).toarray()

In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
X_train_tfidf=tfidf.fit_transform(X_train).toarray()
X_test_tfidf=tfidf.transform(X_test).toarray()

In [93]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [94]:
from sklearn.naive_bayes import GaussianNB
nb_model_bow=GaussianNB().fit(X_train_bow,y_train)
nb_model_tfidf=GaussianNB().fit(X_train_tfidf,y_train)

In [95]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [96]:
y_pred_bow=nb_model_bow.predict(X_test_bow)

In [97]:
y_pred_tfidf=nb_model_bow.predict(X_test_tfidf)

In [98]:
confusion_matrix(y_test,y_pred_bow)

array([[97, 18],
       [37, 18]], dtype=int64)

In [99]:
print("BOW accuracy: ",accuracy_score(y_test,y_pred_bow))

BOW accuracy:  0.6764705882352942


In [100]:
confusion_matrix(y_test,y_pred_tfidf)

array([[96, 19],
       [39, 16]], dtype=int64)

In [101]:
print("TFIDF accuracy: ",accuracy_score(y_test,y_pred_tfidf))

TFIDF accuracy:  0.6588235294117647
