In [23]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [24]:
# Fetch the NLTK corpora this notebook relies on: the stopword lists
# (read through `stopwords.words`) and WordNet (backing `WordNetLemmatizer`).
# The second call is the cell's last expression, so its return value is displayed.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
# Load the raw reviews CSV into a DataFrame.
# NOTE(review): the path is absolute and environment-specific — confirm it before running elsewhere.
df = pd.read_csv('/content/reviews_data.csv')

In [26]:
# Preview the first rows of the raw frame under a heading; one print call,
# newline-separated, produces the same text as two consecutive prints.
print("Original Data Sample:", df.head(), sep="\n")


Original Data Sample:
       name           location                     Date  Rating  \
0     Helen  Wichita Falls, TX  Reviewed Sept. 13, 2023     5.0   
1  Courtney         Apopka, FL   Reviewed July 16, 2023     5.0   
2  Daynelle  Cranberry Twp, PA    Reviewed July 5, 2023     5.0   
3    Taylor        Seattle, WA    Reviewed May 26, 2023     5.0   
4   Tenessa        Gresham, OR   Reviewed Jan. 22, 2023     5.0   

                                              Review  \
0  Amber and LaDonna at the Starbucks on Southwes...   
1  ** at the Starbucks by the fire station on 436...   
2  I just wanted to go out of my way to recognize...   
3  Me and my friend were at Starbucks and my card...   
4  I’m on this kick of drinking 5 cups of warm wa...   

                                         Image_Links  
0                                      ['No Images']  
1                                      ['No Images']  
2  ['https://media.consumeraffairs.com/files/cach...  
3                 

In [27]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(850, 6)

In [28]:
# Summarize each column's non-null count and dtype to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         850 non-null    object 
 1   location     850 non-null    object 
 2   Date         850 non-null    object 
 3   Rating       705 non-null    float64
 4   Review       850 non-null    object 
 5   Image_Links  850 non-null    object 
dtypes: float64(1), object(5)
memory usage: 40.0+ KB


In [29]:
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

In [30]:
def remove_urls(text):
    """Return ``text`` with web links deleted.

    A link is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``; it is removed without leaving a
    placeholder behind.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    without_links = re.sub(url_pattern, '', text)
    return without_links

In [31]:
def remove_html(text):
    """Return ``text`` with ``<...>`` tag-like spans replaced by a space.

    Each shortest ``<`` ... ``>`` span on a single line is treated as a tag.
    A space (rather than nothing) is substituted so that words separated
    only by a tag, e.g. ``great<br>coffee``, are not fused into one token.
    Surplus whitespace is harmless to callers that split on whitespace.
    """
    return re.sub(r'<.*?>', ' ', text)

In [32]:
def remove_special(text):
    """Return ``text`` with every non-alphanumeric, non-whitespace character replaced by a space.

    Accented letters are first folded to their base ASCII letter so that
    words such as ``café`` become ``cafe`` instead of being cut apart
    (``caf`` + space) by the ASCII-only character class below. Pure-ASCII
    input is unaffected by the folding step.
    """
    # Local import keeps this cell self-contained.
    import unicodedata

    # NFKD splits "é" into "e" + a combining accent; dropping the combining
    # marks leaves the plain base letter in place.
    decomposed = unicodedata.normalize('NFKD', text)
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r'[^a-zA-Z0-9\s]', ' ', folded)

In [33]:
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted in its place)."""
    digit_run = r'\d+'
    digits_removed = re.sub(digit_run, '', text)
    return digits_removed

In [34]:
def tokenize(text):
    """Split ``text`` into a list of whitespace-delimited tokens."""
    # No separator argument: runs of whitespace act as one delimiter and
    # leading/trailing whitespace yields no empty tokens.
    tokens = text.split()
    return tokens

In [35]:
# English stopword lookup, built once when the cell runs and reused by every call.
stop_words = set(stopwords.words('english'))


def remove_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order.

    Matching is exact (case-sensitive), so tokens are expected to be
    normalized the same way as the stopword list before this is called.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [36]:
# Single shared lemmatizer instance, created once when the cell runs.
lemmatizer = WordNetLemmatizer()


def lemmatize(tokens):
    """Return a list with each token passed through ``lemmatizer.lemmatize``.

    The lemmatizer is called with its default arguments only.
    """
    return list(map(lemmatizer.lemmatize, tokens))

In [37]:
def clean_text(text):
    """Run the full cleaning pipeline on one review and return the cleaned string.

    Steps, in order: lowercase, strip URLs, strip HTML-like tags, replace
    special characters, drop digits, tokenize on whitespace, remove
    stopwords, lemmatize, and re-join the tokens with single spaces.

    Parameters
    ----------
    text : object
        The raw review. Non-string values are converted with ``str``.
        Missing values (``None``, ``NaN``, ``pd.NA``) produce ``''``.

    Returns
    -------
    str
        Space-separated cleaned tokens; may be empty.
    """
    # Without this guard a missing value is stringified ("None"/"nan") and
    # survives the pipeline as a bogus one-word review.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    text = lowercase(str(text))
    # URLs go first, while their punctuation is still intact for matching.
    text = remove_urls(text)
    text = remove_html(text)
    text = remove_special(text)
    text = remove_numbers(text)

    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize(tokens)
    return ' '.join(tokens)


In [38]:
# Run the cleaning pipeline on every review and store the result in a new
# column, leaving the original 'Review' text untouched.
df['cleaned_review'] = df['Review'].apply(clean_text)

In [39]:
# Show raw and cleaned text side by side for the first rows; one print call,
# newline-separated, produces the same text as two consecutive prints.
comparison = df[['Review','cleaned_review']].head()
print("\nCleaned Reviews Sample:", comparison, sep="\n")


Cleaned Reviews Sample:
                                              Review  \
0  Amber and LaDonna at the Starbucks on Southwes...   
1  ** at the Starbucks by the fire station on 436...   
2  I just wanted to go out of my way to recognize...   
3  Me and my friend were at Starbucks and my card...   
4  I’m on this kick of drinking 5 cups of warm wa...   

                                      cleaned_review  
0  amber ladonna starbucks southwest parkway alwa...  
1  starbucks fire station altamonte spring fl mad...  
2  wanted go way recognize starbucks employee bil...  
3  friend starbucks card work thankful worker pai...  
4  kick drinking cup warm water work instacart ri...  


In [40]:
# Write the frame, including the new cleaned column, to a CSV without the
# row index, then confirm on stdout.
cleaned_csv_name = 'starbucks_reviews_cleaned.csv'
df.to_csv(cleaned_csv_name, index=False)
print("\nSaved cleaned file.")


Saved cleaned file.
