In [2]:
# import needed libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource the notebook uses so it runs on a fresh environment:
#   punkt / punkt_tab -> word_tokenize (newer NLTK releases look up punkt_tab;
#                        on releases that do not know it, download() just
#                        reports the package as missing and returns False)
#   wordnet           -> WordNetLemmatizer
#   stopwords         -> stopwords.words('english')
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oviya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oviya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# base url for CoffeeReview website, specifically the reviews
base_url = "https://www.coffeereview.com/reviews/page/{}/"

# number of listing pages to walk through (pages 1..LAST_PAGE inclusive)
LAST_PAGE = 350

# seconds to wait on a single request before giving up, so one stalled
# connection cannot hang the whole scrape
REQUEST_TIMEOUT = 30


def _cell_after(soup, label):
    """Return the stripped text of the <td> that follows the <td> whose text is `label`.

    Raises AttributeError when either cell is missing, because `find` and
    `find_next_sibling` return None in that case.
    """
    return soup.find("td", string=label).find_next_sibling("td").text.strip()


def _parse_review(coffee_soup):
    """Build one dataset row (a dict) from a parsed review page.

    Raises:
        AttributeError: an expected element is not present on the page.
        ValueError: the rating or one of the sub-scores is not an integer.
    """
    name = coffee_soup.find("h1", class_="review-title").text.strip()
    rating = coffee_soup.find("span", class_="review-template-rating").text.strip()

    # the review text is the first <p> after the "Bottom Line" heading
    review_tag = coffee_soup.find("h2", string="Bottom Line")
    review_text = review_tag.find_next_sibling("p").text.strip()

    return {
        "Name": name,
        "Rating": int(rating),
        "Origin": _cell_after(coffee_soup, "Coffee Origin:"),
        "Roast Level": _cell_after(coffee_soup, "Roast Level:"),
        "Aroma": int(_cell_after(coffee_soup, "Aroma:")),
        "Acidity/Structure": int(_cell_after(coffee_soup, "Acidity/Structure:")),
        "Body": int(_cell_after(coffee_soup, "Body:")),
        "Flavor": int(_cell_after(coffee_soup, "Flavor:")),
        "Aftertaste": int(_cell_after(coffee_soup, "Aftertaste:")),
        "Review": review_text,
    }


# list to store scraped data
data = []
# review urls that were fetched but could not be turned into a row
skipped_urls = []

# one session for the whole run so connections to the site are reused
with requests.Session() as session:
    for page in range(1, LAST_PAGE + 1):
        # format the url for each page, then send a GET request to that url
        url = base_url.format(page)
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # network-level failure only; anything else (e.g. KeyboardInterrupt)
            # is allowed to propagate
            print(f"Failed to access page {page}: {exc}")
            continue

        # error checking
        if response.status_code != 200:
            print(f"Failed to access page {page}")
            continue

        # make a BeautifulSoup object that holds all of the parsed HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # find all coffee review links on the page (anchors without an href are ignored)
        coffee_links = [a["href"] for a in soup.select(".review-title a[href]")]

        # loop through each review link
        for coffee_url in coffee_links:
            try:
                coffee_page = session.get(coffee_url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException:
                skipped_urls.append(coffee_url)
                continue

            if coffee_page.status_code != 200:
                skipped_urls.append(coffee_url)
                continue

            coffee_soup = BeautifulSoup(coffee_page.text, "html.parser")

            try:
                data.append(_parse_review(coffee_soup))
            except (AttributeError, ValueError):
                # AttributeError: the page lacks one of the expected fields.
                # ValueError: a score is present but not an integer; without
                # catching it a single odd review would abort the whole scrape
                # before anything is written to disk.
                skipped_urls.append(coffee_url)

        print(f"Finished scraping page {page}, now have {len(data)} rows in dataset")

print(f"Skipped {len(skipped_urls)} reviews that could not be fetched or parsed")

# convert to DataFrame and save to a csv file
df = pd.DataFrame(data)
df.to_csv("coffee_reviews.csv", index=False)
print("Scraping completed, DataFrame created, and saved to coffee_reviews.csv")

Finished scraping page 8, now have 17 rows in dataset
Finished scraping page 9, now have 37 rows in dataset
Finished scraping page 10, now have 55 rows in dataset
Finished scraping page 11, now have 72 rows in dataset
Finished scraping page 12, now have 89 rows in dataset
Finished scraping page 13, now have 108 rows in dataset
Finished scraping page 14, now have 128 rows in dataset
Finished scraping page 15, now have 148 rows in dataset
Finished scraping page 16, now have 166 rows in dataset
Finished scraping page 17, now have 186 rows in dataset
Finished scraping page 18, now have 206 rows in dataset
Finished scraping page 19, now have 224 rows in dataset
Finished scraping page 20, now have 244 rows in dataset
Finished scraping page 21, now have 261 rows in dataset
Finished scraping page 22, now have 278 rows in dataset
Finished scraping page 23, now have 295 rows in dataset
Finished scraping page 24, now have 308 rows in dataset
Finished scraping page 25, now have 324 rows in dataset

In [10]:
# structural overview of the DataFrame ahead of NLP pre-processing:
# dtypes and non-null counts first, then the leading five rows
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3100 entries, 0 to 3099
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Name               3100 non-null   object
 1   Rating             3100 non-null   int64 
 2   Origin             3100 non-null   object
 3   Roast Level        3100 non-null   object
 4   Aroma              3100 non-null   int64 
 5   Acidity/Structure  3100 non-null   int64 
 6   Body               3100 non-null   int64 
 7   Flavor             3100 non-null   int64 
 8   Aftertaste         3100 non-null   int64 
 9   Review             3100 non-null   object
dtypes: int64(6), object(4)
memory usage: 242.3+ KB


Unnamed: 0,Name,Rating,Origin,Roast Level,Aroma,Acidity/Structure,Body,Flavor,Aftertaste,Review
0,El Salvador Finca Plan de Hoyo,93,"Apaneca growing region, El Salvador",Light,9,9,8,9,8,"A balanced, high-toned, cocoa-driven washed El..."
1,Finca Retana,93,"Antigua Department, Guatemala",Medium-Light,9,9,8,9,8,"A deeply floral, richly chocolaty Guatemala cu..."
2,Finca San Ramón,92,"San Juan Sacatepequez, Sacatepequez Department...",Medium-Light,8,9,8,9,8,"A bright, balanced, juicy Guatemala Geisha Mal..."
3,Todos Santos Cuchumatanán Pacamara,92,"Todos Santos Cuchumatán, Huehuetenango Departm...",Medium-Light,8,8,9,9,8,"A sweetly nut-toned, gently floral Guatemala P..."
4,Finca Huixoc Pacamara,91,"La Democracia, Huehuetenango Department, Guate...",Medium-Light,8,8,8,9,8,"A friendly, accessible Guatemala Pacamara, cri..."


In [None]:
# NLP pre-processing for review_text

# define set of stopwords
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Compiled once instead of on every call.
# Punctuation = anything that is neither a word character nor whitespace, plus
# the underscore (which \w would otherwise keep). Because \w is Unicode-aware
# for str patterns, accented letters are preserved while curly quotes and
# dashes are treated as punctuation, unlike the ASCII-only string.punctuation.
_PUNCTUATION_RE = re.compile(r'[^\w\s]|_')
_DIGITS_RE = re.compile(r'\d+')


def cleanReviewText(text):
    """Normalise one review string for downstream NLP.

    Steps: lowercase -> punctuation replaced by spaces -> digits removed ->
    tokenised with `word_tokenize` -> English stopwords dropped -> each token
    lemmatised with `WordNetLemmatizer`.

    Punctuation is replaced by a space rather than deleted so that hyphenated
    words split into their parts ("cocoa-driven" -> "cocoa driven") instead of
    fusing into one token, and so that contractions split into pieces
    ("don't" -> "don t") rather than collapsing into a single fused token.

    Args:
        text: the raw review text. Anything that is not a `str` (for example
            a missing value) yields an empty string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    # make all words lowercase
    text = text.lower()
    # replace punctuation with spaces
    text = _PUNCTUATION_RE.sub(' ', text)
    # remove numbers
    text = _DIGITS_RE.sub('', text)
    # tokenize
    tokens = word_tokenize(text)
    # remove stopwords, then lemmatize what is left
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return " ".join(lemmas)

# run the cleaner over every review and keep the result next to the raw text
df["Cleaned_Review"] = df["Review"].map(cleanReviewText)

In [None]:
# eyeball the first five rows to confirm the Cleaned_Review column looks right
df.head(n=5)

In [None]:
# per-column count of missing values; every entry should be 0
df.isna().sum()

In [None]:
# number of distinct (non-null) values in each column
df.nunique(axis=0, dropna=True)

In [None]:
# visualiz