### Scraping

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Function to get reviews from a specific page
def get_reviews(page_url, headers):
    response = requests.get(page_url, headers=headers)
    if response.status_code != 200:
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    reviews = soup.find_all('article', class_='ReviewCard')
    return reviews

# Initial URL
url = "https://www.goodreads.com/book/show/123372185-lolita-by-vladimir-nabokov"
user_agent = {'User-agent': 'Mozilla/5.0'} 
headers = {'User-Agent': 'Mozilla/5.0'}

# Initialize the review dictionary
review_dict = {'reviews':[], 'rating': []}

# Simulate pagination
for page_num in range(1, 200):  # Adjust the range based on how many pages you want to scrape
    page_url = f"{url}?page={page_num}"  # Modify this if the pagination parameter is different
    reviews = get_reviews(page_url, headers)
    
    if not reviews:
        break
    
    for review in reviews:
        # Find the review text 
        review_content = review.find('section', class_='ReviewText__content')
        if review_content:
            review_text = review_content.get_text(strip=True)
            review_dict['reviews'].append(review_text)
        else:
            review_dict['reviews'].append(None)
        
        # Find the rating 
        rating_element = review.find('span', class_="RatingStars RatingStars__small")
        if rating_element:
            rating = rating_element.get('aria-label')
            review_dict['rating'].append(rating)
        else:
            review_dict['rating'].append(None)
    
    print(f"Scraped page {page_num}")

# Convert the dictionary to a DataFrame
df = pd.DataFrame(review_dict)
print(df.head())

# Save the DataFrame to a CSV file
df.to_csv('scraped.csv', index=False)


In [None]:
df.shape

(5970, 2)

### Text Preprocessing

In [8]:
# load dataset
df_1 = pd.read_csv("scraped.csv")
df_1.head()

Unnamed: 0,reviews,rating
0,"Between the CoversAfter re-reading ""Lolita"", I...",Rating 5 out of 5
1,Nymph. Nymphet. Nymphetiquette. Nymphology. Ny...,Rating 5 out of 5
2,"Now, this is going to be embarrassing to admit...",Rating 5 out of 5
3,I wasn't even going to write a review ofLolita...,Rating 4 out of 5
4,"when i first read this book, i hated every sec...",Rating 4 out of 5


In [9]:
# Filter out non-English reviews
df_2 = df_1.copy()

from langdetect import detect

def is_eng(text):
    return detect(text) == 'en'

# add col: eng = True if english
df_2['eng'] = df_2['reviews'].apply(is_eng)
# filter out eng = False & remove eng column
df_2 = df_2[df_2['eng']==True].iloc[:, 0:3]

In [10]:
df_2.shape

(5211, 3)

In [11]:
df_2.head()

Unnamed: 0,reviews,rating,eng
0,"Between the CoversAfter re-reading ""Lolita"", I...",Rating 5 out of 5,True
1,Nymph. Nymphet. Nymphetiquette. Nymphology. Ny...,Rating 5 out of 5,True
2,"Now, this is going to be embarrassing to admit...",Rating 5 out of 5,True
3,I wasn't even going to write a review ofLolita...,Rating 4 out of 5,True
4,"when i first read this book, i hated every sec...",Rating 4 out of 5,True


In [12]:
# tokenize & lemmatize text
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# load stopwords and words corpus
stop_words = set(stopwords.words('english'))
english_words = set(words.words())

# initialize  lemmatizer
lemmatizer = WordNetLemmatizer()

# function to clean review text
def clean_review_text(review_text):
    # tokenize reviews into words, remove stop words, and lemmatize
    # the words part removes words that were vectorized wrong
    cleaned_words = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(review_text.lower()) if token not in stop_words and token in english_words]
    
    # reconstruct the cleaned review text
    cleaned_review_text = ' '.join(cleaned_words)
    
    return cleaned_review_text

In [20]:
df_3 = df_2.copy()
# apply function to text
df_3['reviews'] = df_3['reviews'].apply(clean_review_text)

In [21]:
df_3 = df_3.iloc[:, :2]
df_3.shape

(5211, 2)

In [22]:
# Convert ratings var to numerical

ratings_dict = {"Rating 5 out of 5": 5, "Rating 4 out of 5": 4, "Rating 3 out of 5": 3,
                "Rating 2 out of 5": 2, "Rating 1 out of 5": 1}

df_3['rating']=df_3['rating'].replace(ratings_dict)

df_3.head()

Unnamed: 0,reviews,rating
0,local bookseller ever read firmly going either...,5.0
1,nymph nymphet never think year old way stain b...,5.0
2,going embarrassing know reading enjoying book ...,5.0
3,even going write review finishing honestly man...,4.0
4,first read book every second pride reader dist...,4.0


In [23]:
df_3.shape

(5211, 2)

In [29]:
df_3.to_csv("cleaned_goodreads_reviews.csv")

### TF-IDF

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
df = pd.read_csv("cleaned_goodreads_reviews.csv",index_col=0)

df.shape

(5211, 2)

In [27]:
# initialize vectorizer object
vectorizer = TfidfVectorizer()
# transform each review into row of tf-idf'ed features
matrix = vectorizer.fit_transform(df['reviews'])
# extract list of features
features = vectorizer.get_feature_names_out()

# combine in dataframe & create csv
df_tfidf = pd.DataFrame(matrix.toarray(), columns=features)
df_tfidf.to_csv("all_tfidf.csv")
df_tfidf.head()

Unnamed: 0,aback,abandon,ability,abject,able,abnormal,abortion,absolute,absolutely,absorbed,...,yearlong,yes,yet,york,young,younger,youth,youthful,zero,zone
0,0.024681,0.024681,0.0,0.0,0.016804,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.032448,0.02068,0.073342,0.0,0.0,0.024681
1,0.0,0.0,0.0,0.0,0.023182,0.0,0.0,0.028529,0.0,0.0,...,0.0,0.057057,0.079048,0.0,0.044763,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084708,...,0.0,0.0,0.0,0.0,0.111367,0.070977,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
df_tfidf.shape

(5211, 2689)