In [220]:
import math
import os
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


# ScraperAPI request parameters for the Shanghai Disneyland review page.
# The API key is read from the SCRAPERAPI_KEY environment variable so the secret
# is not stored in the notebook; it falls back to an empty string when unset.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated.
payload = {
    'api_key': os.environ.get('SCRAPERAPI_KEY', ''),
    'url': 'https://www.tripadvisor.com/Attraction_Review-g308272-d10383031-Reviews-Shanghai_Disneyland-Shanghai.html',
    'autoparse': 'true',
}


In [346]:
def _extract_review_text(block):
    """Return the stripped review text found inside one review block, or None."""
    outer_span = block.find('span', class_='JguWG')
    if outer_span is None:
        return None
    inner_span = outer_span.find('span', class_='yCeTE')
    if inner_span is None:
        return None
    return inner_span.get_text(strip=True)


def _extract_rating_score(block):
    """Return the rating parsed from the block's rating SVG title, or None."""
    rating_svg = block.find('svg', class_='UctUV d H0')
    if rating_svg is None:
        return None
    title = rating_svg.find('title')
    if title is None:
        return None
    try:
        # The score is taken from the first space-separated token of the title.
        return float(title.text.strip().split(" ")[0])
    except ValueError:
        return None


def extract_reviews_and_ratings(url: str, api_key: str, total_reviews: int, starting_page: int = 0,
                                delay_seconds: float = 5) -> pd.DataFrame:
    """Fetch TripAdvisor review pages through ScraperAPI and collect text + rating.

    Args:
        url: URL of the first review page; it must contain ``-Reviews-`` so that
            later pages can be addressed by inserting an ``or<offset>`` segment.
        api_key: ScraperAPI key sent with every request.
        total_reviews: Number of reviews to cover; converted to a page count
            assuming 10 reviews per page.
        starting_page: Zero-based index of the first page to scrape, which lets
            an interrupted run be resumed.
        delay_seconds: Pause after each page request to stay under rate limits.

    Returns:
        A DataFrame with columns ``review`` and ``rating``, one row per review
        block that had both a review text and a parseable rating. The frame is
        empty (but keeps both columns) when the pre-flight request fails or
        nothing could be extracted.

    Pages that do not answer with HTTP 200 are reported and skipped.
    """
    scraper_endpoint = 'https://api.scraperapi.com/'
    columns = ['review', 'rating']
    reviews_per_page = 10  # Typical number of reviews per page on TripAdvisor

    payload = {
        'api_key': api_key,
        'url': url,
        'autoparse': 'true'
    }

    # Pre-flight request against the first page: abort early (for example on a
    # bad key) instead of sleeping through every page of a doomed run.
    response = requests.get(scraper_endpoint, params=payload)
    if response.status_code != 200:
        print("Failed to retrieve the first page. Status code:", response.status_code)
        # Always return a single DataFrame so callers can concat the result.
        return pd.DataFrame(columns=columns)

    total_pages = math.ceil(total_reviews / reviews_per_page)
    reviews_data = []

    for page_num in range(starting_page, total_pages):
        if page_num == 0:
            page_url = url  # First page (no offset in the URL)
        else:
            offset = page_num * reviews_per_page
            page_url = url.replace("-Reviews-", f"-Reviews-or{offset}-")

        payload['url'] = page_url
        response = requests.get(scraper_endpoint, params=payload)

        # Pause after every request, successful or not, to avoid rate limits.
        time.sleep(delay_seconds)

        if response.status_code != 200:
            print(f"Failed to retrieve page {page_num + 1}. Status code:", response.status_code)
            continue  # Skip to the next page if the current one fails

        soup = BeautifulSoup(response.content, 'html.parser')

        # Divs with class '_c' are used as the container of each review;
        # presumably generated TripAdvisor class names - verify if markup changes.
        for block in soup.find_all('div', class_='_c'):
            # Both values are computed fresh for every block so that a block
            # without a rating can never reuse the previous block's score.
            review_text = _extract_review_text(block)
            rating_score = _extract_rating_score(block)

            if review_text and rating_score is not None:
                reviews_data.append({
                    'review': review_text,
                    'rating': rating_score
                })

        print(f"Page {page_num + 1}/{total_pages} scraped successfully.")

    return pd.DataFrame(reviews_data, columns=columns)


In [365]:
# Example usage: scrape pages 1-50 (500 reviews at 10 per page) of the
# Shanghai Disneyland listing.
url = "https://www.tripadvisor.com/Attraction_Review-g308272-d10383031-Reviews-Shanghai_Disneyland-Shanghai.html"
# The ScraperAPI key comes from the SCRAPERAPI_KEY environment variable so the
# secret is not stored in the notebook; with the variable unset the key is empty.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated.
api_key = os.environ.get("SCRAPERAPI_KEY", "")
total_reviews = 500
starting_page = 0
reviews_df = extract_reviews_and_ratings(url, api_key, total_reviews, starting_page)

Page 1/50 scraped successfully.
Page 2/50 scraped successfully.
Page 3/50 scraped successfully.
Page 4/50 scraped successfully.
Page 5/50 scraped successfully.
Page 6/50 scraped successfully.
Page 7/50 scraped successfully.
Page 8/50 scraped successfully.
Page 9/50 scraped successfully.
Page 10/50 scraped successfully.
Page 11/50 scraped successfully.
Page 12/50 scraped successfully.
Page 13/50 scraped successfully.
Page 14/50 scraped successfully.
Page 15/50 scraped successfully.
Page 16/50 scraped successfully.
Page 17/50 scraped successfully.
Page 18/50 scraped successfully.
Page 19/50 scraped successfully.
Page 20/50 scraped successfully.
Page 21/50 scraped successfully.
Page 22/50 scraped successfully.
Page 23/50 scraped successfully.
Page 24/50 scraped successfully.
Page 25/50 scraped successfully.
Page 26/50 scraped successfully.
Page 27/50 scraped successfully.
Page 28/50 scraped successfully.
Page 29/50 scraped successfully.
Page 30/50 scraped successfully.
Page 31/50 scraped 

In [371]:
# Second run: with 2330 reviews there are 233 pages; start at page index 50
# (reported as "Page 51") and reuse `url` and `api_key` from the example cell.
total_reviews, starting_page = 2330, 50
reviews_50 = extract_reviews_and_ratings(
    url,
    api_key,
    total_reviews,
    starting_page,
)

Page 51/233 scraped successfully.
Page 52/233 scraped successfully.
Page 53/233 scraped successfully.
Page 54/233 scraped successfully.
Page 55/233 scraped successfully.
Page 56/233 scraped successfully.
Page 57/233 scraped successfully.
Page 58/233 scraped successfully.
Page 59/233 scraped successfully.
Page 60/233 scraped successfully.
Page 61/233 scraped successfully.
Page 62/233 scraped successfully.
Page 63/233 scraped successfully.
Page 64/233 scraped successfully.
Page 65/233 scraped successfully.
Page 66/233 scraped successfully.
Page 67/233 scraped successfully.
Page 68/233 scraped successfully.
Page 69/233 scraped successfully.
Page 70/233 scraped successfully.
Page 71/233 scraped successfully.
Page 72/233 scraped successfully.
Page 73/233 scraped successfully.
Page 74/233 scraped successfully.
Page 75/233 scraped successfully.
Page 76/233 scraped successfully.
Page 77/233 scraped successfully.
Page 78/233 scraped successfully.
Page 79/233 scraped successfully.
Page 80/233 sc

In [377]:
# Stack the two scraping runs row-wise and renumber the rows from 0.
final_reviews = pd.concat([reviews_df, reviews_50], ignore_index=True)

In [393]:
# Write the combined reviews to disk without the row index column.
final_reviews.to_csv(path_or_buf='reviews_df.csv', index=False)