# Introduction
This project focuses on scraping TripAdvisor reviews of Disneyland locations. The reviews for each location are written to a separate CSV file.

In [16]:
# import libraries to be used
import requests
from bs4 import BeautifulSoup
import re
from csv import writer

## 1. Tokyo Disneyland

In [17]:
# URL of the first page of Tokyo Disneyland reviews on TripAdvisor.
url = "https://www.tripadvisor.com/Attraction_Review-g14134868-d320634-Reviews-Tokyo_Disneyland-Maihama_Urayasu_Chiba_Prefecture_Kanto.html"
# Send a browser-like User-Agent header to prevent the code from stopping
# (https://stackoverflow.com/questions/71937012/python-web-scraper-not-working-for-tripadvisor).
headers = {'User-Agent': 'Mozilla/5.0'}
# requests applies no timeout by default, so set one explicitly to keep a
# stalled connection from hanging the notebook indefinitely.
page = requests.get(url, headers=headers, timeout=30)

In [18]:
# Parse the raw response body with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, features="html.parser")
# Collect every <div> whose CSS class is "_c" (intended to be the review cards).
results = soup.find_all(name="div", attrs={"class": "_c"})

In [19]:
# Regex used to strip digits out of the reviewer-location text.
pattern = r"[0-9]"
review_num = 0
# Write the reviews found on the already-fetched first page (`soup`) to a CSV file.
with open("Disneyland_Tokyo_Reviews_proto.csv", "w", newline = "", encoding = "utf8") as f:
    thewriter = writer(f)
    header = ["Review #", "Rating", "Year/Month", "Reviewer Location", "Review Title", "Review Text"]
    thewriter.writerow(header)
    # One iteration per review card inside the reviews tab.
    for result in soup.select('#tab-data-qa-reviews-0 [data-automation="reviewCard"]'):
        # Each element is looked up once; any field whose element is missing
        # is recorded as "N/A" so that one incomplete card does not abort the run.

        # Keep only the first three characters of the icon's aria-label
        # (presumably the numeric score, e.g. "5.0"); the value stays a string.
        rating_icon = result.select_one("svg[aria-label]")
        rating = rating_icon["aria-label"][0:3] if rating_icon is not None else "N/A"

        # Keep only the first eight characters of the date element's text.
        date_node = result.find("div", class_ = "RpeCd")
        year_month = date_node.text[0:8] if date_node is not None else "N/A"

        # Drop the last 13 characters of the element's text, then remove all
        # digits; what remains is treated as the reviewer's location.
        location_node = result.find("div", class_ = "JINyA")
        if location_node is not None:
            reviewer_location = re.sub(pattern, "", location_node.text[:-13])
        else:
            reviewer_location = "N/A"
        if len(reviewer_location) < 3: # some reviews don't have location
            reviewer_location = "N/A"

        text_node = result.find("div", class_ = "biGQs _P pZUbB KxBGd")
        review_text = text_node.text if text_node is not None else "N/A"
        title_node = result.find("span", class_ = "yCeTE")
        review_title = title_node.text if title_node is not None else "N/A"

        review_num += 1
        each_review = [review_num, rating, year_month, reviewer_location, review_title, review_text]
        thewriter.writerow(each_review)

The CSV file is created and works, but the code only captures the first 10 reviews on the website. I will run the code in a loop 325 times, as there are 3241 written reviews at the time of writing (2022.10.21). The file containing the first 10 reviews is named "Disneyland_Tokyo_Reviews_proto.csv".

### 1.1 Tokyo Disneyland (final)

In [20]:
# Scrape every Tokyo Disneyland review page and write all reviews to one CSV file.
with open("Disneyland_Tokyo_Reviews.csv", "w", newline = "", encoding = "utf8") as f:
    thewriter = writer(f)
    header = ["Review #", "Rating", "Year/Month", "Reviewer Location", "Review Title", "Review Text"]
    thewriter.writerow(header)
    review_num = 0
    # Regex used to strip digits out of the reviewer-location text.
    pattern = r"[0-9]"
    # The request headers never change, so build them once outside the loop.
    headers = {'User-Agent': 'Mozilla/5.0'}
    # The review offset embedded in the URL advances by 10 per page.
    for offset in range(0, 3250, 10):
        # Compare the integer offset before turning it into text: a str never
        # equals 0, which would make the first-page branch unreachable.
        if offset == 0:
            url = "https://www.tripadvisor.com/Attraction_Review-g14134868-d320634-Reviews-Tokyo_Disneyland-Maihama_Urayasu_Chiba_Prefecture_Kanto.html"
        else:
            url = "https://www.tripadvisor.com/Attraction_Review-g14134868-d320634-Reviews-or" + str(offset) + "-Tokyo_Disneyland-Maihama_Urayasu_Chiba_Prefecture_Kanto.html"
        # requests applies no timeout by default; set one so a stalled
        # connection cannot hang the whole scrape.
        page = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")
        # One iteration per review card inside the reviews tab.
        for result in soup.select('#tab-data-qa-reviews-0 [data-automation="reviewCard"]'):
            # Each element is looked up once; any field whose element is missing
            # is recorded as "N/A" so one incomplete card does not abort the run.

            # Keep only the first three characters of the icon's aria-label
            # (presumably the numeric score, e.g. "5.0"); the value stays a string.
            rating_icon = result.select_one("svg[aria-label]")
            rating = rating_icon["aria-label"][0:3] if rating_icon is not None else "N/A"

            # Keep only the first eight characters of the date element's text.
            date_node = result.find("div", class_ = "RpeCd")
            year_month = date_node.text[0:8] if date_node is not None else "N/A"

            # Drop the last 13 characters of the element's text, then remove all
            # digits; what remains is treated as the reviewer's location.
            location_node = result.find("div", class_ = "JINyA")
            if location_node is not None:
                reviewer_location = re.sub(pattern, "", location_node.text[:-13])
            else:
                reviewer_location = "N/A"
            if len(reviewer_location) < 3: # some reviews don't have location
                reviewer_location = "N/A"

            text_node = result.find("div", class_ = "biGQs _P pZUbB KxBGd")
            review_text = text_node.text if text_node is not None else "N/A"
            title_node = result.find("span", class_ = "yCeTE")
            review_title = title_node.text if title_node is not None else "N/A"

            review_num += 1
            each_review = [review_num, rating, year_month, reviewer_location, review_title, review_text]
            thewriter.writerow(each_review)
    

Every review written as of 2022.10.21 is now stored in a CSV file named "Disneyland_Tokyo_Reviews.csv".

## 2. Disneyland Paris

The same process is repeated for the other locations.

Looped 1842 times (2022.10.21)

In [21]:
# Scrape every Disneyland Paris review page and write all reviews to one CSV file.
with open("Disneyland_Paris_Reviews.csv", "w", newline = "", encoding = "utf8") as f:
    thewriter = writer(f)
    header = ["Review #", "Rating", "Year/Month", "Reviewer Location", "Review Title", "Review Text"]
    thewriter.writerow(header)
    review_num = 0
    # Regex used to strip digits out of the reviewer-location text.
    pattern = r"[0-9]"
    # The request headers never change, so build them once outside the loop.
    headers = {'User-Agent': 'Mozilla/5.0'}
    # The review offset embedded in the URL advances by 10 per page.
    for offset in range(0, 18420, 10):
        # Compare the integer offset before turning it into text: a str never
        # equals 0, which would make the first-page branch unreachable.
        if offset == 0:
            url = "https://www.tripadvisor.com/Attraction_Review-g226865-d189258-Reviews-Disneyland_Paris-Marne_la_Vallee_Seine_et_Marne_Ile_de_France.html"
        else:
            url = "https://www.tripadvisor.com/Attraction_Review-g226865-d189258-Reviews-or" + str(offset) + "-Disneyland_Paris-Marne_la_Vallee_Seine_et_Marne_Ile_de_France.html"
        # requests applies no timeout by default; set one so a stalled
        # connection cannot hang the whole scrape.
        page = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")
        # One iteration per review card inside the reviews tab.
        for result in soup.select('#tab-data-qa-reviews-0 [data-automation="reviewCard"]'):
            # Each element is looked up once; any field whose element is missing
            # is recorded as "N/A" so one incomplete card does not abort the run.

            # Keep only the first three characters of the icon's aria-label
            # (presumably the numeric score, e.g. "5.0"); the value stays a string.
            rating_icon = result.select_one("svg[aria-label]")
            rating = rating_icon["aria-label"][0:3] if rating_icon is not None else "N/A"

            # Keep only the first eight characters of the date element's text.
            date_node = result.find("div", class_ = "RpeCd")
            year_month = date_node.text[0:8] if date_node is not None else "N/A"

            # Drop the last 13 characters of the element's text, then remove all
            # digits; what remains is treated as the reviewer's location.
            location_node = result.find("div", class_ = "JINyA")
            if location_node is not None:
                reviewer_location = re.sub(pattern, "", location_node.text[:-13])
            else:
                reviewer_location = "N/A"
            if len(reviewer_location) < 3: # some reviews don't have location
                reviewer_location = "N/A"

            text_node = result.find("div", class_ = "biGQs _P pZUbB KxBGd")
            review_text = text_node.text if text_node is not None else "N/A"
            title_node = result.find("span", class_ = "yCeTE")
            review_title = title_node.text if title_node is not None else "N/A"

            review_num += 1
            each_review = [review_num, rating, year_month, reviewer_location, review_title, review_text]
            thewriter.writerow(each_review)

## 3. Hong Kong Disneyland

Looped 1233 times (2022.10.21)

In [22]:
# Scrape every Hong Kong Disneyland review page and write all reviews to one CSV file.
with open("Disneyland_HongKong_Reviews.csv", "w", newline = "", encoding = "utf8") as f:
    thewriter = writer(f)
    header = ["Review #", "Rating", "Year/Month", "Reviewer Location", "Review Title", "Review Text"]
    thewriter.writerow(header)
    review_num = 0
    # Regex used to strip digits out of the reviewer-location text.
    pattern = r"[0-9]"
    # The request headers never change, so build them once outside the loop.
    headers = {'User-Agent': 'Mozilla/5.0'}
    # The review offset embedded in the URL advances by 10 per page.
    for offset in range(0, 12330, 10):
        # Compare the integer offset before turning it into text: a str never
        # equals 0, which would make the first-page branch unreachable.
        if offset == 0:
            url = "https://www.tripadvisor.com/Attraction_Review-g294217-d543602-Reviews-Hong_Kong_Disneyland-Hong_Kong.html"
        else:
            url = "https://www.tripadvisor.com/Attraction_Review-g294217-d543602-Reviews-or" + str(offset) + "-Hong_Kong_Disneyland-Hong_Kong.html"
        # requests applies no timeout by default; set one so a stalled
        # connection cannot hang the whole scrape.
        page = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")
        # One iteration per review card inside the reviews tab.
        for result in soup.select('#tab-data-qa-reviews-0 [data-automation="reviewCard"]'):
            # Each element is looked up once; any field whose element is missing
            # is recorded as "N/A" so one incomplete card does not abort the run.

            # Keep only the first three characters of the icon's aria-label
            # (presumably the numeric score, e.g. "5.0"); the value stays a string.
            rating_icon = result.select_one("svg[aria-label]")
            rating = rating_icon["aria-label"][0:3] if rating_icon is not None else "N/A"

            # Keep only the first eight characters of the date element's text.
            date_node = result.find("div", class_ = "RpeCd")
            year_month = date_node.text[0:8] if date_node is not None else "N/A"

            # Drop the last 13 characters of the element's text, then remove all
            # digits; what remains is treated as the reviewer's location.
            location_node = result.find("div", class_ = "JINyA")
            if location_node is not None:
                reviewer_location = re.sub(pattern, "", location_node.text[:-13])
            else:
                reviewer_location = "N/A"
            if len(reviewer_location) < 3: # some reviews don't have location
                reviewer_location = "N/A"

            text_node = result.find("div", class_ = "biGQs _P pZUbB KxBGd")
            review_text = text_node.text if text_node is not None else "N/A"
            title_node = result.find("span", class_ = "yCeTE")
            review_title = title_node.text if title_node is not None else "N/A"

            review_num += 1
            each_review = [review_num, rating, year_month, reviewer_location, review_title, review_text]
            thewriter.writerow(each_review)

## 4. Disney California Adventure Park

Looped 1315 times (2022.10.21)

In [23]:
# Scrape every Disney California Adventure Park review page and write all reviews to one CSV file.
with open("Disneyland_California_Adventure_Park_Reviews.csv", "w", newline = "", encoding = "utf8") as f:
    thewriter = writer(f)
    header = ["Review #", "Rating", "Year/Month", "Reviewer Location", "Review Title", "Review Text"]
    thewriter.writerow(header)
    review_num = 0
    # Regex used to strip digits out of the reviewer-location text.
    pattern = r"[0-9]"
    # The request headers never change, so build them once outside the loop.
    headers = {'User-Agent': 'Mozilla/5.0'}
    # The review offset embedded in the URL advances by 10 per page.
    for offset in range(0, 13150, 10):
        # Compare the integer offset before turning it into text: a str never
        # equals 0, which would make the first-page branch unreachable.
        if offset == 0:
            url = "https://www.tripadvisor.com/Attraction_Review-g29092-d186690-Reviews-Disney_California_Adventure_Park-Anaheim_California.html"
        else:
            url = "https://www.tripadvisor.com/Attraction_Review-g29092-d186690-Reviews-or" + str(offset) + "-Disney_California_Adventure_Park-Anaheim_California.html"
        # requests applies no timeout by default; set one so a stalled
        # connection cannot hang the whole scrape.
        page = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")
        # One iteration per review card inside the reviews tab.
        for result in soup.select('#tab-data-qa-reviews-0 [data-automation="reviewCard"]'):
            # Each element is looked up once; any field whose element is missing
            # is recorded as "N/A" so one incomplete card does not abort the run.

            # Keep only the first three characters of the icon's aria-label
            # (presumably the numeric score, e.g. "5.0"); the value stays a string.
            rating_icon = result.select_one("svg[aria-label]")
            rating = rating_icon["aria-label"][0:3] if rating_icon is not None else "N/A"

            # Keep only the first eight characters of the date element's text.
            date_node = result.find("div", class_ = "RpeCd")
            year_month = date_node.text[0:8] if date_node is not None else "N/A"

            # Drop the last 13 characters of the element's text, then remove all
            # digits; what remains is treated as the reviewer's location.
            location_node = result.find("div", class_ = "JINyA")
            if location_node is not None:
                reviewer_location = re.sub(pattern, "", location_node.text[:-13])
            else:
                reviewer_location = "N/A"
            if len(reviewer_location) < 3: # some reviews don't have location
                reviewer_location = "N/A"

            text_node = result.find("div", class_ = "biGQs _P pZUbB KxBGd")
            review_text = text_node.text if text_node is not None else "N/A"
            title_node = result.find("span", class_ = "yCeTE")
            review_title = title_node.text if title_node is not None else "N/A"

            review_num += 1
            each_review = [review_num, rating, year_month, reviewer_location, review_title, review_text]
            thewriter.writerow(each_review)

## 5. Shanghai Disneyland
Looped 141 times (2022.10.21)

In [24]:
# Scrape every Shanghai Disneyland review page and write all reviews to one CSV file.
with open("Disneyland_Shanghai_Reviews.csv", "w", newline = "", encoding = "utf8") as f:
    thewriter = writer(f)
    header = ["Review #", "Rating", "Year/Month", "Reviewer Location", "Review Title", "Review Text"]
    thewriter.writerow(header)
    review_num = 0
    # Regex used to strip digits out of the reviewer-location text.
    pattern = r"[0-9]"
    # The request headers never change, so build them once outside the loop.
    headers = {'User-Agent': 'Mozilla/5.0'}
    # The review offset embedded in the URL advances by 10 per page.
    for offset in range(0, 1410, 10):
        # Compare the integer offset before turning it into text: a str never
        # equals 0, which would make the first-page branch unreachable.
        if offset == 0:
            url = "https://www.tripadvisor.com/Attraction_Review-g308272-d10383031-Reviews-Shanghai_Disneyland-Shanghai.html"
        else:
            url = "https://www.tripadvisor.com/Attraction_Review-g308272-d10383031-Reviews-or" + str(offset) + "-Shanghai_Disneyland-Shanghai.html"
        # requests applies no timeout by default; set one so a stalled
        # connection cannot hang the whole scrape.
        page = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")
        # One iteration per review card inside the reviews tab.
        for result in soup.select('#tab-data-qa-reviews-0 [data-automation="reviewCard"]'):
            # Each element is looked up once; any field whose element is missing
            # is recorded as "N/A" so one incomplete card does not abort the run.

            # Keep only the first three characters of the icon's aria-label
            # (presumably the numeric score, e.g. "5.0"); the value stays a string.
            rating_icon = result.select_one("svg[aria-label]")
            rating = rating_icon["aria-label"][0:3] if rating_icon is not None else "N/A"

            # Keep only the first eight characters of the date element's text.
            date_node = result.find("div", class_ = "RpeCd")
            year_month = date_node.text[0:8] if date_node is not None else "N/A"

            # Drop the last 13 characters of the element's text, then remove all
            # digits; what remains is treated as the reviewer's location.
            location_node = result.find("div", class_ = "JINyA")
            if location_node is not None:
                reviewer_location = re.sub(pattern, "", location_node.text[:-13])
            else:
                reviewer_location = "N/A"
            if len(reviewer_location) < 3: # some reviews don't have location
                reviewer_location = "N/A"

            text_node = result.find("div", class_ = "biGQs _P pZUbB KxBGd")
            review_text = text_node.text if text_node is not None else "N/A"
            title_node = result.find("span", class_ = "yCeTE")
            review_title = title_node.text if title_node is not None else "N/A"

            review_num += 1
            each_review = [review_num, rating, year_month, reviewer_location, review_title, review_text]
            thewriter.writerow(each_review)

# Limitations
- The data cannot be scraped fully automatically; the number of reviews for each location has to be checked by hand to set the loop range.
- There may be better ways to write this code.
- The scraper crashes if a review is not written in English.