In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL template for one page of an airline's reviews.
# First placeholder: the airline slug; second placeholder: the page number.
# The query string asks for newest-first ordering and 100 reviews per page.
base_url = (
    "https://www.airlinequality.com/airline-reviews/{}/"
    "?sortby=post_date%3ADesc&pagesize=100&page={}"
)

# Airlines to scrape: display name -> URL slug and the number of reviews
# used to compute how many pages to request.
airlines = {
    name: {"slug": slug, "total_reviews": review_count}
    for name, slug, review_count in (
        ("Qatar Airways", "qatar-airways", 2494),
        ("Singapore Airlines", "singapore-airlines", 1611),
        ("Emirates", "emirates", 2400),
        ("ANA All Nippon Airways", "ana-all-nippon-airways", 596),
        ("Cathay Pacific Airways", "cathay-pacific-airways", 1453),
        ("Japan Airlines", "japan-airlines", 423),
        ("Turkish Airlines", "turkish-airlines", 2654),
        ("EVA Air", "eva-air", 648),
        ("Air France", "air-france", 1392),
        ("Swiss International Air Lines", "swiss-international-air-lines", 1089),
    )
}

# Column-oriented storage: one independent, initially empty list per output
# column. Each scraped review appends exactly one value to every list.
data = {
    column: []
    for column in (
        "Airline",
        "Date",
        "Review Body",
        "Type of Traveller",
        "Seat Type",
        "Route",
        "Date Flown",
        "Food & Beverages",
        "Entertainment",
        "Seat Comfort",
        "Cabin Staff Service",
        "Value For Money",
        "Recommended",
    )
}

# Defining the scrape_reviews function and its private helpers

# Seconds to wait on a page request before giving up; without a timeout a
# stalled connection would block the whole scrape indefinitely.
_REQUEST_TIMEOUT = 30

# (column in `data`, row header in the ratings table, default when absent)
_DETAIL_COLUMNS = (
    ("Type of Traveller", "Type Of Traveller", "None"),
    ("Seat Type", "Seat Type", "None"),
    ("Route", "Route", "None"),
    ("Date Flown", "Date Flown", "None"),
    ("Food & Beverages", "Food & Beverages", "None"),
    ("Entertainment", "Inflight Entertainment", "None"),
    ("Seat Comfort", "Seat Comfort", "None"),
    ("Cabin Staff Service", "Cabin Staff Service", "None"),
    ("Value For Money", "Value For Money", "None"),
    ("Recommended", "Recommended", "no"),
)


def _parse_review_details(review):
    """Return a ``{header text: value}`` dict from a review's ratings table.

    Rows whose value cell carries the ``stars`` class are stored as the
    number of filled star spans (an int); all other rows are stored as the
    cell's stripped text. Returns an empty dict when the review has no
    ``review-ratings`` table.
    """
    details = {}
    table = review.find("table", class_="review-ratings")
    if not table:
        return details

    for row in table.find_all("tr"):
        header = row.find("td", class_="review-rating-header")
        value = (row.find("td", class_="review-value")
                 or row.find("td", class_="review-rating-stars"))
        if not (header and value):
            continue

        key = header.get_text(strip=True)
        if "stars" in value.get("class", []):
            # Star ratings: count the filled stars instead of reading text.
            details[key] = len(value.find_all("span", class_="star fill"))
        else:
            details[key] = value.get_text(strip=True)
    return details


def scrape_reviews(airline_name, slug, total_reviews):
    """Scrape the reviews of one airline into the module-level ``data`` dict.

    Pages are requested one at a time from ``base_url``. For every review
    found, one value is appended to each list in ``data``.

    Scraping of this airline stops early when a request raises a
    ``requests.RequestException`` (including a timeout), the server answers
    with a non-200 status, a page contains no review elements, or a page
    contains fewer reviews than a full page. Reviews collected before the
    stop are kept.

    Args:
        airline_name: Display name stored in the "Airline" column.
        slug: Airline identifier substituted into ``base_url``.
        total_reviews: Expected number of reviews; only used to bound the
            number of pages requested.

    Returns:
        The number of reviews appended to ``data`` by this call.
    """
    reviews_per_page = 100  # must agree with the pagesize= value in base_url
    total_pages = -(-total_reviews // reviews_per_page)  # ceiling division
    total_reviews_scraped = 0  # Counter for total reviews

    for page in range(1, total_pages + 1):
        url = base_url.format(slug, page)
        print(f"Scraping reviews from: {url}")

        # A network failure ends this airline's scrape instead of raising,
        # so rows already collected can still be saved by the caller.
        try:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data from {url} ({exc})")
            break

        if response.status_code != 200:
            print(f"Failed to retrieve data from {url}")
            break

        parsed_content = BeautifulSoup(response.content, 'html.parser')
        review_elements = parsed_content.find_all("div", class_="body")

        if not review_elements:
            print("No more reviews found or reached the end of available reviews.")
            break

        for review in review_elements:
            # Look each element up once, then fall back to a placeholder.
            text_node = review.find("div", class_="text_content")
            review_body = text_node.get_text(strip=True) if text_node else "No review text"

            time_node = review.find("time")
            review_date = time_node.get_text(strip=True) if time_node else "Unknown date"

            details = _parse_review_details(review)

            # Append exactly one value per column so all lists stay aligned.
            data["Airline"].append(airline_name)
            data["Date"].append(review_date)
            data["Review Body"].append(review_body)
            for column, header, default in _DETAIL_COLUMNS:
                data[column].append(details.get(header, default))

            total_reviews_scraped += 1

        print(f"Scraped {len(review_elements)} reviews from page {page}.")

        # If less than a full page of reviews was found, this was the last page
        if len(review_elements) < reviews_per_page:
            break

    print(f"Total reviews scraped for {airline_name}: {total_reviews_scraped}")
    return total_reviews_scraped

# Run the scraper once per configured airline; every call appends its rows
# to the shared `data` dict.
for airline_name, details in airlines.items():
    scrape_reviews(
        airline_name,
        slug=details["slug"],
        total_reviews=details["total_reviews"],
    )

# Build a DataFrame from the collected columns
df = pd.DataFrame(data)

# Write the DataFrame to CSV without the index column, then report the
# overall row count
df.to_csv('airline_reviews.csv', index=False)
total_rows = len(df)
print(f"Total number of reviews scraped across all airlines: {total_rows}")

# Show a preview of the first rows
preview = df.head()
print(preview)


Scraping reviews from: https://www.airlinequality.com/airline-reviews/qatar-airways/?sortby=post_date%3ADesc&pagesize=100&page=1
Scraped 100 reviews from page 1.
Scraping reviews from: https://www.airlinequality.com/airline-reviews/qatar-airways/?sortby=post_date%3ADesc&pagesize=100&page=2
Scraped 100 reviews from page 2.
Scraping reviews from: https://www.airlinequality.com/airline-reviews/qatar-airways/?sortby=post_date%3ADesc&pagesize=100&page=3
Scraped 100 reviews from page 3.
Scraping reviews from: https://www.airlinequality.com/airline-reviews/qatar-airways/?sortby=post_date%3ADesc&pagesize=100&page=4
Scraped 100 reviews from page 4.
Scraping reviews from: https://www.airlinequality.com/airline-reviews/qatar-airways/?sortby=post_date%3ADesc&pagesize=100&page=5
Scraped 100 reviews from page 5.
Scraping reviews from: https://www.airlinequality.com/airline-reviews/qatar-airways/?sortby=post_date%3ADesc&pagesize=100&page=6
Scraped 100 reviews from page 6.
Scraping reviews from: https