In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

In [2]:
# --- Scrape configuration ---------------------------------------------------
# Listing page of the airline's reviews; paginated URLs are derived from it.
base_url = "https://www.airlinequality.com/airline-reviews/united-airlines"

# How many listing pages to request, and how many reviews to ask for per page.
pages, page_size = 10, 200

# Accumulators: `reviews_data` receives one dict per scraped review in the
# scraping cell; `ratings` is not referenced by the cells visible here.
reviews_data, ratings = [], []

In [3]:
# Scrape `pages` listing pages and append one record (dict) per review to
# `reviews_data`. Each record holds the review text plus one entry per row of
# the review's ratings table, keyed by the row's header label.

# Start from an empty list so re-running this cell does not append a second
# copy of every review.
reviews_data = []

# File stem used when saving; replaced by the scraped airline name when one is
# found. Defined up front so it exists even if every request fails.
airline_name = "airline_reviews"
airline_name_tag = None

# Seconds to wait for the server before giving up on a page; without a timeout
# a stalled connection would block the notebook indefinitely.
request_timeout_s = 30

for i in range(1, pages + 1):
    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page; a network error skips the page
    # instead of aborting the whole scrape.
    try:
        response = requests.get(url, timeout=request_timeout_s)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page {i}: {exc}")
        continue

    # Ensure the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve page {i}, status code: {response.status_code}")
        continue

    parsed_content = BeautifulSoup(response.content, 'html.parser')

    # Resolve the airline name once, from the first page that parses. Looking
    # only at page 1 left the tag undefined whenever that request failed.
    if airline_name_tag is None:
        airline_name_tag = parsed_content.find(attrs={"itemprop": "name"})
        if airline_name_tag is not None:
            scraped_name = airline_name_tag.get_text(strip=True)
            # Make the name usable as a file stem; keep the fallback if empty.
            airline_name = scraped_name.replace(" ", "_").replace("/", "-") or "airline_reviews"

    for review_container in parsed_content.find_all("article", {"itemprop": "review"}):
        # Extract review text; a review without the expected div is kept with
        # a missing text rather than crashing the loop.
        text_div = review_container.find("div", {"class": "text_content"})
        review_text = text_div.get_text() if text_div is not None else None

        # Extract rating rows; a review without a ratings table has none.
        rating_table = review_container.find("table", {"class": "review-ratings"})
        rating_rows = rating_table.find_all("tr") if rating_table is not None else []

        # `valued_labels` only fixes key order: labels whose row has a
        # 'review-value' cell come first in the record. The values themselves
        # always come from `rating_data`, which is merged last.
        valued_labels = {}
        rating_data = {}

        for row in rating_rows:
            header_td = row.find("td", {"class": "review-rating-header"})
            if header_td is None:
                # Row without a header cell carries no label to store under.
                continue
            label = header_td.get_text(strip=True)

            value_td = row.find("td", {"class": "review-value"})
            if value_td is not None and not value_td.find_parent(class_="review-info"):
                valued_labels[label] = None
                value = value_td.get_text(strip=True)
            else:
                # NOTE(review): rows without a 'review-value' cell are stored
                # as None; the recorded output shows the "Seat Comfort" ...
                # "Value For Money" columns empty, so those rows presumably
                # use different markup -- confirm against the page HTML.
                value = None

            rating_data[label] = value

        # Add to list
        reviews_data.append({
            "review_text": review_text,
            **valued_labels,
            **rating_data
        })

    print(f"   ---> {len(reviews_data)} total reviews")

Scraping page 1
   ---> 200 total reviews
Scraping page 2
   ---> 400 total reviews
Scraping page 3
   ---> 600 total reviews
Scraping page 4
   ---> 800 total reviews
Scraping page 5
   ---> 1000 total reviews
Scraping page 6
   ---> 1200 total reviews
Scraping page 7
   ---> 1400 total reviews
Scraping page 8
   ---> 1600 total reviews
Scraping page 9
   ---> 1800 total reviews
Scraping page 10
   ---> 2000 total reviews


In [4]:
# Build one table from the scraped records: each dict in `reviews_data`
# becomes a row, and its keys become the columns.
reviews_df = pd.DataFrame(data=reviews_data)

In [5]:
# Preview the first five scraped reviews.
reviews_df.head(n=5)

Unnamed: 0,review_text,Type Of Traveller,Seat Type,Route,Date Flown,Recommended,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Aircraft
0,✅ Trip Verified | They canceled my flight on J...,Solo Leisure,Business Class,Tokyo to San Francisco,June 2023,no,,,,,,,,
1,Not Verified | A premium economy ticket comes...,Solo Leisure,Premium Economy,London to Chicago,October 2023,yes,,,,,,,,Boeing 767-300
2,✅ Trip Verified | I'd never recommend flying...,Solo Leisure,Economy Class,Munich to Minneapolis via Chicago,September 2023,no,,,,,,,,
3,✅ Trip Verified | I was flying from Houston to...,Couple Leisure,Economy Class,Houston to Munich,September 2023,no,,,,,,,,
4,Not Verified | Ground staff in Denver are rud...,Family Leisure,Economy Class,Memphis to Calgary via Denver,October 2023,no,,,,,,,,


In [6]:
# Directory the CSV is written to. Set the AIRLINE_REVIEWS_DIR environment
# variable to redirect the output on another machine; when it is unset the
# original hard-coded location is used.
save_path = os.environ.get(
    "AIRLINE_REVIEWS_DIR",
    r"C:/Users/saisu/OneDrive/Desktop/Airline Sentimental Analysis/Airline Reviews",
)

In [7]:
# Create the output directory if needed. `exist_ok=True` makes this a single
# call with no gap between "check" and "create", and it is a no-op when the
# directory is already there.
os.makedirs(save_path, exist_ok=True)

# Save to CSV, named after the scraped airline; the index is left out of the file.
csv_path = os.path.join(save_path, f"{airline_name}.csv")
reviews_df.to_csv(csv_path, index=False)