In [29]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scraper configuration: the paginated review-listing endpoint and a
# browser-like User-Agent header sent with every request.
base_url = "https://www.airlinequality.com/airline-reviews/british-airways/page/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}

# Accumulator that receives one dict per parsed review.
reviews = []

# Seconds to wait on a single page request before giving up; without a
# timeout, requests will wait indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30
# Pause between page requests to avoid getting blocked.
PAGE_DELAY = 2


def _text_or_none(tag):
    """Return the stripped text of *tag*, or None when the element is absent."""
    if tag is None:
        return None
    return tag.text.strip()


def _parse_review(article):
    """Build a flat dict of fields from one review ``<article>`` element.

    Fields whose element is missing from the markup are stored as None
    instead of raising, so one absent element (for example a review with no
    overall rating) does not discard the entire review.

    Args:
        article: the BeautifulSoup tag of a single review.

    Returns:
        dict with the keys ``header``, ``author``, ``date``, ``place``,
        ``content``, ``rating`` and ``trip_verified``, plus one key per row
        found in the review's ``review-stats`` table.
    """
    # The place is the text between the first "(" and the following ")" of
    # the sub-header; it stays None when the sub-header or the parentheses
    # are missing.
    place = None
    sub_header = article.find("h3", class_="text_sub_header")
    if sub_header is not None:
        parts = sub_header.text.split("(")
        if len(parts) > 1:
            place = parts[1].split(")")[0].strip()

    content = _text_or_none(article.find("div", class_="text_content"))

    review_data = {
        "header": _text_or_none(article.find("h2", class_="text_header")),
        "author": _text_or_none(article.find("span", itemprop="name")),
        "date": _text_or_none(article.find("time", itemprop="datePublished")),
        "place": place,
        "content": content,
        "rating": _text_or_none(article.find("span", itemprop="ratingValue")),
        # The verification marker is searched for inside the review body text.
        "trip_verified": "Trip Verified" in (content or ""),
    }

    review_stats = article.find("div", class_="review-stats")
    if review_stats is not None:
        for row in review_stats.find_all("tr"):
            label = _text_or_none(row.find("td", class_="review-rating-header"))
            if label is None:
                # A row without a header cell has no usable column name.
                continue
            key = label.lower().replace(" ", "_")
            value = row.find("td", class_="review-value")
            if value is not None:
                review_data[key] = value.text.strip()
            else:
                # No text value cell: count the filled star spans instead.
                review_data[key] = len(row.find_all("span", class_="star fill"))

    return review_data


# Loop through the first 30 pages
for page in range(1, 31):
    url = f"{base_url}{page}/?sortby=post_date%3ADesc&pagesize=100"

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # A network error on one page must not abort the whole run and lose
        # the reviews collected so far.
        print(f"Failed to retrieve page {page}: {exc}")
        time.sleep(PAGE_DELAY)
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve page {page}")
        # Still pause so a failing server is not hit again immediately.
        time.sleep(PAGE_DELAY)
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    review_articles = soup.find_all("article", itemprop="review")

    for article in review_articles:
        # Per-review best-effort boundary: anything unexpected in one
        # review is reported and the remaining reviews are still processed.
        try:
            reviews.append(_parse_review(article))
        except Exception as e:
            print(f"Error parsing a review: {e}")

    print(f"Scraped page {page}")
    time.sleep(PAGE_DELAY)  # Sleep to avoid getting blocked

# Destination of the exported CSV. For a location relative to the working
# directory, "british_airways_reviews.csv" can be used instead.
output_path = r'C:\Users\eddsw\OneDrive\Desktop\PORTFOLIO\Web scraping\british_airways_reviews.csv'

# One row per collected review; the DataFrame index is not written to the file.
df = pd.DataFrame(reviews)
df.to_csv(output_path, index=False)
print("Scraping complete. Data saved to british_airways_reviews.csv")


Scraped page 1
Scraped page 2
Scraped page 3
Scraped page 4
Scraped page 5
Scraped page 6
Scraped page 7
Scraped page 8
Scraped page 9
Scraped page 10
Scraped page 11
Scraped page 12
Scraped page 13
Scraped page 14
Scraped page 15
Scraped page 16
Scraped page 17
Scraped page 18
Scraped page 19
Scraped page 20
Scraped page 21
Scraped page 22
Scraped page 23
Scraped page 24
Scraped page 25
Scraped page 26
Scraped page 27
Scraped page 28
Scraped page 29
Scraped page 30
Scraping complete. Data saved to british_airways_reviews.csv
