### Importing required libraries

In [4]:
#BeautifulSoup: A library used to parse HTML and extract data from web pages.
from bs4 import BeautifulSoup
#requests: A library to send HTTP requests to websites and get the HTML of web pages.
import requests
#pandas: A library used for data manipulation and analysis.
import pandas
#csv: A module for writing and handling CSV (Comma Separated Values) files.
import csv
#re: Python’s module for regular expressions, used to match patterns in strings.
import re

### Data Mining

In [6]:
#this cell is a pulse check on whether the page can be fetched and parsed.
#stores the URL of the webpage to scrape in the variable url
url = "https://www.airlinequality.com/airline-reviews/british-airways"

#fetches the webpage once and stores the response in the variable page.
#the timeout stops the cell from hanging indefinitely if the site does not answer.
page = requests.get(url, timeout=10)

#raises requests.HTTPError on a 4xx/5xx response so an error page is not parsed as if it were data.
page.raise_for_status()

#parses the webpage content (page.text) into a structured format using BeautifulSoup.
#'html.parser' is named explicitly so this cell uses the same parser as main().
soup = BeautifulSoup(page.text, 'html.parser')

In [8]:
#finds all instances of 'td' within the HTML and shows only the first few,
#so the cell output stays short enough to read.
soup.find_all('td')[:10]

[<td class="review-rating-header food-beverages">Food &amp; Beverages</td>,
 <td class="review-rating-stars stars">
 <span class="star fill">1</span><span class="star fill">2</span><span class="star fill">3</span><span class="star">4</span><span class="star">5</span> </td>,
 <td class="review-rating-header inflight-entertainment">Inflight Entertainment</td>,
 <td class="review-rating-stars stars">
 <span class="star fill">1</span><span class="star fill">2</span><span class="star fill">3</span><span class="star">4</span><span class="star">5</span> </td>,
 <td class="review-rating-header seat-comfort">Seat Comfort</td>,
 <td class="review-rating-stars stars">
 <span class="star fill">1</span><span class="star fill">2</span><span class="star fill">3</span><span class="star">4</span><span class="star">5</span> </td>,
 <td class="review-rating-header staff-service">Staff Service</td>,
 <td class="review-rating-stars stars">
 <span class="star fill">1</span><span class="star fill">2</span><s

In [10]:
# Helper: run one lookup and return its stripped text, or a default when nothing matches
def _first_text(node, default, *args, **kwargs):
    """Return the stripped text of ``node.find(*args, **kwargs)``, or ``default`` if there is no match."""
    match = node.find(*args, **kwargs)
    return match.text.strip() if match is not None else default


# Function to extract the data of a single review element
def extract_review_data(review):
    """Extract one CSV row of fields from a single review element.

    Parameters
    ----------
    review : element exposing ``find`` and ``find_all``
        The review container; ``main`` passes each ``<article itemprop="review">``.

    Returns
    -------
    list
        ``[title, author, country, rating, overall_rating, date_published,
        type_of_traveller, seat_type, route, recommended, content]``.
        Fields that cannot be found are ``"N/A"``; a missing ``<h3>`` gives
        ``"Unknown"`` for the country.
    """
    title = _first_text(review, "N/A", 'h2', class_="text_header")
    author = _first_text(review, "N/A", 'span', itemprop="name")

    # The country is taken from the parenthesised part of the <h3> text.
    heading = review.find('h3')
    country = get_country(heading.text) if heading is not None else "Unknown"

    rating = _first_text(review, "N/A", 'span', itemprop="ratingValue")
    # overall_rating is the text of the "bestRating" span.
    overall_rating = _first_text(review, "N/A", 'span', itemprop="bestRating")

    # .get() instead of ['content'] so a meta tag without that attribute
    # yields "N/A" rather than raising KeyError.
    published = review.find('meta', itemprop="datePublished")
    date_published = published.get('content', "N/A") if published is not None else "N/A"

    # Table rows of interest, keyed by their header text; defaults are kept
    # for any row that is absent from the review.
    details = {
        "Type Of Traveller": "N/A",
        "Seat Type": "N/A",
        "Route": "N/A",
        "Recommended": "N/A",
    }
    for row in review.find_all('tr'):
        header_td = row.find('td', class_="review-rating-header")
        value_td = row.find('td', class_="review-value")
        # Rows without a "review-value" cell (e.g. star ratings) are skipped.
        if header_td is None or value_td is None:
            continue
        header = header_td.text.strip()
        if header in details:
            details[header] = value_td.text.strip()

    # Review body text
    content = _first_text(review, "N/A", 'div', class_="text_content")

    return [
        title, author, country, rating, overall_rating, date_published,
        details["Type Of Traveller"], details["Seat Type"],
        details["Route"], details["Recommended"], content,
    ]

In [12]:
# Pull the country out of the parenthesised part of an author line
def get_country(author_text):
    """Return the text between the last "(" and the next ")" in ``author_text``.

    The result is stripped of surrounding whitespace. If the string does not
    contain both a "(" and a ")", "Unknown" is returned.
    """
    # Guard clause: both kinds of bracket must appear somewhere in the text.
    if "(" not in author_text or ")" not in author_text:
        return "Unknown"

    # Everything after the last opening bracket ...
    after_open = author_text.rpartition("(")[2]
    # ... up to the first closing bracket that follows it.
    inside = after_open.partition(")")[0]
    return inside.strip()

# Main function to fetch reviews and save to CSV
def main(output_path='BA_reviews_dataset.csv', first_page=1, last_page=344,
         max_attempts=3, retry_delay=2.0):
    """Scrape the British Airways review pages and write them to a CSV file.

    Parameters
    ----------
    output_path : str
        CSV file to create (overwritten if it already exists).
    first_page, last_page : int
        Inclusive range of review-page numbers to download.
    max_attempts : int
        How many times each page is requested before it is given up on.
    retry_delay : float
        Base pause in seconds between attempts; the pause grows with the
        attempt number.

    A page that still fails after ``max_attempts`` is reported with an
    "Error on page N" message and skipped; scraping continues with the next page.
    """
    import time  # local import: only needed for the pause between retries

    with open(output_path, 'w', encoding='utf8', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        header = ['Title', 'Author', 'Country', 'Rating', 'Overall_Rating', 'Date_Published',
                  'Type_of_Traveller', 'Seat_Type', 'Route', 'Recommended', 'Content']
        csvwriter.writerow(header)

        with requests.Session() as session:
            base_url = "https://www.airlinequality.com/airline-reviews/british-airways/page/"
            for page_number in range(first_page, last_page + 1):
                for attempt in range(1, max_attempts + 1):
                    try:
                        response = session.get(f"{base_url}{page_number}", timeout=10)
                        # Treat 4xx/5xx as failures instead of parsing an error page.
                        response.raise_for_status()
                    except requests.exceptions.RequestException as e:
                        if attempt == max_attempts:
                            print(f"Error on page {page_number}: {e}")
                        else:
                            # Transient timeouts are common; wait a little and retry.
                            time.sleep(retry_delay * attempt)
                        continue

                    soup = BeautifulSoup(response.content, 'html.parser')
                    reviews = soup.find_all('article', itemprop="review")
                    csvwriter.writerows(extract_review_data(review) for review in reviews)
                    break  # page done; stop retrying

# Run the main function
if __name__ == "__main__":
    main()


Error on page 1: HTTPSConnectionPool(host='www.airlinequality.com', port=443): Read timed out. (read timeout=10)
Error on page 24: HTTPSConnectionPool(host='www.airlinequality.com', port=443): Read timed out. (read timeout=10)
Error on page 44: HTTPSConnectionPool(host='www.airlinequality.com', port=443): Read timed out. (read timeout=10)


In [14]:
# Load the CSV file written by main() into a DataFrame.
review_df = pandas.read_csv('BA_reviews_dataset.csv')

In [16]:
# Preview the first rows to check that the columns were read as expected.
review_df.head()

Unnamed: 0,Title,Author,Country,Rating,Overall_Rating,Date_Published,Type_of_Traveller,Seat_Type,Route,Recommended,Content
0,"""never fly with them again""",Erika Greyling,United Kingdom,1.0,10.0,2024-11-03,Couple Leisure,Economy Class,Munich to London Heathrow,no,✅ Trip Verified | I recently travelled from ...
1,"""still have not heard any updates""",S Wozniak,United States,3.0,10.0,2024-11-03,Couple Leisure,Premium Economy,Heathrow to Boston,no,Not Verified | I paid for seats 80 A and B on...
2,"""cabin crew were nice""",Barnaby Emmerson,United Kingdom,7.0,10.0,2024-11-03,Family Leisure,Economy Class,Los Angeles to London Heathrow,yes,"Not Verified | The flight wasn’t that bad, alt..."
3,"""support staff wash their hands of you""",Charlotte Parsons,United Kingdom,1.0,10.0,2024-11-02,Family Leisure,Premium Economy,Vancouver to London,no,✅ Trip Verified | I decided to treat myself a...
4,"""no fuss, no bother experience""",R. Wrightman,Canada,9.0,10.0,2024-11-02,Solo Leisure,Economy Class,Vancouver to Gatwick,yes,Not Verified | I was very impressed with thei...


In [18]:
# Show the dtype pandas inferred for each column.
review_df.dtypes

Title                 object
Author                object
Country               object
Rating               float64
Overall_Rating       float64
Date_Published        object
Type_of_Traveller     object
Seat_Type             object
Route                 object
Recommended           object
Content               object
dtype: object