In [70]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import os

# BRITISH AIRWAYS WEB SCRAPING

In [274]:
base_url = 'https://www.airlinequality.com/airline-reviews/british-airways/page/'
total_pages = 355

# Create an empty list to store all the review data
all_data = []

# Loop through each page and scrape the reviews
for page in range(1, total_pages + 1):
    # Make a request to the page URL
    url = base_url + str(page) + '/'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the review elements on the page
    reviews = soup.find_all('div', class_='body')

    # Extract the desired information from each review
    for review in reviews:
        # Extract header if available, otherwise set to "NA"
        header = review.h2.text if review.h2 else np.nan
        
        # Extract country, date, and name if available, otherwise set to "NA"
        country_element = review.find("h3", class_="text_sub_header userStatusWrapper")
        country = country_element.span.nextSibling.replace("(", "").replace(")", "") if country_element else np.nan
        date = country_element.time.text if country_element else np.nan
        name = country_element.span.text if country_element else np.nan
        
        review_element = review.find("div", class_="text_content").text
        
        # Split review element into verification and review if available, otherwise set to "NA"
        split_elements = review_element.split(" | ")
        verification = split_elements[0] if len(split_elements) >= 1 else np.nan
        review_text = split_elements[1] if len(split_elements) >= 2 else np.nan
        
        # Extract type of traveller if available, otherwise set to "NA"
        type_of_traveller_element = review.find("td", class_="review-rating-header type_of_traveller")
        type_of_traveller = type_of_traveller_element.nextSibling.text if type_of_traveller_element else np.nan 
        
        # Extract seat type if available, otherwise set to "NA"
        seat_type_element = review.find("td", class_="review-rating-header cabin_flown")
        seat_type = seat_type_element.nextSibling.text if seat_type_element else np.nan
        
        # Extract route if available, otherwise set to "NA"
        route_element = review.find("td", class_="review-rating-header route")
        route = route_element.nextSibling.text if route_element else np.nan
        
        # Extract date flown if available, otherwise set to "NA"
        date_flown_element = review.find("td", class_="review-rating-header date_flown")
        date_flown = date_flown_element.nextSibling.text if date_flown_element else np.nan

        # Define a list of all category names in the desired order
        category_names = [
            'Seat Comfort',
            'Cabin Staff Service',
            'Food & Beverages',
            'Inflight Entertainment',
            'Ground Service',
            'Wifi & Connectivity',
            'Value For Money'
        ]

        # Extract the ratings for each category
        review_values = []
        for category_name in category_names:
            category_element = None
            for category in review.select('td.review-rating-header'):
                if category.text.strip() == category_name:
                    category_element = category
                    break
            if category_element:
                category_rating = None
                for rating in review.select('td.review-rating-stars.stars'):
                    if rating.find_previous('td', class_='review-rating-header').text.strip() == category_name:
                        category_rating = rating
                        break
                if category_rating:
                    stars = category_rating.select('span.star.fill')
                    last_star = stars[-1].text if stars else np.nan
                    review_values.append(last_star)
                else:
                    review_values.append(np.nan)
            else:
                review_values.append(np.nan)

        # Extract the individual ratings
        seat_comfort = review_values[0]
        cabin_staff_service = review_values[1]
        food_beverages = review_values[2]
        inflight_entertainment = review_values[3]
        ground_service = review_values[4]
        wifi_connectivity = review_values[5]
        value_for_money = review_values[6]
        
        # Extract the recommendation if available, otherwise set to "NA"
        recommended_element = review.find('td', class_="review-rating-header recommended" ).nextSibling
        recommended = recommended_element.text if recommended_element else np.nan

        data = {
            "name": name,
            "date": date,
            'country': country,
            'review_title': header,
            "verification": verification,
            "review": review_text,
            "type_of_traveller": type_of_traveller,
            "seat_type": seat_type,
            "route": route,
            "date_flown": date_flown,
            "seat_comfort": seat_comfort,
            "cabin_staff_service": cabin_staff_service,
            "food_beverages": food_beverages,
            "inflight_entertainment": inflight_entertainment,
            "ground_service": ground_service,
            "wifi_connectivity": wifi_connectivity,
            "value_for_money": value_for_money,
            "recommended": recommended
        }

        # Append the review data to the list
        all_data.append(data)

# Create a DataFrame from the list of review data
df = pd.DataFrame(all_data)

# Print the DataFrame
(df)


Unnamed: 0,name,date,country,review_title,verification,review,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_beverages,inflight_entertainment,ground_service,wifi_connectivity,value_for_money,recommended
0,\nN Mayle,19th May 2023,United States,"""BA is on the skids downhill""",✅ Trip Verified,Words fail to describe this last awful flight...,Solo Leisure,Business Class,London to San Francisco,September 2022,3,1,1,4,2,,2,no
1,\nE Heale,17th May 2023,United States,"""Absolutely terrible experience""",✅ Trip Verified,Absolutely terrible experience. The app would...,Solo Leisure,Economy Class,London to Dallas,April 2023,1,1,3,3,1,1,1,no
2,\nH Mike,17th May 2023,United Kingdom,"""poor service and unhappy customers""",✅ Trip Verified,BA overbook every flight to maximise their inc...,Business,Economy Class,London to Madrid,May 2023,4,3,1,,1,,1,no
3,\nRichard Cruise,17th May 2023,United Kingdom,"""just won't use them again""",✅ Trip Verified,"\r\nThe flights were all on time, except Belf...",Solo Leisure,Economy Class,London to Belfast,May 2023,1,3,3,1,1,1,1,no
4,\nKathi Blanning,14th May 2023,United States,"""Another bad show""",Not Verified,Only the second time flying BA as first time w...,Couple Leisure,Business Class,Los Angeles to London,May 2023,5,5,4,5,1,2,1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3543,\nJ Tinning,29th August 2012,United Kingdom,British Airways customer review,Flew LHR - VIE return operated by bmi but BA a...,,,Economy Class,,,5,5,5,,,,4,yes
3544,\nNick Berry,28th August 2012,United Kingdom,British Airways customer review,LHR to HAM. Purser addresses all club passenge...,,,Business Class,,,4,5,4,,,,3,yes
3545,\nAvril Barclay,12th October 2011,United Kingdom,British Airways customer review,My son who had worked for British Airways urge...,,,Economy Class,,,,,,,,,4,yes
3546,\nC Volz,11th October 2011,United States,British Airways customer review,London City-New York JFK via Shannon on A318 b...,,,Premium Economy,,,1,3,5,,,,1,no


# SAVING DATA

In [6]:
os.getcwd()
os.chdir('C:\\Users\\sejal Jadev\\downloads')

In [None]:
df.to_excel("british_airways.xlsx")

In [7]:
data = pd.read_excel("british_airways.xlsx")

In [8]:
data.isna().sum()


Unnamed: 0                   0
name                         0
date                         0
country                      0
review_title                 0
verification                 0
review                    1528
type_of_traveller          770
seat_type                    2
route                      775
date_flown                 778
seat_comfort               104
cabin_staff_service        114
food_beverages             349
inflight_entertainment    1073
ground_service             840
wifi_connectivity         2996
value_for_money              1
recommended                  0
dtype: int64

In [9]:
data.shape

(3548, 19)