In [1]:
### Developed by Renato Cezar, based on the script "getting_started.ipynb" supplied by THE FORAGE
### to perform an on the job training for BRITISH AIRWAYS
### Last modification: May 13th 2023

In [1]:
import requests # For web page scrapping
from bs4 import BeautifulSoup # For HTML parsing and handling
import pandas as pd # For data frames and data series
import re as re # For RegEx
import datetime # For date and time handling
from dateutil.parser import parse # For date and time parsing 
import nbimporter # To use outter python codes and notebooks

In [3]:
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
# Set the number of pages to be scraped
pages = 20
# Set the number of registers per page
page_size = 100

parsed_content = []
lists_survey = []

# Loop to scrap content from each page:
for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page
    response = requests.get(url)

    # Pre Parse content
    content = response.content
    pre_parsed_content = BeautifulSoup(content, 'html.parser')
    
    # Split the articles from Pre Parsed Content and store them in a list of lists
    for article in pre_parsed_content.find_all("article", {"itemprop": "review"}):
        parsed_content.append(article)
    
    # Print the progress
    print(f"   ---> {len(parsed_content)} total reviews")

Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews
Scraping page 11
   ---> 1100 total reviews
Scraping page 12
   ---> 1200 total reviews
Scraping page 13
   ---> 1300 total reviews
Scraping page 14
   ---> 1400 total reviews
Scraping page 15
   ---> 1500 total reviews
Scraping page 16
   ---> 1600 total reviews
Scraping page 17
   ---> 1700 total reviews
Scraping page 18
   ---> 1800 total reviews
Scraping page 19
   ---> 1900 total reviews
Scraping page 20
   ---> 2000 total reviews


In [19]:
# Section to transform the data mass in parsed_content in a handsome dataset to be analyzed 
lists_survey = []
for i in range(0, len(parsed_content) - 1):
    # Gathering and organizing the data from the scrap
    # Date of survey
    date = parsed_content[i].find("time").contents[0]
    # Subject given by the surveyed
    subject = parsed_content[i].find(attrs={"class": "text_header"}).contents[0]
    # Name of the surveyed 
    name = parsed_content[i].find(attrs={"itemprop": "name"}).contents[0]
    # Country of the surveyed
    country = parsed_content[i].find("h3").contents[2]
    # Verification status of the surveyed trip 
    try:
        trip_verification = parsed_content[i].find("em").contents[0]
    except AttributeError:
        trip_verification = "NotAvailable" 
    # Commentary of the surveyed
    try:
        commentary = parsed_content[i].find(attrs={"class": "text_content"}).contents[2]
    except IndexError:
        try:
            commentary = parsed_content[i].find(attrs={"class": "text_content"}).contents[1]
        except IndexError:
            commentary = parsed_content[i].find(attrs={"class": "text_content"}).contents[0]
    # Aircraft model
    try:
        aircraft = parsed_content[i].find(attrs={"class": "review-rating-header aircraft"}).parent
        aircraft = aircraft.find(attrs={"class": "review-value"}).contents[0]
    except AttributeError:
        aircraft = "NotInformed"
    # Type of traveller
    try:
        type_traveller = parsed_content[i].find(attrs={"class": "review-rating-header type_of_traveller"}).parent
        type_traveller = type_traveller.find(attrs={"class": "review-value"}).contents[0]
    except AttributeError:
        seat_type = "NotInformed"
    # Seat/cabin type purchased
    try:
        seat_type = parsed_content[i].find(attrs={"class": "review-rating-header cabin_flown"}).parent
        seat_type =  seat_type.find(attrs={"class": "review-value"}).contents[0]
    except AttributeError:
        seat_type = "NotInformed"
    # Flight route
    try:
        route = parsed_content[i].find(attrs={"class": "review-rating-header route"}).parent
        route = route.find(attrs={"class": "review-value"}).contents[0]
        # Splitting the trip to catch separately "From", "To" and "Via"
        # First, verifity if theres is a "to"
        to = route.find(' to ') # if there is no "to", it will no be possible to securely splity this trip
        if to == -1: # So take all the string as "from"
            route_from = route
            route_to = "NotApplicable"
            route_via = "NotApplicable"
        else: # In case of existing a "to", check if there is a "via"
            via = route.find(' via ') # if there is no "via", just take the "from" and "to"
            if via == -1:
                route_from = route.split(" to ")[0]
                route_to = route.split(" to ")[1]
                route_via = "NotApplicable"
            else: # if there is a via, so take the "from", "to" and "via"
                route_from = route.split(" to ")[0]
                route_to_via = route.split(" to ")[1]
                route_to = route_to_via.split(" via ")[0]
                route_via = route.split(" via ")[1]
    except AttributeError:
        route_from = "NotInformed"
        route_to = "NotInformed"
        route_via = "NotInformed"
    # Date flown
    try:
        date_flown = parsed_content[i].find(attrs={"class": "review-rating-header date_flown"}).parent
        date_flown = date_flown.find(attrs={"class": "review-value"}).contents[0]
    except AttributeError:
        date_flown = "NotInformed"
    # Star ratings (coded to take only the maximum star rating)
    # Seat Comfort
    try:
        stars_seat_comfort = len((parsed_content[i].find(attrs={"class": "review-rating-header seat_comfort"}).parent).find_all("span", {"class": "star fill"}))
    except AttributeError:
        stars_seat_comfort = "NotEvaluated"
    # Cabin Staff Service
    try:      
        stars_cabin_staff_service = len((parsed_content[i].find(attrs={"class": "review-rating-header cabin_staff_service"}).parent).find_all("span", {"class": "star fill"}))
    except AttributeError:
        stars_cabin_staff_service = "NotEvaluated"    
    # Food and Beverage
    try:
        stars_food_beverage = len((parsed_content[i].find(attrs={"class": "review-rating-header food_and_beverages"}).parent).find_all("span", {"class": "star fill"}))
    except AttributeError:
        stars_food_beverage = "NotEvaluated"
    # Inflight Entertainment
    try:
        stars_inflight_entertainment = len((parsed_content[i].find(attrs={"class": "review-rating-header inflight_entertainment"}).parent).find_all("span", {"class": "star fill"}))
    except AttributeError:
        stars_inflight_entertainment = "NotEvaluated"
    # Ground Services
    try:
        stars_ground_services = len((parsed_content[i].find(attrs={"class": "review-rating-header ground_service"}).parent).find_all("span", {"class": "star fill"}))
    except AttributeError:
        stars_ground_services = "NotEvaluated"
    # WiFi and Connectivity
    try:
        stars_wifi_connectivity = len((parsed_content[i].find(attrs={"class": "review-rating-header wifi_and_connectivity"}).parent).find_all("span", {"class": "star fill"}))
    except AttributeError:
        stars_wifi_connectivity = "NotEvaluated"    
    # Value for Money
    try:
        stars_value_money = len((parsed_content[i].find(attrs={"class": "review-rating-header value_for_money"}).parent).find_all("span", {"class": "star fill"}))
    except AttributeError:
        stars_value_money = "NotEvaluated"
    # Recomended
    recommended = parsed_content[i].find(attrs={"class": "review-rating-header recommended"}).parent
    recommended = recommended.find(attrs={"class": re.compile("review-value rating-*")}).contents[0]

    # Storing the organized data as list in a set of lists
    lists_survey.append([date, subject, name, country, trip_verification, commentary, aircraft, type_traveller, 
                         seat_type, route_from, route_to, route_via, date_flown, stars_seat_comfort, 
                         stars_cabin_staff_service, stars_food_beverage, stars_inflight_entertainment, 
                         stars_ground_services, stars_wifi_connectivity, stars_value_money, recommended])

In [28]:
# Treating the text to remove any unnecessary chars (e.g: "|", "(" and ")") from Subject_Surveyer, Country_Surveyer and Commentary_Surveyer,
# formating the date in Survey_Date to the pattern "DD/MM/YYYY" and Date_Flown to "MM/YYYY"
for i in range(0, len(lists_survey)):
    #lists_survey[i][1] = lists_survey[i][1].replace('"', "")
    lists_survey[i][3] = lists_survey[i][3][lists_survey[i][3].find('(')+1 : lists_survey[i][3].find(')')]
    lists_survey[i][5] = lists_survey[i][5].replace("|", "")
    
    lists_survey[i][0] = parse(lists_survey[i][0]).strftime('%d/%m/%Y')
    lists_survey[i][12] = parse(lists_survey[i][12]).strftime('%m/%Y')

In [29]:
# Creating and populating the data frame with the se of lists to handle the survey data in a tabular way
dataframe_survey = pd.DataFrame(lists_survey, columns=['Survey_Date', "Subject_Surveyer", "Name_Surveyer", "Country_Surveyer", 
                                 "Trip_Verification", "Commentary_Surveyer", "Aircraft", "Traveller_Type", 
                                 "Seat_Type", "Route_From", "Route_To", "Route_Via", "Date_Flown", "Seat_Comfort", 
                                 "Cabin_Services", "Food_Beverage", "Inflight_Entertainment", "Ground_Services", 
                                 "WiFi_Connectivity", "Value_Money","Recommended"])

Unnamed: 0,Survey_Date,Subject_Surveyer,Name_Surveyer,Country_Surveyer,Trip_Verification,Commentary_Surveyer,Aircraft,Traveller_Type,Seat_Type,Route_From,...,Route_Via,Date_Flown,Seat_Comfort,Cabin_Services,Food_Beverage,Inflight_Entertainment,Ground_Services,WiFi_Connectivity,Value_Money,Recommended
0,08/05/2023,"""most appalling airline service""",C Peale,Austral,Trip Verified,I will never travel with British Airways ag...,Boeing 787-9,Solo Leisure,Business Class,Sydney,...,Singapore,05/2023,1,1,1,1,1,1,1,no
1,06/05/2023,"""Customer service is shocking""",Jason Wickert,United Kingd,Trip Verified,I am already in Portugal so contacted them ...,NotInformed,Solo Leisure,Business Class,Heathrow,...,NotApplicable,05/2023,1,1,1,1,1,1,1,no
2,06/05/2023,"""Avoid this airline""",S Dasirou,United Kingd,Trip Verified,Terrible. Avoid this airline. It’s now beco...,NotInformed,Business,Economy Class,London,...,NotApplicable,05/2023,NotEvaluated,NotEvaluated,NotEvaluated,NotEvaluated,1,NotEvaluated,1,no
3,01/05/2023,"""a lot of actual lies being told""",Alexander George,United Kingd,Trip Verified,"Despite being a gold member, the British Ai...",NotInformed,Business,Economy Class,New York,...,NotApplicable,04/2023,4,3,3,3,1,4,1,no
4,28/04/2023,"""premium price for very average service""",G Jones,United Kingd,Not Verified,Regarding the aircraft and seat: The busines...,Boeing 787,Couple Leisure,Business Class,Singapore,...,NotApplicable,04/2023,4,2,1,5,3,1,1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,14/08/2016,"""friendly and professional""",B Richardson,United Kingd,NotAvailable,London Heathrow - Vancouver - London Heathr...,A380,Business,Economy Class,LHR,...,NotApplicable,08/2016,3,5,5,5,3,NotEvaluated,3,yes
1995,14/08/2016,"""A380 is unconscionably crammed""",W Jackson,United Stat,NotAvailable,Flew London Heathrow to Washington DC. The ...,A380,Couple Leisure,Business Class,LHR,...,NotApplicable,08/2016,1,3,3,4,3,NotEvaluated,3,no
1996,14/08/2016,"""no respect for economy travellers""",R Anderson,United Kingd,NotAvailable,"I fly this route 3-4 times a year, unfortun...",NotInformed,Solo Leisure,Economy Class,LHR,...,NotApplicable,08/2016,1,2,1,2,2,NotEvaluated,1,no
1997,13/08/2016,"""very friendly staff""",F Moix,Switzerla,NotAvailable,Flew Zurich to London Heathrow. Very friendly ...,A321,Solo Leisure,Economy Class,ZRH,...,NotApplicable,08/2016,5,5,5,NotEvaluated,5,NotEvaluated,5,yes


In [18]:
# Save the data set as an Excel file
dataframe_survey.to_excel("BA_reviews-" + datetime.datetime.now().strftime("%d%b%Y_%H%M%S") + ".xlsx", sheet_name="reviews", index=True)

In [15]:
%run -i BA-Helper.py
airport = iata_converter()

In [16]:
print(airport)

     IATA  ICAO               Airport_Name
0     AAA  NTGA                       Anaa
1     AAB  YARY                   Arrabury
2     AAC  HEAR                   El Arish
3     AAE  DABB  Rabah Bitat (Les Salines)
4     AAF  KAAF               Apalachicola
...   ...   ...                        ...
7800  ZYI  ZUZY                      Zunyi
7801  ZYL  VGSY                     Osmany
7802  ZZO  UHSO                  Zonalnoye
7803  ZZU  FWUU                      Mzuzu
7804  ZZV  KZZV                 Zanesville

[7805 rows x 3 columns]
