## Import necessary libraries

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Scraping content from website using BeautifulSoup

In [12]:
base_url = "https://www.airlinequality.com/airline-reviews/british-airways/"
pages = 35
page_size = 100
request_timeout = 30  # seconds; keeps a stalled connection from hanging the cell forever

reviews = []  # one [header, sub_header, content] row per scraped review


def _element_text(parent, tag, css_class):
    """Return the text of the first `tag` child with class `css_class`.

    Newlines are replaced by spaces. Returns "" when no such child exists,
    instead of raising AttributeError on the missing element.
    """
    node = parent.find(tag, class_=css_class)
    if node is None:
        return ""
    return node.text.replace("\n", " ")


# base_url already ends with "/", so drop it before appending the page path;
# otherwise the request URL contains a double slash ("british-airways//page/1/").
listing_root = base_url.rstrip("/")

# Loop through the paginated listing to collect the reviews
for i in range(1, pages + 1):
    print(f'Scraping page {i}')

    # URL of the i-th listing page, sorted by post date (descending)
    url = f'{listing_root}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}'

    # Fetch the page; fail loudly on HTTP errors rather than parsing an error page
    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()

    # Parse the raw HTML with BeautifulSoup
    html = response.content
    parsed_content = BeautifulSoup(html, 'html.parser')

    # Every <div class="body"> is treated as one review container
    para = parsed_content.find_all("div", {"class": "body"})

    # Extract the title, author/date line and review text from each container
    for par in para:
        header = _element_text(par, "h2", "text_header")
        sub_header = _element_text(par, "h3", "text_sub_header")
        content = _element_text(par, "div", "text_content")

        # A container without review text is not a usable review; skip it
        if not content:
            continue

        # add the received text to reviews array
        reviews.append([header, sub_header, content])

    # print the running total after each page
    print(f'  ---> {len(reviews)} total reviews')

Scraping page 1
  ---> 100 total reviews
Scraping page 2
  ---> 200 total reviews
Scraping page 3
  ---> 300 total reviews
Scraping page 4
  ---> 400 total reviews
Scraping page 5
  ---> 500 total reviews
Scraping page 6
  ---> 600 total reviews
Scraping page 7
  ---> 700 total reviews
Scraping page 8
  ---> 800 total reviews
Scraping page 9
  ---> 900 total reviews
Scraping page 10
  ---> 1000 total reviews
Scraping page 11
  ---> 1100 total reviews
Scraping page 12
  ---> 1200 total reviews
Scraping page 13
  ---> 1300 total reviews
Scraping page 14
  ---> 1400 total reviews
Scraping page 15
  ---> 1500 total reviews
Scraping page 16
  ---> 1600 total reviews
Scraping page 17
  ---> 1700 total reviews
Scraping page 18
  ---> 1800 total reviews
Scraping page 19
  ---> 1900 total reviews
Scraping page 20
  ---> 2000 total reviews
Scraping page 21
  ---> 2100 total reviews
Scraping page 22
  ---> 2200 total reviews
Scraping page 23
  ---> 2300 total reviews
Scraping page 24
  ---> 2400 

In [13]:
# Build the review table in one step. Naming the columns in the constructor
# (instead of assigning df.columns afterwards) also yields a well-formed,
# empty three-column frame when `reviews` is empty, rather than raising a
# length-mismatch ValueError.
df = pd.DataFrame(reviews, columns=["Review", "User Info", "Review_Details"])
df.head(10)

Unnamed: 0,Review,User Info,Review_Details
0,"""worst business class experience""",I Carsen (Australia) 5th March 2024,✅ Trip Verified | The worst business class ex...
1,"""it's truly awful for short-haul""",L Kelly (Canada) 4th March 2024,Not Verified | Quite possibly the worst busin...
2,"""never be flying with BA again""",Jana Chua (Singapore) 4th March 2024,Not Verified | I will never be flying with BA...
3,"""it was extremely underwhelming""",E Gayerlo (United Kingdom) 4th March 2024,✅ Trip Verified | On the my trip to Mexico Ci...
4,"""an excellent experience""",T Collins (United Kingdom) 2nd March 2024,✅ Trip Verified | I upgraded at check in to C...
5,"""they refused to fly me back!",Armando Castro (Australia) 1st March 2024,✅ Trip Verified | I bought a return trip with...
6,"""Poor from start to finish""",Simon Channon (United Kingdom) 29th February...,✅ Trip Verified | Poor from start to finish. ...
7,“customer service non existent”,C Allen (United Kingdom) 29th February 2024,✅ Trip Verified | Communication and customer s...
8,"""no better than the worst of the low cost""",Peter Hansell (United Kingdom) 23rd February...,✅ Trip Verified | That was supposed to be my ...
9,"""could not have been better""",Caleb Lowe (United Kingdom) 21st February 2024,✅ Trip Verified | Have no fear when your BA f...


In [14]:
# Persist the raw scraped reviews; no index argument is passed, so pandas' default index handling applies.
df.to_csv(path_or_buf='data/3500_BA_reviews.csv')

## Preprocess Text

In [15]:
import re

In [16]:
# Strip the "✅ Trip Verified |" marker (and the whitespace around it) wherever it occurs in the frame.
df.replace(
    to_replace=r'\s*✅ Trip Verified \|\s*',
    value="",
    regex=True,
    inplace=True,
)
df

Unnamed: 0,Review,User Info,Review_Details
0,"""worst business class experience""",I Carsen (Australia) 5th March 2024,The worst business class experience. Ground cr...
1,"""it's truly awful for short-haul""",L Kelly (Canada) 4th March 2024,Not Verified | Quite possibly the worst busin...
2,"""never be flying with BA again""",Jana Chua (Singapore) 4th March 2024,Not Verified | I will never be flying with BA...
3,"""it was extremely underwhelming""",E Gayerlo (United Kingdom) 4th March 2024,"On the my trip to Mexico City, I had the oppor..."
4,"""an excellent experience""",T Collins (United Kingdom) 2nd March 2024,I upgraded at check in to Club Europe seat 1D ...
...,...,...,...
3495,British Airways customer review,Julie Jacoby (Australia) 14th September 2014,Flew SYD-SIN and had a reasonably comfortable ...
3496,British Airways customer review,J Henson (United Kingdom) 14th September 2014,Madrid-London on 9 September on Boeing 767 air...
3497,British Airways customer review,Paula Loguda (United States) 14th September ...,9/8/14 LHR-EWR in WC. Clean aircraft but moder...
3498,British Airways customer review,B Kennedy (United Kingdom) 14th September 2014,Flew London City to Malaga return 28th August ...


In [18]:
# Strip the "✅ Verified Review |" marker (and the whitespace around it) wherever it occurs in the frame.
df.replace(
    to_replace=r'\s*✅ Verified Review \|\s*',
    value="",
    regex=True,
    inplace=True,
)
df

Unnamed: 0,Review,User Info,Review_Details
0,"""worst business class experience""",I Carsen (Australia) 5th March 2024,The worst business class experience. Ground cr...
1,"""it's truly awful for short-haul""",L Kelly (Canada) 4th March 2024,Not Verified | Quite possibly the worst busin...
2,"""never be flying with BA again""",Jana Chua (Singapore) 4th March 2024,Not Verified | I will never be flying with BA...
3,"""it was extremely underwhelming""",E Gayerlo (United Kingdom) 4th March 2024,"On the my trip to Mexico City, I had the oppor..."
4,"""an excellent experience""",T Collins (United Kingdom) 2nd March 2024,I upgraded at check in to Club Europe seat 1D ...
...,...,...,...
3495,British Airways customer review,Julie Jacoby (Australia) 14th September 2014,Flew SYD-SIN and had a reasonably comfortable ...
3496,British Airways customer review,J Henson (United Kingdom) 14th September 2014,Madrid-London on 9 September on Boeing 767 air...
3497,British Airways customer review,Paula Loguda (United States) 14th September ...,9/8/14 LHR-EWR in WC. Clean aircraft but moder...
3498,British Airways customer review,B Kennedy (United Kingdom) 14th September 2014,Flew London City to Malaga return 28th August ...


In [19]:
# Strip the "Not Verified |" marker (and the whitespace around it) wherever it occurs in the frame.
df.replace(
    to_replace=r'\s*Not Verified \|\s*',
    value="",
    regex=True,
    inplace=True,
)
df

Unnamed: 0,Review,User Info,Review_Details
0,"""worst business class experience""",I Carsen (Australia) 5th March 2024,The worst business class experience. Ground cr...
1,"""it's truly awful for short-haul""",L Kelly (Canada) 4th March 2024,Quite possibly the worst business class I have...
2,"""never be flying with BA again""",Jana Chua (Singapore) 4th March 2024,I will never be flying with BA again. This is ...
3,"""it was extremely underwhelming""",E Gayerlo (United Kingdom) 4th March 2024,"On the my trip to Mexico City, I had the oppor..."
4,"""an excellent experience""",T Collins (United Kingdom) 2nd March 2024,I upgraded at check in to Club Europe seat 1D ...
...,...,...,...
3495,British Airways customer review,Julie Jacoby (Australia) 14th September 2014,Flew SYD-SIN and had a reasonably comfortable ...
3496,British Airways customer review,J Henson (United Kingdom) 14th September 2014,Madrid-London on 9 September on Boeing 767 air...
3497,British Airways customer review,Paula Loguda (United States) 14th September ...,9/8/14 LHR-EWR in WC. Clean aircraft but moder...
3498,British Airways customer review,B Kennedy (United Kingdom) 14th September 2014,Flew London City to Malaga return 28th August ...


In [20]:
# Strip the "Unverified |" marker (and the whitespace around it) wherever it occurs in the frame.
df.replace(
    to_replace=r'\s*Unverified \|\s*',
    value="",
    regex=True,
    inplace=True,
)
df

Unnamed: 0,Review,User Info,Review_Details
0,"""worst business class experience""",I Carsen (Australia) 5th March 2024,The worst business class experience. Ground cr...
1,"""it's truly awful for short-haul""",L Kelly (Canada) 4th March 2024,Quite possibly the worst business class I have...
2,"""never be flying with BA again""",Jana Chua (Singapore) 4th March 2024,I will never be flying with BA again. This is ...
3,"""it was extremely underwhelming""",E Gayerlo (United Kingdom) 4th March 2024,"On the my trip to Mexico City, I had the oppor..."
4,"""an excellent experience""",T Collins (United Kingdom) 2nd March 2024,I upgraded at check in to Club Europe seat 1D ...
...,...,...,...
3495,British Airways customer review,Julie Jacoby (Australia) 14th September 2014,Flew SYD-SIN and had a reasonably comfortable ...
3496,British Airways customer review,J Henson (United Kingdom) 14th September 2014,Madrid-London on 9 September on Boeing 767 air...
3497,British Airways customer review,Paula Loguda (United States) 14th September ...,9/8/14 LHR-EWR in WC. Clean aircraft but moder...
3498,British Airways customer review,B Kennedy (United Kingdom) 14th September 2014,Flew London City to Malaga return 28th August ...


In [25]:
# Persist the frame after the verification markers have been stripped.
df.to_csv(path_or_buf="data/3500_BA_Reviews_Cleaned.csv")

In [34]:
# Drop the title and author/date columns; `df` itself is left unchanged.
data = df.drop(columns=["Review", "User Info"])
data

Unnamed: 0,Review_Details
0,The worst business class experience. Ground cr...
1,Quite possibly the worst business class I have...
2,I will never be flying with BA again. This is ...
3,"On the my trip to Mexico City, I had the oppor..."
4,I upgraded at check in to Club Europe seat 1D ...
...,...
3495,Flew SYD-SIN and had a reasonably comfortable ...
3496,Madrid-London on 9 September on Boeing 767 air...
3497,9/8/14 LHR-EWR in WC. Clean aircraft but moder...
3498,Flew London City to Malaga return 28th August ...


In [35]:
# Persist the reduced frame; index=False leaves the row index out of the file.
data.to_csv(
    path_or_buf="data/3500_BA_Reviews_Cleaned_Dropped.csv",
    index=False,
)