In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Import required libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import unicodedata
from datetime import datetime

#### Incremental Ingestion

In [6]:
#Get DB credentials
from dotenv import load_dotenv
from pathlib import Path
import os

# Load the variables declared in db_credentials.env into the process
# environment so they can be read back below.
dotenv_path = Path('db_credentials.env')
load_dotenv(dotenv_path=dotenv_path)

# Each setting is None when the corresponding variable is not defined.
DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD, ETL_DATE = (
    os.environ.get(setting)
    for setting in ('DB_HOST', 'DB_PORT', 'DB_NAME', 'DB_USER', 'DB_PASSWORD', 'ETL_DATE')
)

In [8]:
import psycopg2

# Latest review date already present in the silver table.
# Bound up front so it is always defined: it stays None when the table is
# empty or the query fails (e.g. the table does not exist yet), instead of
# being undefined on a fresh kernel or left over from an earlier run.
max_date = None

# Connect to the PostgreSQL database
connection = psycopg2.connect(
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=DB_PORT
)

try:
    # The cursor context manager closes the cursor even if the query raises.
    with connection.cursor() as cursor:
        cursor.execute('SELECT MAX("Review Date") FROM silver_airline_quality_reviews;')
        # MAX() over an empty table yields a single NULL row -> None.
        max_date = cursor.fetchone()[0]

    print("The maximum date is:", max_date)

except psycopg2.Error as error:
    # Database-side failure: report it and leave max_date as None.
    print(f"Error: {error}")

finally:
    connection.close()

The maximum date is: 2025-03-28 00:00:00


In [9]:
# Incremental ingestion watermark: the scraping loop keeps only reviews dated
# strictly after GET_REVIEWS_FROM_DATE. When no maximum date is available
# (max_date is None) the watermark falls back to 2002-01-01.
GET_REVIEWS_FROM_DATE = pd.to_datetime('2002-01-01' if max_date is None else max_date)

# The placeholder value 'CURRENT_DATE' is swapped for today's date (YYYYMMDD);
# any other ETL_DATE value is used as-is.
if ETL_DATE == 'CURRENT_DATE':
    ETL_DATE = f"{datetime.today():%Y%m%d}"

print(GET_REVIEWS_FROM_DATE, ETL_DATE, sep="\n")

2025-03-28 00:00:00
20250403


#### Airline Quality Reviews Web Scraping

In [13]:
# Initialize an empty list to store the airline names
airline_names = []

# Set custom User-Agent in the headers to mimic a browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36"
}

# The A-Z index is a single page. "#a2z-ldr-<letter>" is only a URL fragment,
# which is never sent to the server, so requesting it once per letter just
# downloads the same document 26 times. Fetch and parse it once instead.
url = "https://www.airlinequality.com/review-pages/a-z-airline-reviews/"
response = requests.get(url, headers=headers, timeout=30)
# Fail with the real HTTP error rather than a confusing parse error later.
response.raise_for_status()

# Parse the HTML content using BeautifulSoup with the "lxml" parser
soup = BeautifulSoup(response.text, "lxml")

# Loop through each letter of the alphabet (from A to Z)
for letter in range(ord('A'), ord('Z') + 1):
    l = chr(letter)

    # Id of the <div> that holds the airlines starting with the current letter
    uri = f"a2z-ldr-{l}"
    container = soup.find("div", {"id": uri})

    # A letter without a section is skipped instead of raising AttributeError.
    if container is None:
        continue

    # Each list item in the section is one airline name
    for items in container.find_all("li"):
        airline_names.append(items.text)

# An empty result means the page layout changed or the request was blocked;
# stop here instead of silently scraping nothing downstream.
if not airline_names:
    raise RuntimeError(f"No airline names found at {url}")

In [14]:
#airline_names

In [15]:
# Turn every airline name into the URL of its review page.
start = "https://www.airlinequality.com/airline-reviews/"
end = "?sortby=post_date%3ADesc&pagesize=100"

# Slug rules, applied in this order:
#   1. lower-case the name
#   2. NFKD-normalize and drop every non-ASCII character (accents are removed)
#   3. replace spaces with hyphens
airline_url = [
    start
    + unicodedata.normalize('NFKD', name.lower())
      .encode('ASCII', 'ignore')
      .decode('utf-8')
      .replace(" ", "-")
    + "/"
    + end
    for name in airline_names
]

In [16]:
#airline_url

In [17]:
# One row per airline: its display name and the URL of its review page.
df_airline = pd.DataFrame(dict(Name=airline_names, Links=airline_url))

#### Option to scrape a specific airline only

In [19]:
# Default: scrape every airline. This binds a second name to the same
# DataFrame object; no copy is made.
df_airline_filtered = df_airline
# Alternative: uncomment to keep only airlines whose name contains the given text.
#df_airline_filtered = df_airline[df_airline['Name'].str.contains('West Jet')]

# Bare last expression so the notebook renders the selected airlines.
df_airline_filtered

Unnamed: 0,Name,Links
0,AB Aviation,https://www.airlinequality.com/airline-reviews...
1,Adria Airways,https://www.airlinequality.com/airline-reviews...
2,Aegean Airlines,https://www.airlinequality.com/airline-reviews...
3,Aer Lingus,https://www.airlinequality.com/airline-reviews...
4,Aero VIP,https://www.airlinequality.com/airline-reviews...
...,...,...
567,Yangon Airways,https://www.airlinequality.com/airline-reviews...
568,Yemenia,https://www.airlinequality.com/airline-reviews...
569,Yeti Airlines,https://www.airlinequality.com/airline-reviews...
570,Zambia Airways,https://www.airlinequality.com/airline-reviews...


#### Add review image urls

In [21]:
# Row labels looked up (via columns.index) by the scraping cell to decide which
# slot of its 13-element per-review list a table value goes into.
# The final entry, "Top Review Image Url", has no slot in that list; the image
# URL is extracted separately from the review markup.
columns = [
    "Aircraft",
    "Type Of Traveller",
    "Seat Type",
    "Route",
    "Date Flown",
    "Seat Comfort",
    "Cabin Staff Service",
    "Food & Beverages",
    "Ground Service",
    "Inflight Entertainment",
    "Wifi & Connectivity",
    "Value For Money",
    "Recommended",
    "Top Review Image Url",
]

# Header of the final reviews DataFrame: seven review-level fields followed by
# the thirteen table fields, in the same order as in `columns`.
df_columns = [
    "Airline Name",
    "Overall_Rating",
    "Review_Title",
    "Review Date",
    "Verified",
    "Review",
    "Top Review Image Url",
    "Aircraft",
    "Type Of Traveller",
    "Seat Type",
    "Route",
    "Date Flown",
    "Seat Comfort",
    "Cabin Staff Service",
    "Food & Beverages",
    "Ground Service",
    "Inflight Entertainment",
    "Wifi & Connectivity",
    "Value For Money",
    "Recommended",
]

In [22]:
# Number of per-review table fields stored after the seven review-level fields.
RATING_SLOT_COUNT = 13


def _parse_overall_rating(article):
    """Return the leading score text of the "rating-10" div, or None if absent.

    A value starting with '10' yields '10'; otherwise the first character of
    the stripped text is returned.
    """
    rating_div = article.find("div", {"class": "rating-10"})
    if rating_div is None or not rating_div.text:
        return None
    rating_text = rating_div.text.strip()
    return '10' if rating_text[:2] == '10' else rating_text[:1]


def _parse_review_date(article):
    """Return the text of the <time> tag inside the review's <h3>, or None."""
    heading = article.find("h3")
    time_tag = heading.find("time") if heading is not None else None
    return time_tag.text if time_tag is not None else None


def _parse_review_text(article):
    """Return (verified, review_text) from the "text_content" div.

    The content has the form "<status> | <review>" when a status is present.
    Only the first "|" separates the two parts, so a review body that itself
    contains "|" is kept whole. Returns (False, None) when the div is missing.
    """
    content = article.find("div", {"class": "text_content"})
    if content is None:
        return False, None
    status, separator, body = content.text.partition("|")
    if not separator:
        # No status prefix: the whole text is the review.
        return False, status
    # Compare on the stripped status so surrounding whitespace does not matter.
    return status.strip() == '✅ Trip Verified', body


def _parse_image_url(article):
    """Return the src of the first image in the "large_review_img" div, or None."""
    image_div = article.find("div", {"class": "large_review_img"})
    if image_div is not None and image_div.img is not None:
        return image_div.img.get('src')
    return None


def _parse_rating_table(article):
    """Return the review's table values as a list of RATING_SLOT_COUNT items.

    Each table row is "<label> | <value>". The label selects the slot through
    its position in `columns`; plain cells store their text and star cells
    store the number of filled stars. Slots with no matching row stay None.
    Rows with an unknown label (or a label with no slot) are ignored.
    """
    ratings = [None] * RATING_SLOT_COUNT
    rating_labels = columns[:RATING_SLOT_COUNT]

    table = article.find("table")
    if table is None:
        return ratings

    for table_row in table.find_all("tr"):
        cells = table_row.find_all("td")
        if len(cells) < 2:
            continue
        label = cells[0].text
        if label not in rating_labels:
            continue
        slot = rating_labels.index(label)
        for cell in cells[1:]:
            if cell.find("span") is None:
                # Plain value (e.g. route, seat type, "yes"/"no")
                ratings[slot] = cell.text
            else:
                # Star rating: count the filled stars
                ratings[slot] = len(cell.find_all("span", {"class": "star fill"}))
    return ratings


reviews = []

# Loop through each row in the 'df_airline_filtered' DataFrame.
# NOTE(review): only the single page returned by each link is read (the URL
# asks for pagesize=100); reviews beyond that page are not fetched.
for index, row in df_airline_filtered.iterrows():
    # Fetch the airline's review page; the timeout keeps one stalled
    # connection from hanging the whole run.
    html = requests.get(row['Links'], headers=headers, timeout=30).text
    bs = BeautifulSoup(html, "html.parser")

    # Find the container that holds the airline's review information
    container = bs.find("article", {"class": "comp comp_reviews-airline querylist position-content"})

    # No container means the page is not a valid review page: skip the airline
    if container is None:
        continue

    print(row['Name'])

    # Every nested <article> is one review
    for items in container.find_all("article"):
        review_date = _parse_review_date(items)

        # Keep only dated reviews that are newer than the watermark
        if review_date is None or pd.to_datetime(review_date) <= GET_REVIEWS_FROM_DATE:
            continue

        rating = _parse_overall_rating(items)

        title_tag = items.find("h2")
        title = title_tag.text if title_tag is not None else None

        verified, review = _parse_review_text(items)
        first_image_src = _parse_image_url(items)
        tab = _parse_rating_table(items)

        # Field order must match df_columns
        data = [row['Name'], rating, title, review_date, verified, review, first_image_src] + tab
        reviews.append(data)

# The 'reviews' list now contains all the extracted review data for different airlines.

AB Aviation
Adria Airways
Aegean Airlines
Aer Lingus
Aero VIP
Aerocaribbean
Aeroflot Russian Airlines
AeroItalia
Aerolineas Argentinas
Aeromar
Aeromexico
Aerosur
Africa World Airlines
Afriqiyah Airways
Aigle Azur
Air Algerie
Air Antilles
Air Arabia
Air Astana
Air Austral
Air Bagan
Air Belgium
Air Berlin
Air Botswana
Air Burkina
Air Busan
Air Cairo
Air Canada
Air Canada rouge
Air Caraibes
Air China
Air Corsica
Air Costa
Air Cote d'Ivoire
Air Djibouti
Air Dolomiti
Air Europa
Air France
Air Greenland
Air Iceland Connect
Air India
Air India Express
Air Italy
Air Juan
Air KBZ
Air Koryo
Air Labrador
Air Macau
Air Madagascar
Air Malawi
Air Malta
Air Mauritius
Air Mediterranee
Air Memphis
Air Moldova
Air Montenegro
Air Namibia
Air New Zealand
Air Niugini
Air North Yukon's Airline
Air Nostrum
Air Panama
Air Pegasus
Air Premia
Air Rarotonga
Air Senegal
Air Serbia
Air Seychelles
Air Tahiti Nui
Air Tanzania
Air Transat
Air Zimbabwe
AirAsia
AirAsia India
AirAsia Philippines
AirAsia X
AirAsia Zest
a

In [23]:
#reviews

In [24]:
# Build the reviews DataFrame: one row per scraped review, labelled with df_columns.
df = pd.DataFrame(data=reviews, columns=df_columns)

In [25]:
# (rows, columns) of the scraped frame; the column count equals len(df_columns).
df.shape

(59, 20)

In [26]:
# Show up to 10 random reviews. An incremental run can return fewer than 10
# rows, and sampling more rows than exist raises ValueError, so cap the
# sample size at the frame length.
df.sample(min(10, len(df)))

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Top Review Image Url,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity,Value For Money,Recommended
9,Bluebird Airways,1,"""do not recommend this airline""",1st April 2025,True,Terrible experience with BlueBird Airways! ...,https://www.airlinequality.com/wp-content/uplo...,,Couple Leisure,Economy Class,Rome to Tel Aviv,March 2025,3.0,3.0,1.0,1.0,1.0,1.0,2,no
24,Frontier Airlines,1,"""the worst airline out there""",3rd April 2025,False,Third time I've flown Frontier and once agai...,,,Couple Leisure,Economy Class,Chicago to Orlando,April 2025,1.0,1.0,1.0,1.0,1.0,1.0,1,no
1,Aeromexico,1,"""Very disappointed""",29th March 2025,True,Flew from Mexico City to Toronto March 2025...,,,Couple Leisure,Economy Class,Mexico city to Toronto,March 2025,1.0,3.0,1.0,5.0,3.0,,2,no
41,Royal Brunei Airlines,9,"""A long but pleasant flight""",30th March 2025,True,A long but pleasant flight. Cabin crew were...,,Boeing 787-8,Solo Leisure,Economy Class,London to Bandai Seri Begawan via Dubai,March 2025,5.0,5.0,5.0,4.0,4.0,,5,yes
48,Super Air Jet,1,"""ended up missing my flight""",30th March 2025,True,I had a frustrating and disappointing exper...,,,Solo Leisure,Economy Class,Jakarta to Lombok,March 2025,1.0,1.0,,1.0,,,1,no
34,KLM Royal Dutch Airlines,2,"""I would not recommend KLM""",30th March 2025,True,We travelled KLM Glasgow to Amsterdam. Outg...,,,Couple Leisure,Economy Class,Glasgow to Amsterdam,March 2025,3.0,4.0,3.0,2.0,1.0,1.0,1,no
56,Volotea,10,"""Good experience""",31st March 2025,True,Good experience. The flight assistants hav...,,,Family Leisure,Economy Class,Firenze to Cagliari,March 2025,5.0,5.0,,5.0,,5.0,5,yes
54,Volaris,1,"""think twice before booking with Volaris""",2nd April 2025,True,I wasn’t able to fly with Volaris on March ...,,,Family Leisure,Economy Class,Mexico City to Oakland,March 2025,,,,1.0,,,1,no
49,Thai Airways,1,"""The worst customer service""",30th March 2025,True,The worst customer service I’ve seen. There...,,,Solo Leisure,Economy Class,Singapore to Hong Kong via Bangkok,March 2025,3.0,1.0,2.0,1.0,2.0,,1,no
13,China Airlines,1,“the seat is super tight”,2nd April 2025,True,Very bad. Small plane and the seat is super t...,,,Couple Leisure,Economy Class,Taipei City to Singapore,April 2025,1.0,1.0,1.0,1.0,1.0,1.0,1,no


In [27]:
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("data", exist_ok=True)
df.to_csv(f"data/bronze_{ETL_DATE}_Airline_Reviews_withImageUrls.csv")

In [28]:
# Peek at the first reviews whose overall score is the string '10'.
df.loc[df['Overall_Rating'].eq('10')].head()

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Top Review Image Url,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity,Value For Money,Recommended
2,Air Canada rouge,10,"""showing if the bathroom is occupied""",29th March 2025,False,Flight was awesome. Staff was awesome. My is...,https://www.airlinequality.com/wp-content/uplo...,,Couple Leisure,Economy Class,Toronto to Dominican Republic,March 2025,5.0,5.0,5.0,5.0,,5.0,5,yes
12,British Airways,10,"""Very good service on this route""",31st March 2025,False,Very good service on this route BA2710 30th ...,,A320,Couple Leisure,Business Class,London Gatwick to Fuerteventura,March 2025,4.0,5.0,5.0,4.0,,,4,yes
56,Volotea,10,"""Good experience""",31st March 2025,True,Good experience. The flight assistants hav...,,,Family Leisure,Economy Class,Firenze to Cagliari,March 2025,5.0,5.0,,5.0,,5.0,5,yes
57,Volotea,10,"""I recommend this company""",31st March 2025,False,I appreciate that this company is opening ro...,,A320,Couple Leisure,Economy Class,Brest to Strasbourg,March 2025,5.0,5.0,,5.0,,,5,yes
