In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import unicodedata


# Initialize an empty list to store the airline names
airline_names = []

# Set custom User-Agent in the headers to mimic a browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36"
}

# Loop through each letter of the alphabet (from A to Z)
for letter in range(ord('A'), ord('Z') + 1):
    l = chr(letter)  # Get the character corresponding to the current ASCII code (e.g., 'A' for 65, 'B' for 66, etc.)

    # Build the URL for the specific letter to fetch airline names starting with that letter
    url = f"https://www.airlinequality.com/review-pages/a-z-airline-reviews/#a2z-ldr-{l}"

    # Send an HTTP GET request to fetch the webpage's HTML content using the custom User-Agent in the headers
    html_text = requests.get(url, headers=headers).text

    # Parse the HTML content using BeautifulSoup with the "lxml" parser
    soup = BeautifulSoup(html_text, "lxml")

    # Create a unique identifier (URI) to locate the container for the current letter
    uri = f"a2z-ldr-{l}"

    # Find the container that holds the list of airlines starting with the current letter
    container = soup.find("div", {"id": uri})

    # Loop through each list item in the container and extract the airline name
    for items in container.find_all("li"):
        airline_names.append(items.text)

# List of specific airline names of interest

#airlines_of_interest = ["American Airlines", "Delta Air Lines", "United Airlines", "Southwest Airlines", "Alaska Airlines"]
airlines_of_interest = ["Jet Airways", "Spirit Airlines", "Frontier Airlines", "Hawaiian Airlines"]

# To convert names of airlines into URLs
start = "https://www.airlinequality.com/airline-reviews/"
end = "?sortby=post_date%3ADesc&pagesize=7000"
airline_url = []

for items in airline_names:
    # Convert names to lowercase
    items = items.lower()

    # Replace special characters with their ASCII equivalents
    items = unicodedata.normalize('NFKD', items).encode('ASCII', 'ignore').decode('utf-8')

    # Replace space with hyphen
    items = items.replace(" ", "-")

    # Create URL
    airline_url.append(start + items + "/" + end)

df_airline = pd.DataFrame({"Name": airline_names, "Links": airline_url})

columns = ["Aircraft", "Type Of Traveller", "Seat Type", "Route", "Date Flown", "Seat Comfort",
           "Cabin Staff Service", "Food & Beverages", "Ground Service", "Inflight Entertainment",
           "Wifi & Connectivity", "Value For Money", "Recommended"]

df_columns = ["Airline Name", "Overall_Rating", "Review_Title", "Review Date", "Verified", "Review"] + columns

reviews = []

# Loop through each row in the 'df_airline' DataFrame
for index, row in df_airline.iterrows():
    # Check if the airline name is in the list of airlines you're interested in
    if row['Name'] in airlines_of_interest:
        # Send an HTTP GET request to fetch the HTML content of the airline's review page
        html = requests.get(row['Links'], headers=headers).text
        bs = BeautifulSoup(html, "html.parser")

        # Find the container that holds the airline's review information
        container = bs.find("article", {"class": "comp comp_reviews-airline querylist position-content"})

        # Check if the container exists (i.e., the review page is valid)
        if container:
            # Extract the airline's name from the 'Name' column of the DataFrame
            print(row['Name'])

            # Loop through each review article in the container and extract relevant details
            for items in container.find_all("article"):
                verified = False
                # Extract the overall rating from the review
                rating = items.find("div", {"class": "rating-10"})
                if rating:
                    rating = rating.text.strip()[:1]
                else:
                    rating = None

                # Extract the review title from the review
                title = items.find("h2")
                if title:
                    title = title.text
                else:
                    title = None

                # Extract the review date from the review
                time = items.find("h3").find("time")
                if time:
                    time = time.text
                else:
                    time = None

                # Extract the review content from the review
                text = items.find("div", {"class": "text_content"}).text
                text = text.split("|")
                if len(text) == 1:
                    reviews = text[0]
               


reviews = []

# Loop through each row in the 'df_airline' DataFrame
for index, row in df_airline.iterrows():
    # Check if the airline name is in the list of airlines you're interested in
    if row['Name'] in airlines_of_interest:
        # Send an HTTP GET request to fetch the HTML content of the airline's review page
        html = requests.get(row['Links'], headers=headers).text
        bs = BeautifulSoup(html, "html.parser")

        # Find the container that holds the airline's review information
        container = bs.find("article", {"class": "comp comp_reviews-airline querylist position-content"})

        # Check if the container exists (i.e., the review page is valid)
        if container:
            # Extract the airline's name from the 'Name' column of the DataFrame
            airline_name = row['Name']

            # Loop through each review article in the container and extract relevant details
            for items in container.find_all("article"):
                verified = False
                # Extract the overall rating from the review
                rating = items.find("div", {"class": "rating-10"})
                if rating:
                    rating = rating.text.strip()[:1]
                else:
                    rating = None

                # Extract the review title from the review
                title = items.find("h2")
                if title:
                    title = title.text
                else:
                    title = None

                # Extract the review date from the review
                time = items.find("h3").find("time")
                if time:
                    time = time.text
                else:
                    time = None

                # Extract the review content from the review
                text = items.find("div", {"class": "text_content"}).text
                text = text.split("|")
                if len(text) == 1:
                    review = text[0]
                else:
                    if text[0] == '✅ Trip Verified ':
                        verified = True
                    review = text[1]

                # Extract ratings for specific categories from the review
                table = items.find("table")
                tab = [None] * len(columns)
                for item in table.find_all("tr"):
                    i = 0
                    for td in item.find_all("td"):
                        if i == 0:
                            condition = td.text
                            # Finding index of the given condition in the 'columns' list
                            ind = columns.index(condition)
                            i = 1
                        else:
                            # Checking if it's a rating or not
                            if td.find("span") is None:
                                value = td.text
                                tab[ind] = value
                            # In case it's a rating, counting stars filled (i.e., stars given)
                            else:
                                value = 0
                                for star in td.find_all("span", {"class": "star fill"}):
                                    value += 1
                                tab[ind] = value

                # Store the extracted review data in a list named 'data'
                data = [airline_name, rating, title, time, verified, review] + tab
                reviews.append(data)

# Create a DataFrame from the 'reviews' list
df = pd.DataFrame(reviews, columns=df_columns)
df.to_csv("Airline_review.csv")


# Display the DataFrame
print(df)

Frontier Airlines
Hawaiian Airlines
Jet Airways
Spirit Airlines
            Airline Name Overall_Rating                        Review_Title  \
0      Frontier Airlines              1               "Absolutely horrific"   
1      Frontier Airlines              5               “I felt disrespected”   
2      Frontier Airlines              1  "I will never fly with them again"   
3      Frontier Airlines              1        "The worst airline possible"   
4      Frontier Airlines              1  "We will never fly Frontier again"   
...                  ...            ...                                 ...   
10178    Spirit Airlines              n     Spirit Airlines customer review   
10179    Spirit Airlines              n     Spirit Airlines customer review   
10180    Spirit Airlines              n     Spirit Airlines customer review   
10181    Spirit Airlines              1     Spirit Airlines customer review   
10182    Spirit Airlines              n     Spirit Airlines custome

In [5]:
df=pd.DataFrame()
df["reviews"]=reviews
df
df.to_csv("Airlines_reviews.csv")