In [1]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time

In [3]:
# Path of the CSV file that save_to_csv() writes the cleaned reviews to.
# It is a relative path, so it resolves against the current working directory.
csv_file = 'scraped_reviews2.csv'

In [5]:
def check_connection(url, timeout=10):
    """
    Check if we can establish a connection to the specified URL.

    Args:
    - url (str): The URL to check the connection.
    - timeout (float): Seconds to wait for the server before giving up.

    Returns:
    - bool: True if the final response has status 200, False if the status
      differs or the request fails for any reason (DNS error, refused
      connection, timeout, too many redirects, ...).
    """
    try:
        # requests.head() does not follow redirects unless asked to, so a
        # redirected URL would otherwise look unreachable here even though a
        # later GET (which follows redirects) would succeed. The timeout keeps
        # an unresponsive server from blocking forever.
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        # Base class of every requests error, including ConnectionError and Timeout.
        return False

In [7]:
# (header substring, output column) pairs for review-stats rows that carry a
# text value. Headers are compared lower-cased; the first matching pair wins.
_TEXT_STAT_COLUMNS = (
    ("type of traveller", 'Type of Traveller'),
    ("seat type", 'Seat Type'),
    ("route", 'Route'),
    ("recommended", 'Recommended Service'),
    ("date flown", 'Date Flown'),
)

# Same idea for review-stats rows whose value is a strip of star icons.
_STAR_STAT_COLUMNS = (
    ("food & beverages", 'Food & Beverages'),
    ("inflight entertainment", 'Inflight Entertainment'),
    ("ground service", 'Ground Service'),
    ("value for money", 'Value for Money'),
    ("cabin staff service", 'Cabin Staff Service'),
    ("seat comfort", 'Seat Comfort'),
)


def _text_or_na(tag):
    """Return the stripped text of a parsed tag, or "N/A" when the tag is absent."""
    return tag.text.strip() if tag is not None else "N/A"


def _author_location(review):
    """Return the text node that directly follows the review's author span, or "N/A"."""
    # Search inside this review only: searching the whole page would return the
    # first author's location for every review.
    author_span = review.find('span', itemprop='author')
    if author_span is None:
        return "N/A"
    sibling = author_span.next_sibling
    # next_sibling can be None or another tag; only a text node has a usable value.
    if isinstance(sibling, str):
        return sibling.strip() or "N/A"
    return "N/A"


def _parse_stats(review):
    """Read the 'review-stats' table of one review into a {column: value} dict."""
    details = {column: "N/A" for _, column in _TEXT_STAT_COLUMNS + _STAR_STAT_COLUMNS}

    stats = review.find('div', class_='review-stats')
    if stats is None:
        return details

    for row in stats.find_all('tr'):
        header = row.find('td', class_='review-rating-header')
        if header is None:
            continue
        header_text = header.text.strip().lower()

        value = row.find('td', class_='review-value')
        for needle, column in _TEXT_STAT_COLUMNS:
            if needle in header_text:
                details[column] = _text_or_na(value)
                break

        stars = row.find('td', class_='review-rating-stars stars')
        if stars is not None:
            # The rating is the number of filled star icons in the row.
            star_count = len(stars.find_all('span', class_='star fill'))
            for needle, column in _STAR_STAT_COLUMNS:
                if needle in header_text:
                    details[column] = star_count
                    break

    return details


def _parse_review(review):
    """Convert one review <article> element into a flat dictionary of fields."""
    published = review.find('time', itemprop='datePublished')
    details = _parse_stats(review)

    return {
        'Rating': _text_or_na(review.find('span', itemprop='ratingValue')),
        'Author': _text_or_na(review.find('span', itemprop='name')),
        'Author Location': _author_location(review),
        # .get() avoids a KeyError when the <time> tag has no datetime attribute.
        'Review Date': published.get('datetime', "N/A") if published is not None else "N/A",
        'Review Title': _text_or_na(review.find('h2', class_='text_header')),
        'Review': _text_or_na(review.find('div', class_='text_content', itemprop='reviewBody')),
        'Type of Traveller': details['Type of Traveller'],
        'Seat Type': details['Seat Type'],
        'Route': details['Route'],
        'Date Flown': details['Date Flown'],
        'Seat Comfort': details['Seat Comfort'],
        'Cabin Staff Service': details['Cabin Staff Service'],
        'Food & Beverages': details['Food & Beverages'],
        'Inflight Entertainment': details['Inflight Entertainment'],
        'Ground Service': details['Ground Service'],
        'Value for Money': details['Value for Money'],
        'Recommended Service': details['Recommended Service'],
    }


def fetch_reviews(page_number, offset, timeout=30):
    """
    Fetch reviews from the airline quality website for a specific page number.

    Args:
    - page_number (int): The page number to fetch.
    - offset (int): The number of reviews per page (sent as the `pagesize`
      query parameter).
    - timeout (float): Seconds to wait for the server before giving up.

    Returns:
    - list: A list of dictionaries containing the review data, one per review.
      Fields that are missing from the page are reported as "N/A". An empty
      list is returned when the request fails or the page holds no reviews.
    """
    url = f"https://www.airlinequality.com/airline-reviews/ethiopian-airlines/page/{page_number}/?sortby=post_date%3ADesc&pagesize={offset}"

    print(f"Fetching reviews from page {page_number} with offset: {offset}.")

    try:
        # Send a GET request to fetch webpage content; the timeout keeps a
        # stalled server from hanging the whole scrape.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {page_number}: {e}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    reviews = soup.find_all('article', itemprop='review')

    if not reviews:
        print("No reviews found on this page.")
        return []

    return [_parse_review(review) for review in reviews]

In [9]:
def check_data_added_to_dataframe(reviews_df):
    """
    Report whether the DataFrame holds any rows and preview up to ten of them.

    Args:
    - reviews_df (DataFrame): The DataFrame to inspect.
    """
    # Guard clause: nothing to preview for an empty frame.
    if reviews_df.empty:
        print("\nNo data has been added to the DataFrame.")
        return

    preview = reviews_df.head(10)
    print("\nData has been successfully added to the DataFrame.")
    print("Here are the first 10 rows:")
    print(preview)

In [11]:
def perform_data_quality_checks(reviews_df, stage='Initial'):
    """
    Perform data quality checks on the DataFrame and print the findings.

    The report covers: a 10-row preview, the column/dtype summary, descriptive
    statistics, the distinct values of the two date columns, the frame's
    dimensions, null counts and the number of fully duplicated rows.

    Args:
    - reviews_df (DataFrame): The DataFrame to check. It must contain the
      'Date Flown' and 'Review Date' columns.
    - stage (str): Stage of the data (Initial or After Cleaning); only used
      in the report heading.
    """
    print(f"\n--- Data Quality Checks at {stage} Stage ---")
    print(reviews_df.head(10))

    print("\n\nData Summary:")
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # wrapping it in print() would add a stray "None" line to the output.
    reviews_df.info()

    print("\n\nStatistical Summary:")
    print(reviews_df.describe())

    # Check unique values in 'Date Flown' and 'Review Date' for diagnosis
    print(reviews_df['Date Flown'].unique())
    print(reviews_df['Review Date'].unique())

    # Check the shape of the DataFrame (number of rows and columns)
    numRows, numColumns = reviews_df.shape
    numCells = numRows * numColumns
    print(f'\nNumber of rows: {numRows} \nNumber of columns: {numColumns} \nTotal number of cells: {numCells}')

    # Null values: first the per-column counts, then the grand total
    nullValues = reviews_df.isnull().sum()
    totalNulls = nullValues.sum()
    print(f'\nTotal number of null values:\n{nullValues}')
    print(f'Total number of null values: {totalNulls}')

    # Check duplicates (rows identical in every column)
    checkDuplicates = reviews_df.duplicated().sum()
    print(f'Total number of duplicate rows: {checkDuplicates}')

In [13]:
def clean_data(reviews_df):
    """
    Clean the DataFrame containing review data.

    Steps, in order:
    1. Convert the rating columns to numbers; unparseable values (such as the
       "N/A" placeholder) become 0.
    2. Drop rows that contain any null value.
    3. Parse 'Date Flown' (format "%B %Y", e.g. "January 2025") and
       'Review Date' as datetimes and drop rows where either fails to parse.
    4. Strip quotation marks from 'Review Title' and parentheses from
       'Author Location'.
    5. Drop rows that repeat an earlier ('Author', 'Review') pair.

    The input frame is left untouched; all work happens on a copy.

    Args:
    - reviews_df (DataFrame): The DataFrame to clean.

    Returns:
    - DataFrame: The cleaned DataFrame (original index labels are kept).
    """
    # Work on a copy so the caller's raw frame is not modified as a side effect.
    cleaned = reviews_df.copy()

    # Convert relevant columns to numeric, coercing errors to NaN, then fill
    # the NaN values with zeros.
    columns_to_convert = ['Rating', 'Food & Beverages', 'Inflight Entertainment',
                          'Seat Comfort', 'Cabin Staff Service', 'Ground Service']
    # 'Value for Money' is a star rating like the others; convert it too when
    # present so the column does not end up mixing integers with "N/A" strings.
    if 'Value for Money' in cleaned.columns:
        columns_to_convert.append('Value for Money')
    for column in columns_to_convert:
        cleaned[column] = pd.to_numeric(cleaned[column], errors='coerce').fillna(0)

    # Drop all rows with any null values; the explicit copy keeps the column
    # assignments below from triggering SettingWithCopy warnings.
    cleaned = cleaned.dropna().copy()

    # Format data from text to DateTime data type
    cleaned['Date Flown'] = pd.to_datetime(cleaned['Date Flown'], format='%B %Y', errors='coerce')
    cleaned['Review Date'] = pd.to_datetime(cleaned['Review Date'], errors='coerce')

    # Drop rows with NaT values in the date columns after conversion
    cleaned = cleaned.dropna(subset=['Date Flown', 'Review Date']).copy()

    # Remove quotation marks / parentheses. regex=False makes the literal match
    # explicit: '(' and ')' are not valid patterns on their own, and the
    # default for `regex` differs between pandas versions.
    cleaned['Review Title'] = (
        cleaned['Review Title']
        .str.replace('“', '', regex=False)
        .str.replace('”', '', regex=False)
        .str.strip('"')
    )
    cleaned['Author Location'] = (
        cleaned['Author Location']
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )

    # Remove duplicates: same author posting the same review text
    cleaned = cleaned.drop_duplicates(subset=['Author', 'Review'], keep='first')

    return cleaned

In [21]:
def save_to_csv(data, path=None):
    """
    Save a cleaned DataFrame to a CSV file, appending if the file already has content.

    Appended rows are written without a header row, so they line up with the
    existing file by position only: the columns of `data` must be in the same
    order as the columns already in the file.

    Args:
    - data (DataFrame): The cleaned DataFrame to save.
    - path (str, optional): Destination file. Defaults to the module-level
      `csv_file` path.
    """
    target = csv_file if path is None else path

    if data.empty:
        print("No data to save.")
        return

    # Only skip the header when there is already something in the file; an
    # existing but empty file still needs the header row.
    has_content = os.path.exists(target) and os.path.getsize(target) > 0
    if has_content:
        data.to_csv(target, mode='a', index=False, header=False)
    else:
        data.to_csv(target, index=False)
    print(f"Saved {len(data)} reviews to '{target}'.")

In [25]:
def main(first_page=1, last_page=7, page_size=100, delay_seconds=5):
    """
    Run the whole pipeline: scrape the review pages, report on data quality,
    clean the data and save the result to CSV.

    Args:
    - first_page (int): First review page to fetch.
    - last_page (int): Last review page to fetch (inclusive).
    - page_size (int): Number of reviews requested per page.
    - delay_seconds (float): Pause between two page requests.
    """
    url = 'https://www.airlinequality.com/airline-reviews/ethiopian-airlines/'
    if not check_connection(url):
        print("Failed to connect to the URL. Please check your connection or URL.")
        return

    print("Connection successful. Proceeding to fetch data...")
    all_reviews = []
    for page in range(first_page, last_page + 1):
        all_reviews.extend(fetch_reviews(page, page_size))
        # Pause between requests only; there is nothing to wait for after the last page.
        if page < last_page:
            time.sleep(delay_seconds)

    if not all_reviews:
        print("No reviews were fetched; nothing to clean or save.")
        return

    reviews_df = pd.DataFrame(all_reviews)

    # Check if data is added to the DataFrame
    check_data_added_to_dataframe(reviews_df)

    # Perform initial data quality checks
    perform_data_quality_checks(reviews_df, stage='Initial')

    # Clean the data
    cleaned_df = clean_data(reviews_df)

    # Check if cleaned data is added to the DataFrame
    check_data_added_to_dataframe(cleaned_df)

    # Perform data quality checks after cleaning
    perform_data_quality_checks(cleaned_df, stage='After Cleaning')

    # Save the cleaned data to CSV
    save_to_csv(cleaned_df)


if __name__ == "__main__":
    main()

Connection successful. Proceeding to fetch data...
Fetching reviews from page 1 with offset: 100.
Fetching reviews from page 2 with offset: 100.
Fetching reviews from page 3 with offset: 100.
Fetching reviews from page 4 with offset: 100.
Fetching reviews from page 5 with offset: 100.
Fetching reviews from page 6 with offset: 100.
Fetching reviews from page 7 with offset: 100.

Data has been successfully added to the DataFrame.
Here are the first 10 rows:
  Rating           Author  Author Location        Date  \
0      1     Bart Johnson  (United States)  2025-01-05   
1      2       H Thornton  (United States)  2025-01-03   
2      3            I Lee  (United States)  2024-12-17   
3      1      Brenda Acan  (United States)  2024-12-16   
4      3  Jillian Saltpaw  (United States)  2024-12-15   
5      1         E Basano  (United States)  2024-12-04   
6      4         M Larsen  (United States)  2024-12-01   
7      1          S Jiang  (United States)  2024-12-01   
8      1        Ga

In [None]:
import pandas as pd
import os

In [28]:
# File name of the CSV to inspect (relative to the current working directory).
# This is the same path that `csv_file` is set to earlier in the notebook.
fileName = 'scraped_reviews2.csv'

In [30]:
# Report whether the CSV file is present on disk
if not os.path.isfile(fileName):
    print(f"'{fileName}' does not exist.")
else:
    print(f"'{fileName}' does exist.")

'scraped_reviews2.csv' does exist.


In [35]:
# Read the CSV file into a DataFrame and display a preview of its first rows.
# NOTE(review): despite its name, `FirstTen` holds the first 20 rows (head(20)).
# NOTE(review): the displayed output contains an 'Unnamed: 0' column, which
# suggests the file on disk was written with its index included — confirm
# whether index_col=0 is wanted here.
OpenFile = pd.read_csv(fileName)
FirstTen = OpenFile.head(20)
# A bare expression on the last line of a cell is rendered as the cell's output.
FirstTen

Unnamed: 0,Rating,Author,Author Location,Date,Heading,Review,Type of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Value for Money,Recommended
0,1.0,Bart Johnson,United States,2025-01-05,Not the best customer service,✅ Trip Verified | Not the best customer serv...,Couple Leisure,Economy Class,Monrovia to Washington via Addas Ababa,2025-01-01,1.0,3.0,1.0,0.0,2.0,2,no
1,2.0,H Thornton,United States,2025-01-03,Worst business class experience,✅ Trip Verified | Worst business class experie...,Business,Business Class,Addis Ababa to Toronto via Rome,2025-01-01,2.0,1.0,1.0,1.0,2.0,3,no
2,3.0,I Lee,United States,2024-12-17,the toilets became dirty,✅ Trip Verified | Towards the end of the fli...,Business,Economy Class,Addis Ababa to Nairobi,2024-12-01,1.0,1.0,3.0,0.0,3.0,3,no
3,1.0,Brenda Acan,United States,2024-12-16,Worst service ever,✅ Trip Verified | Worst service ever. Beside d...,Couple Leisure,Economy Class,Dubai to Zanzibar via Ethiopia,2024-12-01,1.0,1.0,1.0,1.0,1.0,1,no
4,3.0,Jillian Saltpaw,United States,2024-12-15,service fell painfully short,✅ Trip Verified | My recent journey from Was...,Couple Leisure,Economy Class,Washington IAD to Johannesburg,2024-12-01,2.0,2.0,1.0,3.0,1.0,1,no
5,1.0,E Basano,United States,2024-12-04,Ethiopian Airlines needs to do better,✅ Trip Verified | I just landed from an Ethi...,Solo Leisure,Economy Class,Addis Ababa to Washington via Lome,2024-11-01,2.0,2.0,1.0,1.0,2.0,1,no
6,4.0,M Larsen,United States,2024-12-01,I felt like I was back in the 90's,✅ Trip Verified | HKG-BKK-ADD-LAD-ADD-VIE-CPH ...,Solo Leisure,Economy Class,Hong Kong to Copenhagen via Luanda,2024-11-01,2.0,3.0,3.0,2.0,2.0,3,no
7,1.0,S Jiang,United States,2024-12-01,expected much more,✅ Trip Verified | Service was not there at a...,Couple Leisure,Economy Class,Tokyo to Seoul,2024-12-01,2.0,1.0,2.0,0.0,5.0,1,no
8,1.0,Gala Pate,United States,2024-11-29,the most uncomfortable flight,Not Verified | It was the most uncomfortable f...,Family Leisure,Economy Class,Atlanta GA to Lusaka Zambia via Addis Ababa,2023-12-01,1.0,1.0,1.0,1.0,1.0,1,no
9,4.0,R Adamski,United States,2024-11-01,didn’t like the flight or the service,✅ Trip Verified | The staff had a rather avera...,Business,Business Class,Hong Kong to Bangkok,2024-08-01,3.0,1.0,3.0,3.0,3.0,3,no
