# Web scraping British Airways (BA) reviews along with ratings

In [2]:
# Import modules
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from plotly.subplots import make_subplots

%matplotlib inline

In [12]:
# Scrape British Airways reviews from airlinequality.com, one listing page at
# a time, collecting for every review block its text, the per-category
# ratings, the publication date and the overall rating.
#
# The four collections below are turned into separate DataFrames and
# concatenated column-wise *by position*, so every review block must
# contribute exactly one entry to each of them.  A missing element is
# therefore recorded as a placeholder (None, or the all-'N/A' ratings dict)
# rather than skipped; skipping would shift every later row out of alignment
# and pair reviews with the wrong ratings/dates.
reviews_all = []
ratings_all = []
dates_all = []
overall_rating = {
    'Overall_rating': []
}

# Define the categories you're interested in
categories_of_interest = ["Type Of Traveller", "Seat Type", "Route","Seat Comfort", "Cabin Staff Service", "Food & Beverages", "Ground Service", "Wifi & Connectivity", "Value For Money", "Recommended"]

# URL template; the placeholder is filled with the listing page number
url_template = "https://www.airlinequality.com/airline-reviews/british-airways/page/{}/"

# Seconds to wait for a response.  Without a timeout a stalled connection
# would block the loop indefinitely; a timeout raises a RequestException
# subclass, which the handler below already reports.
REQUEST_TIMEOUT = 30

# Create a session so the connection is reused across pages
with requests.Session() as s:
    # Loop through listing pages 1 to 137 (range() excludes its stop value)
    for page in range(1, 138):
        try:
            # Send GET request and create soup object
            response = s.get(url_template.format(page), timeout=REQUEST_TIMEOUT)
            if response.status_code != 200:
                print(f"Error: GET request failed on page {page} with status code {response.status_code}")
                continue
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find all review blocks
            review_blocks = soup.find_all('div', {'class': 'tc_mobile'})
            if not review_blocks:
                print(f"Warning: No review blocks found on page {page}")
                continue

            for block in review_blocks:
                # --- Review text (placeholder None when missing) ---
                review_div = block.find('div', {'class': 'text_content'})
                if review_div is not None:
                    reviews_all.append(review_div.get_text())
                else:
                    print(f"Warning: Couldn't find review text on page {page}")
                    reviews_all.append(None)

                # --- Category ratings ---
                # Start from defaults so a missing table or missing rows
                # still yield a complete record for this review.
                ratings = {category: 'N/A' for category in categories_of_interest}
                ratings_table = block.find('table', {'class': 'review-ratings'})
                if ratings_table is not None:
                    for row in ratings_table.find_all('tr'):
                        # The category is in the header cell of the row
                        category_cell = row.find('td', {'class': 'review-rating-header'})
                        if category_cell is None:
                            print(f"Warning: Couldn't find category header on page {page}")
                            continue
                        category = category_cell.get_text()
                        if category not in categories_of_interest:
                            continue

                        # Star-rated rows contain filled-star spans; any other
                        # row carries a text value instead.
                        if "star fill" in str(row):
                            # The rating is the number of filled stars
                            ratings[category] = len(row.find_all('span', {'class': 'star fill'}))
                        else:
                            value_cell = row.find('td', {'class': 'review-value'})
                            if value_cell is not None:
                                ratings[category] = value_cell.get_text()
                            else:
                                print(f"Warning: Couldn't find review value on page {page} for category {category}")
                else:
                    print(f"Warning: Couldn't find ratings table on page {page}")
                # Always append exactly one ratings record per review block
                ratings_all.append(ratings)

                # --- Date and overall rating live on the enclosing <article> ---
                # find_parent returns None when there is no such ancestor;
                # guard it so one odd block cannot abort the whole scrape.
                parent_html = block.find_parent('article')
                if parent_html is not None:
                    time_element = parent_html.find('time', {'itemprop': 'datePublished'})
                    overall_rating_element = parent_html.find('span', {'itemprop': 'ratingValue'})
                else:
                    time_element = None
                    overall_rating_element = None

                # .get() yields None (instead of raising KeyError) when the
                # <time> tag has no datetime attribute
                date_value = time_element.get('datetime') if time_element is not None else None
                if date_value is None:
                    print(f"Warning: Couldn't find datetime on page {page}")
                dates_all.append(date_value)

                # Extract overall rating (placeholder None when missing)
                if overall_rating_element is not None:
                    overall_rating['Overall_rating'].append(overall_rating_element.get_text())
                else:
                    print(f"Warning: Couldn't find overall rating get text on page {page}")
                    overall_rating['Overall_rating'].append(None)

        except requests.exceptions.RequestException as e:
            # Network-level failure (including timeouts): report and move on
            print(f"An error occurred on page {page}: {str(e)}")

# Create dataframes from the collected reviews, ratings, and dates.  All four
# collections have one entry per review block, so the frames share the same
# positional index.
df_all = pd.DataFrame(reviews_all, columns=["Review"])
dfov = pd.DataFrame(overall_rating)
dfrt = pd.DataFrame(ratings_all)
df_ratings = pd.concat([dfov,dfrt],axis=1)
df_dates = pd.DataFrame(dates_all, columns=["Date"])

In [13]:
# Combine review text, ratings and dates into one frame.  pd.concat with
# axis=1 aligns on the index (here the default positional index), so pieces of
# different length would be silently padded with NaN and rows paired with the
# wrong review -- fail loudly instead of producing a misaligned dataset.
assert len(df_all) == len(df_ratings) == len(df_dates), (
    f"Row count mismatch: reviews={len(df_all)}, "
    f"ratings={len(df_ratings)}, dates={len(df_dates)}"
)
df_merged = pd.concat([df_all, df_ratings, df_dates], axis=1)

In [14]:
# Preview the merged frame; the rendered output shows only the first and last
# rows, with the middle elided.
df_merged

Unnamed: 0,Review,Overall_rating,Type Of Traveller,Seat Type,Route,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Wifi & Connectivity,Value For Money,Recommended,Date
0,✅ Trip Verified | First time using BA busines...,9,Couple Leisure,Business Class,London to New York,5,5,5,5,,4,yes,2024-01-07
1,Not Verified | Extremely rude ground service....,6,Family Leisure,Economy Class,Rome to London,4,5,4,1,,2,no,2024-01-03
2,✅ Trip Verified | My son and I flew to Geneva...,1,Family Leisure,Business Class,Gatwick to Geneva,2,1,1,1,1,1,no,2024-01-02
3,✅ Trip Verified | For the price paid (bought ...,8,Solo Leisure,Business Class,Istanbul to London Heathrow,1,4,5,2,,4,yes,2023-12-29
4,✅ Trip Verified | Flight left on time and arr...,6,Solo Leisure,Economy Class,London Heathrow to Istanbul,3,2,1,5,,2,no,2023-12-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1365,✅ Trip Verified | London to Tel Aviv. I have...,1,Solo Leisure,Business Class,London to Tel Aviv,1,1,,1,,1,no,2018-01-01
1366,✅ Trip Verified | I flew Premium Economy from ...,3,Solo Leisure,Premium Economy,London to Cape Town,1,3,4,2,,1,no,2018-01-01
1367,✅ Trip Verified | Hong Kong to London before ...,9,Family Leisure,First Class,Hong Kong to London,4,5,5,4,,4,yes,2017-12-30
1368,✅ Trip Verified | Dublin to Heathrow. BA gets...,1,Couple Leisure,Business Class,Dublin to Heathrow,1,1,1,1,,1,no,2017-12-28


In [34]:
# List the distinct values of each column between the first column (review
# text) and the last column (date), leaving out 'Route'.
cols = df_merged.columns[1:-1].tolist()
cols.remove('Route')

for col_name in cols:
    distinct_values = df_merged[col_name].unique()
    print(f" Unique elements of {col_name} are : {distinct_values}")

 Unique elements of Overall_rating are : ['9' '6' '1' '8' '2' '5' '3' '10' '4' '7']
 Unique elements of Type Of Traveller are : ['Couple Leisure' 'Family Leisure' 'Solo Leisure' 'Business' 'N/A']
 Unique elements of Seat Type are : ['Business Class' 'Economy Class' 'Premium Economy' 'First Class']
 Unique elements of Seat Comfort are : [5 4 2 1 3 'N/A']
 Unique elements of Cabin Staff Service are : [5 1 4 2 3 'N/A']
 Unique elements of Food & Beverages are : [5 4 1 2 'N/A' 3]
 Unique elements of Ground Service are : [5 1 2 3 4 'N/A']
 Unique elements of Wifi & Connectivity are : ['N/A' 1 2 5 3 4]
 Unique elements of Value For Money are : [4 2 1 3 5]
 Unique elements of Recommended are : ['yes' 'no']


In [39]:
# Summarise column dtypes and non-null counts.  Most rating columns show up
# as object dtype because they mix integers with the 'N/A' placeholder string,
# and Overall_rating is stored as text.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1370 entries, 0 to 1369
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Review               1370 non-null   object
 1   Overall_rating       1370 non-null   object
 2   Type Of Traveller    1370 non-null   object
 3   Seat Type            1370 non-null   object
 4   Route                1370 non-null   object
 5   Seat Comfort         1370 non-null   object
 6   Cabin Staff Service  1370 non-null   object
 7   Food & Beverages     1370 non-null   object
 8   Ground Service       1370 non-null   object
 9   Wifi & Connectivity  1370 non-null   object
 10  Value For Money      1370 non-null   int64 
 11  Recommended          1370 non-null   object
 12  Date                 1370 non-null   object
dtypes: int64(1), object(12)
memory usage: 139.3+ KB


In [36]:
# Persist the raw (uncleaned) scrape.  to_csv does not create missing
# directories, so make sure the target folder exists first.
# NOTE(review): the file name starts with a space (' BA_reviews_uncleaned.csv');
# this looks like a typo but is kept as-is because other notebooks may read
# this exact path -- confirm before renaming.
output_path = Path('Recent_data_collected/ BA_reviews_uncleaned.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_merged.to_csv(output_path,index=False)