## Dependencies

In [1]:
# https://www.kaggle.com/code/notcostheta/skytrax-scraper

In [2]:
!pip install scrapy bs4

Collecting scrapy
  Downloading Scrapy-2.11.2-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting Twisted>=18.9.0 (from scrapy)
  Downloading twisted-24.7.0-py3-none-any.whl.metadata (18 kB)
Collecting cryptography>=36.0.0 (from scrapy)
  Downloading cryptography-43.0.1-cp39-abi3-win_amd64.whl.metadata (5.4 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting pyOpenSSL>=21.0.0 (from scrapy)
  Downloading pyOpenSSL-24.2.1-py3-none-any.whl.metadata (13 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading s

## Find the URL to Reviews

### Function

In [6]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

def get_all_links(url, timeout=30):
    """Download ``url`` and return every hyperlink on the page as an absolute URL.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list with one entry per ``<a>`` tag that carries an ``href``
        attribute, in document order. Relative hrefs are resolved against
        ``url``; duplicates are kept.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently harvesting links from an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    # urljoin leaves absolute hrefs untouched and resolves relative ones.
    return [urljoin(url, anchor['href']) for anchor in soup.find_all('a', href=True)]

### Airport

In [16]:
# A-Z index page that links to the review page of every listed airport.
url = "https://www.airlinequality.com/review-pages/a-z-airport-reviews/"
all_links = get_all_links(url)

# Keep only links that contain the airport-review prefix. Surrounding "/" is
# trimmed so URLs differing only by a trailing slash collapse into one entry.
airport_set = {
    link.strip("/")
    for link in all_links
    if "https://www.airlinequality.com/airport-reviews/" in link
}
len(airport_set)

986

### Airline

In [15]:
# A-Z index page that links to the review page of every listed airline.
url = "https://www.airlinequality.com/review-pages/a-z-airline-reviews/"
all_links = get_all_links(url)

# Keep only links that contain the airline-review prefix. Surrounding "/" is
# trimmed so URLs differing only by a trailing slash collapse into one entry.
airline_set = {
    link.strip("/")
    for link in all_links
    if "https://www.airlinequality.com/airline-reviews/" in link
}
len(airline_set)

568

### Temp Storage

In [17]:
import json

# Persist both URL sets as JSON lists. A set has no stable iteration order,
# so the URLs are sorted first to make the files identical from run to run.
with open("airport_review_url.json", "w", encoding="utf-8") as fp:
    json.dump(sorted(airport_set), fp, indent = 4)

with open("airline_review_url.json", "w", encoding="utf-8") as fp:
    json.dump(sorted(airline_set), fp, indent = 4)

## Crawler

In [3]:
# Root of the airline's review listing; the spider builds its page URLs from it.
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
# Number of listing pages to request (pages 1 through `pages`, inclusive).
pages = 38
# Value sent as the `pagesize` query parameter of every page URL.
page_size = 100

In [4]:
import scrapy
from bs4 import BeautifulSoup

class ReviewsSpider(scrapy.Spider):
    """Scrape the paginated review listing of one airline.

    The listing URLs are built from the module-level ``base_url``, ``pages``
    and ``page_size`` values at the moment the class body is executed. Every
    review found on a page is yielded as a plain ``dict``.
    """

    name = 'reviews'
    start_urls = [f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}" for i in range(1, pages + 1)]

    @staticmethod
    def _text_or_none(tag, strip=False):
        """Return the text of ``tag`` (optionally stripped), or ``None`` if the tag is missing."""
        if tag is None:
            return None
        return tag.text.strip() if strip else tag.text

    def parse(self, response):
        """Yield one dict per review contained in a listing page.

        Each dict has the keys ``datePublished``, ``ratingValue``,
        ``bestRating``, ``header``, ``author``, ``reviewBody``,
        ``recommended`` and ``stats``. A field whose tag is absent from the
        markup is set to ``None`` rather than aborting the whole page.

        Args:
            response: The Scrapy response of one listing page.
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        containers = soup.find_all("article", class_="comp comp_reviews-airline querylist position-content")
        if not containers:
            # Nothing to extract; skip the page instead of failing with IndexError.
            self.logger.warning("No review container found on %s", response.url)
            return

        for article in containers[0].find_all("article", itemprop="review"):
            published = article.find("meta", itemprop="datePublished")
            # Only a cell carrying exactly these classes is matched; when there
            # is none the field stays None.
            recommended = article.find("td", class_="review-value rating-yes")

            review = {
                "datePublished": published.get("content") if published is not None else None,
                "ratingValue": self._text_or_none(article.find("span", itemprop="ratingValue")),
                "bestRating": self._text_or_none(article.find("span", itemprop="bestRating")),
                "header": self._text_or_none(article.find("h2", class_="text_header")),
                "author": self._text_or_none(article.find("span", itemprop="name")),
                "reviewBody": self._text_or_none(article.find("div", itemprop="reviewBody"), strip=True),
                "recommended": self._text_or_none(recommended),
            }

            review_stats = {}
            for row in article.find_all("tr"):
                header = row.find("td", class_="review-rating-header")
                if header is None:
                    continue
                key = header.text.strip()
                value = row.find("td", class_="review-value")
                if value:
                    review_stats[key] = value.text.strip()
                else:
                    # No textual value in this row: count the filled star spans instead.
                    review_stats[key] = len(row.find_all("span", class_="star fill"))
            review["stats"] = review_stats

            yield review

In [None]:
%%capture

if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess
    import os

    # Make sure the output directory exists before the crawl starts.
    os.makedirs("reviews", exist_ok = True)
    process = CrawlerProcess(settings={
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        # FEEDS replaces the deprecated FEED_FORMAT / FEED_URI pair.
        'FEEDS': {
            'reviews/british_airline.json': {
                'format': 'json',
                # Local feed files are appended to by default; a second run
                # would then leave two concatenated JSON arrays (invalid JSON).
                'overwrite': True,
            },
        },
    })

    process.crawl(ReviewsSpider)
    # NOTE(review): the Twisted reactor presumably cannot be started twice in
    # one kernel session - restart the kernel before re-running this cell.
    process.start()

2024-09-17 21:11:32 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
2024-09-17 21:11:32 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.10.14 | packaged by Anaconda, Inc. | (main, May  6 2024, 19:44:50) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.1, Platform Windows-10-10.0.19045-SP0
2024-09-17 21:11:32 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-09-17 21:11:33 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2024-09-17 21:11:33 [scrapy.extensions.telnet] INFO: Telnet Password: 19f07716c190330b
  exporter = cls(crawler)

2024-09-17 21:11:33 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.teln