## Dependencies

In [1]:
# https://www.kaggle.com/code/notcostheta/skytrax-scraper

In [2]:
!pip install scrapy bs4

Collecting scrapy
  Downloading Scrapy-2.11.2-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting Twisted>=18.9.0 (from scrapy)
  Downloading twisted-24.7.0-py3-none-any.whl.metadata (18 kB)
Collecting cryptography>=36.0.0 (from scrapy)
  Downloading cryptography-43.0.1-cp39-abi3-win_amd64.whl.metadata (5.4 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting pyOpenSSL>=21.0.0 (from scrapy)
  Downloading pyOpenSSL-24.2.1-py3-none-any.whl.metadata (13 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading s

## Find the URL to Reviews

### Function

In [6]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

def get_all_links(url, timeout=30):
    """Download ``url`` and return every hyperlink on the page as an absolute URL.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    list of str
        One entry per ``<a>`` tag that has an ``href`` attribute, in document
        order, with relative hrefs resolved against ``url``. Duplicates are
        kept.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection errors or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # Stop on an HTTP error instead of silently scraping an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Round-tripping through urlparse().geturl() yields the parser's
    # normalised spelling of the base URL, which urljoin resolves against.
    base = urlparse(url).geturl()
    return [urljoin(base, anchor['href']) for anchor in soup.find_all('a', href=True)]

### Airport

In [16]:
# Collect the per-airport review pages linked from the A-Z airport index.
url = "https://www.airlinequality.com/review-pages/a-z-airport-reviews/"
all_links = get_all_links(url)

# Keep only links that contain the airport-review prefix. Stripping "/" from
# both ends drops the trailing slash before the links are de-duplicated.
airport_set = {
    link.strip("/")
    for link in all_links
    if "https://www.airlinequality.com/airport-reviews/" in link
}
len(airport_set)

986

### Airline

In [15]:
# Collect the per-airline review pages linked from the A-Z airline index.
url = "https://www.airlinequality.com/review-pages/a-z-airline-reviews/"
all_links = get_all_links(url)

# Keep only links that contain the airline-review prefix. Stripping "/" from
# both ends drops the trailing slash before the links are de-duplicated.
airline_set = {
    link.strip("/")
    for link in all_links
    if "https://www.airlinequality.com/airline-reviews/" in link
}
len(airline_set)

568

### Temp Storage

In [17]:
import json

# Persist both URL collections as JSON arrays.
# A set has no defined iteration order, so the URLs are sorted before dumping;
# this makes the files identical from one run to the next.
# NOTE(review): a later cell reads "supplementary/airport_review_url.json",
# while these files are written to the working directory - confirm how the
# files get there.
for filename, url_set in (
    ("airport_review_url.json", airport_set),
    ("airline_review_url.json", airline_set),
):
    with open(filename, "w") as fp:
        json.dump(sorted(url_set), fp, indent = 4)

## Crawl TD (Airport)

In [4]:
import requests
from bs4 import BeautifulSoup

url = "https://www.bts.gov/topics/airlines-and-airports/world-airport-codes"

# Time out rather than hang forever, and stop on an HTTP error instead of
# parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
# find() returns the first <table> in the document, or None when there is none.
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {url}")

table

<table border="0" cellpadding="0" cellspacing="0" style="width:500px;" width="128"><colgroup><col span="2"/></colgroup><thead><tr height="40"><th height="40" scope="col" style="height: 40px; width: 64px;">Code</th><th scope="col" style="width: 64px;">City: Airport</th></tr></thead><tbody><tr height="19"><td height="19" style="height:19px;">01A</td><td>Afognak Lake, AK: Afognak Lake Airport</td></tr><tr height="19"><td height="19" style="height:19px;">03A</td><td>Granite Mountain, AK: Bear Creek Mining Strip</td></tr><tr height="19"><td height="19" style="height:19px;">04A</td><td>Lik, AK: Lik Mining Camp</td></tr><tr height="19"><td height="19" style="height:19px;">05A</td><td>Little Squaw, AK: Little Squaw Airport</td></tr><tr height="19"><td height="19" style="height:19px;">06A</td><td>Kizhuyak, AK: Kizhuyak Bay</td></tr><tr height="19"><td height="19" style="height:19px;">07A</td><td>Klawock, AK: Klawock Seaplane Base</td></tr><tr height="19"><td height="19" style="height:19px;">08A

In [5]:
# Column titles: the text of every <th> cell in the table.
headers = [th.get_text() for th in table.find_all('th')]
headers

['Code', 'City: Airport']

In [11]:
# One list of cell texts per <tr>. A row without any <td> cell (the header
# row, which holds only <th> cells) contributes an empty list.
rows = [
    [cell.get_text() for cell in tr.find_all('td')]
    for tr in table.find_all('tr')
]
# Skip the first entry (the header row) when displaying.
rows[1:]

[['01A', 'Afognak Lake, AK: Afognak Lake Airport'],
 ['03A', 'Granite Mountain, AK: Bear Creek Mining Strip'],
 ['04A', 'Lik, AK: Lik Mining Camp'],
 ['05A', 'Little Squaw, AK: Little Squaw Airport'],
 ['06A', 'Kizhuyak, AK: Kizhuyak Bay'],
 ['07A', 'Klawock, AK: Klawock Seaplane Base'],
 ['08A', 'Elizabeth Island, AK: Elizabeth Island Airport'],
 ['09A', 'Homer, AK: Augustin Island'],
 ['1B1', 'Hudson, NY: Columbia County'],
 ['1G4', 'Peach Springs, AZ: Grand Canyon West'],
 ['1N7', 'Blairstown, NJ: Blairstown Airport'],
 ['1NY', 'Penn Yan, NY: Penn Yan Airport'],
 ['2NY', 'Port Washington, NY: Sands Point Seaplane Base'],
 ['6B0', 'Middlebury, VT: Middlebury State'],
 ['7AK', 'Akun, AK: Akun Airport'],
 ['8F3', 'Crosbyton, TX: Crosbyton Municipal'],
 ['A01', 'Fairbanks/Ft. Wainwright, AK: Blair Lake'],
 ['A02', 'Deadmans Bay, AK: Deadmans Bay Airport'],
 ['A03', 'Hallo Bay, AK: Hallo Bay Airport'],
 ['A04', 'Red Lake, AK: Red Lake Airport'],
 ['A05', 'Shell Lake, AK: Shell Lake Airpo

In [13]:
import csv

# Write the scraped table to CSV: the header line first, then the data rows.
# rows[0] belongs to the <th>-only header row and is skipped.
# newline='' leaves line-ending handling to the csv module; the explicit
# UTF-8 encoding keeps the file independent of the platform's default
# encoding (pandas.read_csv also assumes UTF-8 by default).
with open('airport_code_name.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(headers)
    csvwriter.writerows(rows[1:])

## Crawl TD (Airline)

In [21]:
import requests
from bs4 import BeautifulSoup
import csv

url = "https://www.bts.gov/topics/airlines-and-airports/airline-codes"

# Time out rather than hang forever, and stop on an HTTP error instead of
# parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
# The carrier list is a <table> nested inside the page's first <table>.
# Each find() may return None, so check before chaining.
outer_table = soup.find('table')
table = outer_table.find('table') if outer_table is not None else None
if table is None:
    raise ValueError(f"No nested <table> element found at {url}")

# In the captured table output the column titles are ordinary <td> cells in
# the first row and there are no <th> cells, so `headers` comes out empty
# and rows[0] is used as the header line below.
headers = [header.text for header in table.find_all('th')]

rows = [
    [cell.text for cell in tr.find_all('td')]
    for tr in table.find_all('tr')
]
print(rows[0])
print(rows[1:])


# Explicit UTF-8 keeps the file independent of the platform default encoding.
with open('airline_code_name.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(rows[0])
    csvwriter.writerows(rows[1:])


['CARRIER', 'CARRIERNAME']
[['02Q', 'Titan Airways'], ['04Q', 'Tradewind Aviation'], ['06Q', 'Master Top Linhas Aereas Ltd.'], ['07Q', 'Flair Airlines Ltd.'], ['09Q', 'Swift Air, LLC d/b/a Eastern Air Lines d/b/a Eastern'], ['0BQ', 'DCA'], ['0CQ', 'ACM AIR CHARTER GmbH'], ['0FQ', 'Maine Aviation Aircraft Charter, LLC'], ['0HQ', 'Polar Airlines de Mexico d/b/a Nova Air'], ['0J', 'JetClub AG'], ['0LQ', 'Metropix UK, LLP.'], ['0Q', 'Flying Service N.V.'], ['0QQ', 'TAG Aviation (UK) Ltd.'], ['0RQ', 'TAG Aviation Espana S.L.'], ['0UQ', 'Comlux Malta, Ltd.'], ['0VQ', 'Ocean Sky (UK) Limited'], ['0YQ', 'Comlux Malta Ltd.'], ['10Q', 'Swiss Air Ambulance'], ['12Q', 'Unijet'], ['13Q', 'Chartright Air Inc.'], ['14Q', 'London Air Services Limited'], ['15Q', 'Air Alsie A/S'], ['17Q', 'Albinati Aeronautics SA'], ['1AQ', 'Via Airlines d/b/a Sterling Airways'], ['1BQ', 'Eastern Airlines f/k/a Dynamic Airways, LLC'], ['1EQ', 'KaiserAir, Inc.'], ['1HQ', 'International Jet Management GmbH'], ['1IQ', 'Jet

"\nwith open('airline_code_name.csv', 'w', newline='') as csvfile:\n    csvwriter = csv.writer(csvfile)\n    csvwriter.writerow(headers)\n    csvwriter.writerows(rows[1:])\n"

In [17]:
# Display the parsed table's HTML to inspect its structure.
table

<table>
<tbody>
<tr>
<td><span><span><span><span><span><span>CARRIER</span></span></span></span></span></span></td>
<td><span><span><span><span><span><span>CARRIERNAME</span></span></span></span></span></span></td>
</tr>
<tr>
<td><span><span><span><span><span><span>02Q</span></span></span></span></span></span></td>
<td><span><span><span><span><span><span>Titan Airways</span></span></span></span></span></span></td>
</tr>
<tr>
<td><span><span><span><span><span><span>04Q</span></span></span></span></span></span></td>
<td><span><span><span><span><span><span>Tradewind Aviation</span></span></span></span></span></span></td>
</tr>
<tr>
<td><span><span><span><span><span><span>06Q</span></span></span></span></span></span></td>
<td><span><span><span><span><span><span>Master Top Linhas Aereas Ltd.</span></span></span></span></span></span></td>
</tr>
<tr>
<td><span><span><span><span><span><span>07Q</span></span></span></span></span></span></td>
<td><span><span><span><span><span><span>Flair Airline

## Crawler

In [3]:
# Crawl target: the review listing to scrape and how it is paginated.
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages, page_size = 38, 100  # listing pages to request / value of the pagesize query parameter

In [4]:
import scrapy
from bs4 import BeautifulSoup

class ReviewsSpider(scrapy.Spider):
    """Scrape reviews from the paginated listing pages of one airline.

    ``start_urls`` is built once, when the class body is executed, from the
    module-level ``base_url``, ``pages`` and ``page_size``; changing those
    afterwards requires re-running this cell. Every downloaded page is parsed
    with BeautifulSoup and one plain dict is yielded per review.
    """

    name = 'reviews'
    start_urls = [f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}" for i in range(1, pages + 1)]

    @staticmethod
    def _text(node, strip=False):
        """Return ``node.text`` (stripped if requested), or None when ``node`` is None."""
        if node is None:
            return None
        return node.text.strip() if strip else node.text

    def parse(self, response):
        """Yield one dict per review found in ``response``.

        Each dict has the keys ``datePublished``, ``ratingValue``,
        ``bestRating``, ``header``, ``author``, ``reviewBody``,
        ``recommended`` and ``stats``. A field whose element is missing from
        the page is set to None rather than aborting the whole page. ``stats``
        maps each rating-row label to its text value or, when the row has no
        text value cell, to the number of filled stars.
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        container = soup.find("article", class_="comp comp_reviews-airline querylist position-content")
        if container is None:
            # e.g. a page number past the last page: nothing to extract.
            self.logger.warning("No review container found on %s", response.url)
            return

        for article in container.find_all("article", itemprop="review"):
            date_meta = article.find("meta", itemprop="datePublished")

            review = {
                "datePublished": date_meta.get("content") if date_meta is not None else None,
                "ratingValue": self._text(article.find("span", itemprop="ratingValue")),
                "bestRating": self._text(article.find("span", itemprop="bestRating")),
                "header": self._text(article.find("h2", class_="text_header")),
                "author": self._text(article.find("span", itemprop="name")),
                "reviewBody": self._text(article.find("div", itemprop="reviewBody"), strip=True),
                # NOTE(review): only a cell classed "review-value rating-yes"
                # is matched, so any other recommendation cell leaves this
                # field as None - confirm that this is intended.
                "recommended": self._text(article.find("td", class_="review-value rating-yes")),
            }

            review_stats = {}
            for row in article.find_all("tr"):
                label_cell = row.find("td", class_="review-rating-header")
                if label_cell is None:
                    continue
                key = label_cell.text.strip()
                value_cell = row.find("td", class_="review-value")
                if value_cell:
                    review_stats[key] = value_cell.text.strip()
                else:
                    # No text value cell: the rating is the count of filled stars.
                    review_stats[key] = len(row.find_all("span", class_="star fill"))
            review["stats"] = review_stats

            yield review

In [None]:
%%capture

if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess
    import os

    # The feed exporter writes into this directory, so make sure it exists.
    os.makedirs("reviews", exist_ok = True)
    process = CrawlerProcess(settings={
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        # FEEDS replaces the deprecated FEED_URI / FEED_FORMAT settings.
        # overwrite=True replaces an existing file; the local-file default is
        # to append, which would leave two JSON arrays in one file on a re-run.
        'FEEDS': {
            'reviews/british_airline.json': {'format': 'json', 'overwrite': True},
        },
    })

    process.crawl(ReviewsSpider)
    # Blocks until the crawl has finished.
    # NOTE(review): the Twisted reactor cannot be restarted, so running this
    # cell a second time in the same kernel is expected to fail - restart the
    # kernel first.
    process.start()

2024-09-17 21:11:32 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
2024-09-17 21:11:32 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.10.14 | packaged by Anaconda, Inc. | (main, May  6 2024, 19:44:50) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.1, Platform Windows-10-10.0.19045-SP0
2024-09-17 21:11:32 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-09-17 21:11:33 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2024-09-17 21:11:33 [scrapy.extensions.telnet] INFO: Telnet Password: 19f07716c190330b
  exporter = cls(crawler)

2024-09-17 21:11:33 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.teln

## Formatting URL

In [5]:
import json

# Load the stored list of airport review-page URLs and display it.
with open("supplementary/airport_review_url.json", "r") as fp:
    airport_review_links = json.loads(fp.read())
airport_review_links

['https://www.airlinequality.com/airport-reviews/lucknow-airport',
 'https://www.airlinequality.com/airport-reviews/maastricht-aachen-airport',
 'https://www.airlinequality.com/airport-reviews/tokyo-narita-airport',
 'https://www.airlinequality.com/airport-reviews/kualanamu-airport',
 'https://www.airlinequality.com/airport-reviews/islamabad-airport',
 'https://www.airlinequality.com/airport-reviews/mataveri-intl-airport',
 'https://www.airlinequality.com/airport-reviews/yangon-airport',
 'https://www.airlinequality.com/airport-reviews/savannah-airport',
 'https://www.airlinequality.com/airport-reviews/zakynthos-airport',
 'https://www.airlinequality.com/airport-reviews/victoria-intl-airport',
 'https://www.airlinequality.com/airport-reviews/eindhoven-airport',
 'https://www.airlinequality.com/airport-reviews/brasilia-airport',
 'https://www.airlinequality.com/airport-reviews/punta-cana-airport',
 'https://www.airlinequality.com/airport-reviews/zhengzhou-airport',
 'https://www.airline

In [7]:
import pandas as pd

# One row per review-page URL, in a single column named "url".
airport_df = pd.DataFrame(airport_review_links, columns=["url"])

In [28]:
def _slug_to_name(url):
    """Turn the last path segment of ``url`` into a readable name.

    Hyphens become spaces and the result is passed through ``str.capitalize``
    (first character upper-cased, the rest lower-cased), e.g.
    ".../hong-kong-airport" -> "Hong kong airport".
    """
    slug = url.rsplit("/", 1)[-1]
    return slug.replace("-", " ").capitalize()


# Derive a human-readable airport name from each review URL.
airport_df['name'] = airport_df["url"].map(_slug_to_name)
airport_df

Unnamed: 0,url,name
0,https://www.airlinequality.com/airport-reviews...,Lucknow airport
1,https://www.airlinequality.com/airport-reviews...,Maastricht aachen airport
2,https://www.airlinequality.com/airport-reviews...,Tokyo narita airport
3,https://www.airlinequality.com/airport-reviews...,Kualanamu airport
4,https://www.airlinequality.com/airport-reviews...,Islamabad airport
...,...,...
981,https://www.airlinequality.com/airport-reviews...,Akita airport
982,https://www.airlinequality.com/airport-reviews...,Myrtle beach airport
983,https://www.airlinequality.com/airport-reviews...,Grenoble airport
984,https://www.airlinequality.com/airport-reviews...,Baltimore washington airport


In [29]:
# Airport code -> "City: Airport" lookup table scraped earlier in this notebook.
airport_code_df = pd.read_csv(filepath_or_buffer="supplementary/airport_code_name.csv")
airport_code_df

Unnamed: 0,Code,City: Airport
0,01A,"Afognak Lake, AK: Afognak Lake Airport"
1,03A,"Granite Mountain, AK: Bear Creek Mining Strip"
2,04A,"Lik, AK: Lik Mining Camp"
3,05A,"Little Squaw, AK: Little Squaw Airport"
4,06A,"Kizhuyak, AK: Kizhuyak Bay"
...,...,...
6505,ZXZ,"Waterville, WA: Waterville Airport"
6506,ZYL,"Sylhet, Bangladesh: Savannakhet"
6507,ZZU,"Mzuzu, Malawi: Mzuzu Airport"
6508,ZZV,"Zanesville, OH: Zanesville Municipal"


In [34]:
# On-time reporting data; the ORIGIN column holds the origin airport code.
main_df = pd.read_csv(filepath_or_buffer="data/T_ONTIME_REPORTING.csv")
main_df

Unnamed: 0,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,...,DEST,DEST_CITY_NAME,DEP_TIME,DEP_DELAY,TAXI_OUT,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED
0,1/1/2024 12:00:00 AM,9E,4814,12478,1247805,31703,JFK,"New York, NY",11433,1143302,...,DTW,"Detroit, MI",1247.0,-5.0,31.0,1449.0,-19.0,0.0,,0.0
1,1/1/2024 12:00:00 AM,9E,4815,13487,1348702,31650,MSP,"Minneapolis, MN",11042,1104205,...,CLE,"Cleveland, OH",1001.0,-14.0,20.0,1255.0,-30.0,0.0,,0.0
2,1/1/2024 12:00:00 AM,9E,4817,12478,1247805,31703,JFK,"New York, NY",14524,1452401,...,RIC,"Richmond, VA",1411.0,-4.0,21.0,1541.0,-20.0,0.0,,0.0
3,1/1/2024 12:00:00 AM,9E,4817,14524,1452401,34524,RIC,"Richmond, VA",12478,1247805,...,JFK,"New York, NY",1643.0,-7.0,13.0,1759.0,-42.0,0.0,,0.0
4,1/1/2024 12:00:00 AM,9E,4818,11433,1143302,31295,DTW,"Detroit, MI",13342,1334207,...,MKE,"Milwaukee, WI",1010.0,-5.0,21.0,1020.0,-14.0,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547266,1/31/2024 12:00:00 AM,YX,5843,12953,1295304,31703,LGA,"New York, NY",14492,1449202,...,RDU,"Raleigh/Durham, NC",1201.0,51.0,29.0,1347.0,38.0,0.0,,0.0
547267,1/31/2024 12:00:00 AM,YX,5844,12953,1295304,31703,LGA,"New York, NY",11278,1127805,...,DCA,"Washington, DC",2016.0,-14.0,16.0,2128.0,-32.0,0.0,,0.0
547268,1/31/2024 12:00:00 AM,YX,5845,10821,1082106,30852,BWI,"Baltimore, MD",12478,1247805,...,JFK,"New York, NY",1719.0,3.0,11.0,1827.0,-18.0,0.0,,0.0
547269,1/31/2024 12:00:00 AM,YX,5845,12478,1247805,31703,JFK,"New York, NY",10821,1082106,...,BWI,"Baltimore, MD",1552.0,31.0,15.0,1653.0,19.0,0.0,,0.0


In [35]:
# All Origin Found
# Attach the "City: Airport" label to every flight by matching ORIGIN against
# the code table. The inner join keeps only flights whose origin code is
# present in airport_code_df.
# Columns contributed by an earlier run of this cell are dropped first, so the
# cell can be re-run without producing suffixed duplicates (Code_x, Code_y, ...).
main_df = (
    main_df
    .drop(columns=list(airport_code_df.columns), errors="ignore")
    .merge(airport_code_df, how='inner', left_on='ORIGIN', right_on='Code')
)
main_df

Unnamed: 0,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,...,DEP_TIME,DEP_DELAY,TAXI_OUT,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,Code,City: Airport
0,1/1/2024 12:00:00 AM,9E,4814,12478,1247805,31703,JFK,"New York, NY",11433,1143302,...,1247.0,-5.0,31.0,1449.0,-19.0,0.0,,0.0,JFK,"New York, NY: John F. Kennedy International"
1,1/1/2024 12:00:00 AM,9E,4817,12478,1247805,31703,JFK,"New York, NY",14524,1452401,...,1411.0,-4.0,21.0,1541.0,-20.0,0.0,,0.0,JFK,"New York, NY: John F. Kennedy International"
2,1/1/2024 12:00:00 AM,9E,4828,12478,1247805,31703,JFK,"New York, NY",12397,1239703,...,1054.0,-10.0,13.0,1151.0,-24.0,0.0,,0.0,JFK,"New York, NY: John F. Kennedy International"
3,1/1/2024 12:00:00 AM,9E,4900,12478,1247805,31703,JFK,"New York, NY",14685,1468502,...,835.0,-5.0,24.0,1056.0,-28.0,0.0,,0.0,JFK,"New York, NY: John F. Kennedy International"
4,1/1/2024 12:00:00 AM,9E,4904,12478,1247805,31703,JFK,"New York, NY",13198,1319801,...,1745.0,-5.0,30.0,1945.0,-44.0,0.0,,0.0,JFK,"New York, NY: John F. Kennedy International"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547266,1/14/2024 12:00:00 AM,MQ,4002,13076,1307605,33076,LSE,"La Crosse, WI",13930,1393008,...,1008.0,28.0,5.0,1109.0,0.0,0.0,,0.0,LSE,"La Crosse, WI: La Crosse Regional"
547267,1/20/2024 12:00:00 AM,MQ,4002,13076,1307605,33076,LSE,"La Crosse, WI",13930,1393008,...,929.0,113.0,12.0,1044.0,99.0,0.0,,0.0,LSE,"La Crosse, WI: La Crosse Regional"
547268,1/21/2024 12:00:00 AM,MQ,4002,13076,1307605,33076,LSE,"La Crosse, WI",13930,1393008,...,737.0,1.0,26.0,906.0,1.0,0.0,,0.0,LSE,"La Crosse, WI: La Crosse Regional"
547269,1/27/2024 12:00:00 AM,MQ,4002,13076,1307605,33076,LSE,"La Crosse, WI",13930,1393008,...,805.0,29.0,5.0,916.0,11.0,0.0,,0.0,LSE,"La Crosse, WI: La Crosse Regional"


In [40]:
# Distinct "City: Airport" labels, in order of first appearance, as a
# one-column frame named "name".
unique_airport_name_df = pd.Series(
    main_df["City: Airport"].unique(), name="name"
).to_frame()
unique_airport_name_df

Unnamed: 0,name
0,"New York, NY: John F. Kennedy International"
1,"Minneapolis, MN: Minneapolis-St Paul Internati..."
2,"Richmond, VA: Richmond International"
3,"Detroit, MI: Detroit Metro Wayne County"
4,"Jacksonville, FL: Jacksonville International"
...,...
329,"Ashland, WV: Tri-State/Milton J. Ferguson Field"
330,"Hagerstown, MD: Hagerstown Regional-Richard A...."
331,"St. Cloud, MN: St. Cloud Regional"
332,"Adak Island, AK: Adak"


In [41]:
from fuzzywuzzy import process

def get_url_from_name(name, df, min_score=0):
    """Fuzzy-match ``name`` against ``df['name']`` and return the best hit.

    Parameters
    ----------
    name : str
        Text to look up.
    df : pandas.DataFrame
        Candidate table with a ``name`` column to match on and a ``url``
        column to return.
    min_score : int, optional
        Lowest fuzzywuzzy score that still counts as a match. The default
        of 0 accepts whatever candidate scores best, however poor.

    Returns
    -------
    tuple
        ``(matched_name, url)`` of the best-scoring candidate. When several
        rows share the matched name, the url of the first one is returned.
        ``(None, None)`` when ``df`` has no candidates or the best score is
        below ``min_score``.
    """
    # extractOne returns the single best candidate, or None when nothing
    # reaches score_cutoff (including when there are no candidates at all).
    best = process.extractOne(name, df['name'], score_cutoff=min_score)
    if best is None:
        return None, None

    match = best[0]
    url = df.loc[df['name'] == match, 'url'].iloc[0]
    return match, url

get_url_from_name("Hong Kong", airport_df)

('Hong kong airport',
 'https://www.airlinequality.com/airport-reviews/hong-kong-airport')

In [44]:
from tqdm import tqdm

# Fuzzy-match every airport label against the review-site names. Only the part
# of the label before the first ":" is used as the query.
matches = [
    get_url_from_name(full_name.split(":")[0], airport_df)
    for full_name in tqdm(unique_airport_name_df.name)
]
close_names = [matched_name for matched_name, _ in matches]
urls = [matched_url for _, matched_url in matches]

# Store the results next to the original labels.
unique_airport_name_df["matched_name"] = close_names
unique_airport_name_df["url"] = urls


100%|████████████████████████████████████████████████████████████████████████████████| 334/334 [00:16<00:00, 20.05it/s]


In [45]:
# Show each airport label next to its fuzzy-matched name and review URL.
unique_airport_name_df

Unnamed: 0,name,matched_name,url
0,"New York, NY: John F. Kennedy International",New york jfk airport,https://www.airlinequality.com/airport-reviews...
1,"Minneapolis, MN: Minneapolis-St Paul Internati...",Minneapolis st paul airport,https://www.airlinequality.com/airport-reviews...
2,"Richmond, VA: Richmond International",Richmond airport,https://www.airlinequality.com/airport-reviews...
3,"Detroit, MI: Detroit Metro Wayne County",Detroit airport,https://www.airlinequality.com/airport-reviews...
4,"Jacksonville, FL: Jacksonville International",Jacksonville airport,https://www.airlinequality.com/airport-reviews...
...,...,...,...
329,"Ashland, WV: Tri-State/Milton J. Ferguson Field",Christmas island airport,https://www.airlinequality.com/airport-reviews...
330,"Hagerstown, MD: Hagerstown Regional-Richard A....",Chicago midway airport,https://www.airlinequality.com/airport-reviews...
331,"St. Cloud, MN: St. Cloud Regional",Minneapolis st paul airport,https://www.airlinequality.com/airport-reviews...
332,"Adak Island, AK: Adak",Hamilton island airport,https://www.airlinequality.com/airport-reviews...


In [46]:
# Look at the first 20 rows to spot-check the quality of the matches.
unique_airport_name_df[:20]

Unnamed: 0,name,matched_name,url
0,"New York, NY: John F. Kennedy International",New york jfk airport,https://www.airlinequality.com/airport-reviews...
1,"Minneapolis, MN: Minneapolis-St Paul Internati...",Minneapolis st paul airport,https://www.airlinequality.com/airport-reviews...
2,"Richmond, VA: Richmond International",Richmond airport,https://www.airlinequality.com/airport-reviews...
3,"Detroit, MI: Detroit Metro Wayne County",Detroit airport,https://www.airlinequality.com/airport-reviews...
4,"Jacksonville, FL: Jacksonville International",Jacksonville airport,https://www.airlinequality.com/airport-reviews...
5,"New York, NY: LaGuardia",New york jfk airport,https://www.airlinequality.com/airport-reviews...
6,"Charleston, SC: Charleston AFB/International",Bucharest otopeni airport,https://www.airlinequality.com/airport-reviews...
7,"Ithaca/Cortland, NY: Ithaca Tompkins Regional",Larnaca airport,https://www.airlinequality.com/airport-reviews...
8,"Cleveland, OH: Cleveland-Hopkins International",Cleveland airport,https://www.airlinequality.com/airport-reviews...
9,"Grand Rapids, MI: Gerald R. Ford International",Grand rapids airport,https://www.airlinequality.com/airport-reviews...
