## 1. Dependecy

In [31]:
# Crawler Structure: https://www.kaggle.com/code/notcostheta/skytrax-scraper

## 2. Find the URL to Reviews

### 2.1. Function

In [32]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

def get_all_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)

    extracted_links = [(urljoin(url, link['href']), link.get_text()) for link in links]
    
    return extracted_links

### 2.2. Airport

In [33]:
url = "https://www.airlinequality.com/review-pages/a-z-airport-reviews/"
extracted_links = get_all_links(url)
airports = [[l.strip("/"), t.strip(" ")] for l, t in extracted_links if "https://www.airlinequality.com/airport-reviews/" in l]
airports[:5]

[['https://www.airlinequality.com/airport-reviews/aalborg-airport', 'Aalborg'],
 ['https://www.airlinequality.com/airport-reviews/aarhus-airport', 'Aarhus'],
 ['https://www.airlinequality.com/airport-reviews/abbotsford-intl-airport',
  'Abbotsford Intl'],
 ['https://www.airlinequality.com/airport-reviews/aberdeen-airport',
  'Aberdeen'],
 ['https://www.airlinequality.com/airport-reviews/abidjan-airport', 'Abidjan']]

### 2.3. Airline

In [34]:
url = "https://www.airlinequality.com/review-pages/a-z-airline-reviews/"
extracted_links = get_all_links(url)
airlines = [[l.strip("/"), t.strip(" ")] for l, t in extracted_links if "https://www.airlinequality.com/airline-reviews/" in l]
airlines[:5]

[['https://www.airlinequality.com/airline-reviews/ab-aviation', 'AB Aviation'],
 ['https://www.airlinequality.com/airline-reviews/adria-airways',
  'Adria Airways'],
 ['https://www.airlinequality.com/airline-reviews/aegean-airlines',
  'Aegean Airlines'],
 ['https://www.airlinequality.com/airline-reviews/aer-lingus', 'Aer Lingus'],
 ['https://www.airlinequality.com/airline-reviews/aero-vip', 'Aero VIP']]

### 2.4. Temp Storage

In [36]:
import json
import pandas as pd

df = pd.DataFrame(airports, columns=['URL', 'NICKNAME'])
df.to_csv("../supplementary/airport_review_url.csv", index = False)

df = pd.DataFrame(airlines, columns=['URL', 'NICKNAME'])
df.to_csv("../supplementary/airline_review_url.csv", index = False)

## 3. Crawl TD (Airport Code)

In [37]:
import requests
from bs4 import BeautifulSoup

url = "https://www.bts.gov/topics/airlines-and-airports/world-airport-codes"

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')

In [38]:
headers = [header.text for header in table.find_all('th')]
headers

['Code', 'City: Airport']

In [39]:
rows = []
for row in table.find_all('tr'):
    rows.append([data.text for data in row.find_all('td')])
rows[:5]

[[],
 ['01A', 'Afognak Lake, AK: Afognak Lake Airport'],
 ['03A', 'Granite Mountain, AK: Bear Creek Mining Strip'],
 ['04A', 'Lik, AK: Lik Mining Camp'],
 ['05A', 'Little Squaw, AK: Little Squaw Airport']]

In [40]:
import csv

with open('../supplementary/airport_code_name.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(headers)
    csvwriter.writerows(rows[1:]) 

## 4. Crawl TD (Airline Code)

In [41]:
import requests
from bs4 import BeautifulSoup
import csv

url = "https://www.bts.gov/topics/airlines-and-airports/airline-codes"

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table').find('table')

headers = [header.text for header in table.find_all('th')]

rows = []
for row in table.find_all('tr'):
    rows.append([data.text for data in row.find_all('td')])
print(rows[:5])


with open('airline_code_name.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(rows[0])
    csvwriter.writerows(rows[1:])


[['CARRIER', 'CARRIERNAME'], ['02Q', 'Titan Airways'], ['04Q', 'Tradewind Aviation'], ['06Q', 'Master Top Linhas Aereas Ltd.'], ['07Q', 'Flair Airlines Ltd.']]


## 5. Formating URL

In [237]:
import pandas as pd

airport_url_df = pd.read_csv("../supplementary/airline_review_url.csv")
airport_url_df.loc[0, "URL"]

'https://www.airlinequality.com/airline-reviews/ab-aviation'

In [218]:
airport_actual_name = pd.read_csv("../supplementary/airport_actual_names.csv")
airport_actual_name[780:800]

Unnamed: 0,NICKNAME,ACTUAL_NAME
780,Chania,Chania International Airport
781,Charlotte,Charlotte Douglas International Airport
782,Chattanooga Metropolitan,Chattanooga Metropolitan Airport
783,Chengdu,Chengdu Shuangliu International Airport
784,Chennai,Chennai International Airport
785,Chiang Mai,Chiang Mai International Airport
786,Chiang Rai,Chiang Rai International Airport
787,Chicago Midway,Chicago Midway International Airport
788,Chicago O'Hare,Chicago O'Hare International Airport
789,Chisinau,Chișinău International Airport


In [219]:
airport_actual_name_url = pd.merge(airport_url_df, airport_actual_name, left_on='NICKNAME', right_on='NICKNAME', how='inner')
airport_actual_name_url[airport_actual_name_url.NICKNAME == "Seattle"]

Unnamed: 0,URL,NICKNAME,ACTUAL_NAME
806,https://www.airlinequality.com/airport-reviews...,Seattle,Seattle/Tacoma International Airport


In [240]:
airport_code_df = pd.read_csv("../supplementary/airline_code_name.csv", encoding='latin-1')
airport_code_df

Unnamed: 0,CARRIER,CARRIERNAME
0,02Q,Titan Airways
1,04Q,Tradewind Aviation
2,06Q,Master Top Linhas Aereas Ltd.
3,07Q,Flair Airlines Ltd.
4,09Q,"Swift Air, LLC d/b/a Eastern Air Lines d/b/a E..."
...,...,...
476,ZG,ZIPAIR Tokyo Inc.
477,ZK,Great Lakes Airlines
478,ZPQ,"Silk Way Airlines, LLC"
479,ZW,Air Wisconsin Airlines Corp


In [241]:
import pandas as pd
from tqdm import tqdm

chucks = pd.read_csv("../data/Total_Data_10Y.csv", chunksize = 100000, low_memory = False)

df_list = list()
for i, chuck in tqdm(enumerate(chucks)):
    df_list.append(chuck.sample(500))

df = pd.concat(df_list)
df

624it [02:11,  4.75it/s]


Unnamed: 0,FL_DATE,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,...,TAXI_IN,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
70183,2014-07-05,MQ,N671MQ,1393003,30977,ORD,"Chicago, IL",1100303,31003,CID,...,4.0,2236.0,-9.0,0.0,,,,,,
14841,2014-07-01,WN,N638SW,1320402,31454,MCO,"Orlando, FL",1334205,33342,MKE,...,5.0,1714.0,-1.0,0.0,,,,,,
98696,2014-07-07,DL,N919AT,1468502,34685,SAV,"Savannah, GA",1039705,30397,ATL,...,5.0,623.0,-12.0,0.0,,,,,,
60600,2014-07-04,US,N906AW,1410702,30466,PHX,"Phoenix, AZ",1105703,31057,CLT,...,6.0,1534.0,-4.0,0.0,,,,,,
73027,2014-07-05,UA,N76503,1402702,34027,PBI,"West Palm Beach/Palm Beach, FL",1161802,31703,EWR,...,16.0,2013.0,-3.0,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62375255,2018-12-31,AA,N966NN,1129806,30194,DFW,"Dallas/Fort Worth, TX",1348702,31650,MSP,...,3.0,1409.0,-23.0,0.0,,,,,,
62304215,2018-12-27,OO,N782SK,1226603,31453,IAH,"Houston, TX",1037205,30372,ASE,...,3.0,1657.0,36.0,0.0,,0.0,0.0,0.0,0.0,36.0
62374340,2018-12-31,AA,N815AW,1319801,33198,MCI,"Kansas City, MO",1129806,30194,DFW,...,15.0,1119.0,-6.0,0.0,,,,,,
62339960,2018-12-29,DL,N982DL,1052906,30529,BDL,"Hartford, CT",1039707,30397,ATL,...,11.0,2144.0,4.0,0.0,,,,,,


In [242]:
grouped_df = df.groupby('OP_UNIQUE_CARRIER').size().reset_index(name='count')
airport_code_df = pd.merge(grouped_df, airport_code_df, left_on='OP_UNIQUE_CARRIER', right_on='CARRIER', how='inner')
airport_code_df

Unnamed: 0,OP_UNIQUE_CARRIER,count,CARRIER,CARRIERNAME
0,9E,7558,9E,Endeavor Air Inc.
1,AA,41637,AA,American Airlines Inc.
2,AS,9965,AS,Alaska Airlines Inc.
3,B6,12805,B6,JetBlue Airways
4,DL,43959,DL,Delta Air Lines Inc.
5,EV,10680,EV,ExpressJet Airlines LLC d/b/a aha!
6,F9,6270,F9,Frontier Airlines Inc.
7,G4,3477,G4,Allegiant Air
8,HA,3679,HA,Hawaiian Airlines Inc.
9,MQ,10950,MQ,Envoy Air


In [252]:
airport_code_df.to_csv("temp.csv", index = False)

In [223]:
exclude = "TTN, STS, YUM, OAJ, HDN, MBS, BRO, AVP, BQN, EGE, GTF, BFL, FAY, GRK, KTN, SGU, LAN, DLH, DRO, AZA, ACY, BGR, CRW, EVV, BIL, HRL, PIA, MFE, MLI, ABE, AGS, GNV, GJT, CHO, TVC, FNT, MSO, LFT, JNU, SBP, MKE, BUR, CHS, BOI, SDF, ANC, PVD, GEG, TYS, KOA, LIH, LIT, FAT, SRQ, DAY, CID, MAF, AVL, JAN, VPS, HSV, SBA, LBB, FSD, ECP, EUG, BTR, CHA, ILM, AMA, RDM"
exclude = set(exclude.split(", "))
len(exclude)

71

In [243]:
airport_code_df.sort_values("count", ascending = False, inplace = True)
airport_code_df = airport_code_df[:200]#[~airport_code_df.ORIGIN.isin(exclude)]
airport_code_df.reset_index(inplace = True)
airport_code_df

Unnamed: 0,index,OP_UNIQUE_CARRIER,count,CARRIER,CARRIERNAME
0,15,WN,63426,WN,Southwest Airlines Co.
1,4,DL,43959,DL,Delta Air Lines Inc.
2,1,AA,41637,AA,American Airlines Inc.
3,12,OO,34846,OO,SkyWest Airlines Inc.
4,14,UA,28030,UA,United Air Lines Inc.
5,3,B6,12805,B6,JetBlue Airways
6,9,MQ,10950,MQ,Envoy Air
7,5,EV,10680,EV,ExpressJet Airlines LLC d/b/a aha!
8,2,AS,9965,AS,Alaska Airlines Inc.
9,17,YX,9672,YX,Republic Airline


In [225]:
def _get_base_name(name):
    new_name = name.split(":")[-1]
    if "Airport" not in new_name:
        new_name += " Airport"
    return new_name
airport_code_df["name"] = airport_code_df["City: Airport"].apply(_get_base_name)
airport_code_df

Unnamed: 0,index,ORIGIN,count,Code,City: Airport,name
0,23,ATL,17505,ATL,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern...",Hartsfield-Jackson Atlanta International Airport
1,266,ORD,13757,ORD,"Chicago, IL: Chicago O'Hare International",Chicago O'Hare International Airport
2,103,DFW,13012,DFW,"Dallas/Fort Worth, TX: Dallas/Fort Worth Inter...",Dallas/Fort Worth International Airport
3,102,DEN,12229,DEN,"Denver, CO: Denver International",Denver International Airport
4,206,LAX,9856,LAX,"Los Angeles, CA: Los Angeles International",Los Angeles International Airport
...,...,...,...,...,...,...
124,11,AEX,120,AEX,"Alexandria, LA: Alexandria International",Alexandria International Airport
125,46,BMI,118,BMI,"Bloomington/Normal, IL: Central Illinois Regional",Central Illinois Regional Airport
126,95,DAB,116,DAB,"Daytona Beach, FL: Daytona Beach International",Daytona Beach International Airport
127,312,ROA,116,ROA,"Roanoke, VA: Roanoke Blacksburg Regional Woodr...",Roanoke Blacksburg Regional Woodrum Field Air...


In [17]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [245]:
airport_url_df

Unnamed: 0,URL,NICKNAME
0,https://www.airlinequality.com/airline-reviews...,AB Aviation
1,https://www.airlinequality.com/airline-reviews...,Adria Airways
2,https://www.airlinequality.com/airline-reviews...,Aegean Airlines
3,https://www.airlinequality.com/airline-reviews...,Aer Lingus
4,https://www.airlinequality.com/airline-reviews...,Aero VIP
...,...,...
570,https://www.airlinequality.com/airline-reviews...,Read more
571,https://www.airlinequality.com/airline-reviews...,ITA Airways231 Reviews
572,https://www.airlinequality.com/airline-reviews...,Read more
573,https://www.airlinequality.com/airline-reviews...,TAP Portugal1548 Reviews


In [248]:
from fuzzywuzzy import process
import re
split_chars = " |,|-|/|'"


def get_url_from_name(name, df):
    match = process.extract(name, df['NICKNAME'], limit=1)[0][0]
    return match, df[df['NICKNAME'] == match].reset_index().loc[0, 'URL']
"""
def find_string_match(str1, str2s):
    
    words_str1 = re.split(split_chars, str1)
    words_str2s = [re.split(split_chars, str2) for str2 in str2s]
    matching_score = [len(set(words_str1) & set(str2)) for str2 in words_str2s]
    #if "Atlanta" in str1: print(words_str1, words_str2s, matching_score)
    return str2s[matching_score.index(max(matching_score))]

def get_url_from_name(name, df):
    match = find_string_match(name, df['ACTUAL_NAME'])
    return match, df[df['ACTUAL_NAME'] == match].reset_index().loc[0, 'URL']
"""
get_url_from_name("United", airport_url_df)

('China United Airlines',
 'https://www.airlinequality.com/airline-reviews/china-united-airlines')

In [249]:
from tqdm import tqdm

close_names, urls = list(), list()
for name in tqdm(airport_code_df.CARRIERNAME):
    close_name, url = get_url_from_name(name, airport_url_df)
    close_names.append(close_name)
    urls.append(url)

airport_code_df["matched_name"] = close_names
airport_code_df["urls"] = urls



100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:03<00:00,  5.84it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airport_code_df["matched_name"] = close_names
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airport_code_df["urls"] = urls


In [250]:
airport_code_df

Unnamed: 0,index,OP_UNIQUE_CARRIER,count,CARRIER,CARRIERNAME,matched_name,urls
0,15,WN,63426,WN,Southwest Airlines Co.,Southwest Airlines,https://www.airlinequality.com/airline-reviews...
1,4,DL,43959,DL,Delta Air Lines Inc.,Delta Air Lines,https://www.airlinequality.com/airline-reviews...
2,1,AA,41637,AA,American Airlines Inc.,American Airlines,https://www.airlinequality.com/airline-reviews...
3,12,OO,34846,OO,SkyWest Airlines Inc.,SkyWest Airlines,https://www.airlinequality.com/airline-reviews...
4,14,UA,28030,UA,United Air Lines Inc.,Air Algerie,https://www.airlinequality.com/airline-reviews...
5,3,B6,12805,B6,JetBlue Airways,Jetblue Airways,https://www.airlinequality.com/airline-reviews...
6,9,MQ,10950,MQ,Envoy Air,Air Canada rouge,https://www.airlinequality.com/airline-reviews...
7,5,EV,10680,EV,ExpressJet Airlines LLC d/b/a aha!,Aegean Airlines,https://www.airlinequality.com/airline-reviews...
8,2,AS,9965,AS,Alaska Airlines Inc.,Alaska Airlines,https://www.airlinequality.com/airline-reviews...
9,17,YX,9672,YX,Republic Airline,Air North Yukon's Airline,https://www.airlinequality.com/airline-reviews...


In [233]:
airport_code_df.drop(columns = ["index", "Code"], inplace = True)

In [234]:
airport_code_df.to_csv("top_129_airport_review_link.csv")

## 6. Crawl Reviews

In [42]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

class ReviewsScraper:
    def __init__(self, url_map, page_size = 100, max_page = 50):
        self.url_map = url_map
        self.page_size = page_size
        self.max_page = max_page

    def scrape(self):
        for name, base_url in tqdm(self.url_map.items()):
            for url in [f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={self.page_size}" for i in range(1, self.max_page + 1)]:
                response = requests.get(url)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    articles_class = soup.find_all("article", class_="comp comp_reviews-airline querylist position-content")
                    articles = articles_class[0].find_all("article", itemprop="review")
                    try:
                        for article in articles:
                            review = {}
                            review["datePublished"] = article.find("meta", itemprop="datePublished")["content"]
                            review["ratingValue"] = article.find("span", itemprop="ratingValue").text
                            review["bestRating"] = article.find("span", itemprop="bestRating").text
                            review["header"] = article.find("h2", class_="text_header").text
                            review["author"] = article.find("span", itemprop="name").text
                            review["reviewBody"] = article.find("div", itemprop="reviewBody").text.strip()
                            review["recommended"] = (
                                article.find("td", class_="review-value rating-yes").text
                                if article.find("td", class_="review-value rating-yes")
                                else None
                            )
        
                            review_stats = {}
                            for row in article.find_all("tr"):
                                header = row.find("td", class_="review-rating-header")
                                if header:
                                    key = header.text.strip()
                                    value = row.find("td", class_="review-value")
                                    if value:
                                        review_stats[key] = value.text.strip()
                                    else:
                                        stars = row.find_all("span", class_="star fill")
                                        review_stats[key] = len(stars)
                            review["stats"] = review_stats
        
                            yield name, review
                    except:
                        break



### 6.1. Airport

In [79]:
import pandas as pd

url_map = {row['ORIGIN']: row['urls'] for i, row in pd.read_csv("../top_129_airport_review_link.csv").iterrows()}
pages = 38
page_size = 100
len(url_map)

129

In [80]:
scraper = ReviewsScraper(url_map)
ongoing_name = ""
dict_list = list()
for name, review in scraper.scrape():
    if name != ongoing_name:
        if len(dict_list) >= 1:
            pd.DataFrame(dict_list).to_csv(f"../reviews/airport/{ongoing_name}.csv", index = False)
        dict_list = [review]
        ongoing_name = name
    else:
        dict_list.append(review)

  1%|▋                                                                                 | 1/129 [00:08<17:34,  8.24s/it]

https://www.airlinequality.com/airport-reviews/atlanta-hartsfield-airport is Done


  2%|█▎                                                                                | 2/129 [00:13<14:02,  6.64s/it]

https://www.airlinequality.com/airport-reviews/chicago-ohare-airport is Done


  2%|█▉                                                                                | 3/129 [00:17<11:03,  5.26s/it]

https://www.airlinequality.com/airport-reviews/dallas-fort-worth-airport is Done


  3%|██▌                                                                               | 4/129 [00:20<09:25,  4.53s/it]

https://www.airlinequality.com/airport-reviews/denver-airport is Done


  4%|███▏                                                                              | 5/129 [00:26<09:59,  4.83s/it]

https://www.airlinequality.com/airport-reviews/los-angeles-lax-airport is Done


  4%|███▏                                                                              | 5/129 [00:27<11:27,  5.54s/it]


KeyboardInterrupt: 

In [81]:
%%capture --no-stderr

from glob import glob
import pandas as pd
import ast
import os

STAT_KEYS = {
    "travellerType": "Type Of Traveller",
    "queueTime": "Queuing Times",
    "terminalCleanliness": "Terminal Cleanliness",
    "terminalSeating": "Terminal Seating",
    "terminalSign": "Terminal Signs",
    "foodBeverage": "Food Beverages",
    "airportShopping": "Airport Shopping",
    "wifiConnectivity": "Wifi Connectivity",
    "airportStaff": "Airport Staff",
    "statRecommended": "Recommended"
}

def cal_sentiment(x):
    if x == "yes":
        return 1
    elif x == "no":
        return -1
    else:
        return 0
        
for directory in glob("../reviews/airport/*.csv"):
    try:
        df = pd.read_csv(directory)
    
        # Unpacked Dictionary of Extra Info
        for key, value in STAT_KEYS.items():
            df[key] = df['stats'].apply(lambda x: ast.literal_eval(x).get(value, None))
    
        # Replace Recommended if null
        df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
    
        # Reorganized columns with more than one type of data
        df['ratingValue'] = df['ratingValue']/df['bestRating']
        df['commentHeader'] = df["header"].apply(lambda x:x.strip('"'))
        df['tripVerified'] = df["reviewBody"].apply(lambda x:"Trip Verified" in x.split("|")[0])
        df['reviewContent'] = df["reviewBody"].apply(lambda x:''.join(x.split("|")[1:]))
        df['sentiment'] = df["recommended"].apply(cal_sentiment)
    
        # Remove unneeded columns
        df.drop(
            columns = ['bestRating', "author", "reviewBody", "recommended", "stats", "statRecommended", "header"], 
            inplace = True
        )
        
        os.makedirs(f"../reviews/cleaned_airport/", exist_ok = True)
        df.to_csv(f"../reviews/cleaned_airport/{os.path.basename(directory)}", index = False)
    except:
        print(directory)
    

  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
  df.loc[df[

### 6.2. Airline

In [76]:
import pandas as pd
import math

df = pd.read_csv("../top_18_airline_review_link.csv")
df.dropna(how = "any", inplace = True)
url_map = {row['OP_UNIQUE_CARRIER']: row['urls'] for i, row in df.iterrows()}
pages = 38
page_size = 100
url_map

{'WN': 'https://www.airlinequality.com/airline-reviews/southwest-airlines',
 'DL': 'https://www.airlinequality.com/airline-reviews/delta-air-lines',
 'AA': 'https://www.airlinequality.com/airline-reviews/american-airlines',
 'OO': 'https://www.airlinequality.com/airline-reviews/skywest-airlines',
 'UA': 'https://www.airlinequality.com/airline-reviews/united-airlines',
 'B6': 'https://www.airlinequality.com/airline-reviews/jetblue-airways',
 'MQ': 'https://www.airlinequality.com/airline-reviews/american-eagle',
 'AS': 'https://www.airlinequality.com/airline-reviews/alaska-airlines',
 'NK': 'https://www.airlinequality.com/airline-reviews/spirit-airlines',
 '9E': 'https://www.airlinequality.com/airline-reviews/delta-air-lines',
 'OH': 'https://www.airlinequality.com/airline-reviews/skywest-airlines',
 'F9': 'https://www.airlinequality.com/airline-reviews/frontier-airlines',
 'YV': 'https://www.airlinequality.com/airline-reviews/united-airlines',
 'HA': 'https://www.airlinequality.com/airl

In [72]:
scraper = ReviewsScraper(url_map)
ongoing_name = ""
dict_list = list()
for name, review in scraper.scrape():
    if name != ongoing_name:
        if len(dict_list) >= 1:
            pd.DataFrame(dict_list).to_csv(f"../reviews/airline/{ongoing_name}.csv", index = False)
        dict_list = [review]
        ongoing_name = name
    else:
        dict_list.append(review)
pd.DataFrame(dict_list)

  0%|                                                                                           | 0/16 [00:01<?, ?it/s]


KeyboardInterrupt: 

In [78]:
%%capture --no-stderr

from glob import glob
import pandas as pd
import ast
import os

STAT_KEYS = {
    "travellerType": "Type Of Traveller",
    "seatType": "Seat Type",
    "route": "Route",
    "seatComfort": "Seat Comfort",
    "foodBeverage": "Food & Beverages",
    "cabinService": "Cabin Staff Service",
    "inflightEntertainment": "Inflight Entertainment",
    "wifiConnectivity": "Wifi & Connectivity",
    "groundService": "Ground Service",
    "valueForMoney": "Value For Money",
    "statRecommended": "Recommended"
}

def cal_sentiment(x):
    if x == "yes":
        return 1
    elif x == "no":
        return -1
    else:
        return 0
        
for directory in glob("../reviews/airline/*.csv"):
    try:
        df = pd.read_csv(directory)
    
        # Unpacked Dictionary of Extra Info
        for key, value in STAT_KEYS.items():
            df[key] = df['stats'].apply(lambda x: ast.literal_eval(x).get(value, None))
    
        # Replace Recommended if null
        df.loc[df['recommended'].isnull(), 'recommended'] = df['statRecommended']
    
        # Reorganized columns with more than one type of data
        df['ratingValue'] = df['ratingValue']/df['bestRating']
        df['commentHeader'] = df["header"].apply(lambda x:x.strip('"'))
        df['tripVerified'] = df["reviewBody"].apply(lambda x:"Trip Verified" in x.split("|")[0])
        df['reviewContent'] = df["reviewBody"].apply(lambda x:''.join(x.split("|")[1:]))
        df['sentiment'] = df["recommended"].apply(cal_sentiment)
    
        # Remove unneeded columns
        df.drop(
            columns = ['bestRating', "author", "reviewBody", "recommended", "stats", "statRecommended", "header"], 
            inplace = True
        )
        
        os.makedirs(f"../reviews/cleaned_airline/", exist_ok = True)
        df.to_csv(f"../reviews/cleaned_airline/{os.path.basename(directory)}", index = False)
    except:
        print(directory)
    