In [18]:
# Libraries
import io
import re
from urllib.request import urlopen as uReq

import bs4
import pandas as pd
from bs4 import BeautifulSoup

In [19]:
# A helper function to deal with empty elements when extracting the first element of a list
def helper(x):
    """Return the first element of ``x``, or ``None`` when ``x`` is empty.

    ``x`` is anything that supports ``len()`` and indexing (the callers in
    this notebook pass lists).
    """
    return x[0] if len(x) != 0 else None
    
# A helper function to output a soup object using url
def return_soup(url):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address handed to ``urlopen``.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the built-in ``"html.parser"`` backend.

    Raises
    ------
    Whatever ``urlopen`` raises for an unreachable or failing request
    (no error handling is added here).
    """
    # The response object is a context manager: leaving the block closes the
    # connection even if read() fails, so sockets do not pile up while
    # looping over hundreds of pages.
    with uReq(url) as uClient:
        html = uClient.read()

    return BeautifulSoup(html, "html.parser")

# A helper function to extract amenities
# Input: a list of strings, a word
# Output: a list of boolean values denoting whether each entry has that word
# Eg helper_amen(['a','ab','c'], 'a') = [True, True, False]
def helper_amen(li, word):
    """Flag, entry by entry, whether ``word`` occurs in ``str(entry)``.

    Every entry is converted with ``str()`` before the substring test, so
    non-string entries are accepted as well.
    """
    flags = []
    for entry in li:
        flags.append(word in str(entry))
    return flags

In [20]:
# A helper function to read PLANE-specific pages
def from_plane_soup(soup):
    """Collect the user comments of one aircraft page into a DataFrame.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed PLANE-specific page.

    Returns
    -------
    pandas.DataFrame or None
        One row per comment with the columns ``Date``, ``Plane``, ``Seat``,
        ``Name`` and ``Comment``. ``None`` is returned when the page has no
        ``comment-box`` element or no ``h1-fix`` element.

    Raises
    ------
    ValueError
        Raised by ``pd.DataFrame`` when the page yields different numbers of
        date, name and comment elements.
    """
    # Find the comment section and the plane title; return nothing if either
    # is absent (explicit checks instead of a catch-all ``except``)
    soup_comment_section = soup.find(class_="comment-box")
    soup_title = soup.find(class_="h1-fix")
    if soup_comment_section is None or soup_title is None:
        return None

    # Extract the text of dates, names, comments and the plane name
    name_plane = soup_title.get_text()
    dates_seats = [s.get_text() for s in soup_comment_section.find_all(class_="date")]
    names = [s.get_text().strip() for s in soup_comment_section.find_all(class_="name")]
    comments = [s.get_text().strip() for s in soup_comment_section.find_all(class_="comment")]

    # The "date" element carries both the date and the seat number.
    # Raw string: "/" needs no escaping, and "\/" in a normal string literal
    # is an invalid escape sequence that newer Python versions warn about.
    date_pattern = re.compile(r'[0-9]{4}/[0-9]{2}/[0-9]{2}')
    seat_pattern = re.compile('Seat ([A-Z0-9]*) ')

    def first_match(pattern, text, group):
        # First occurrence of the pattern, or None when the text has none
        match = pattern.search(text)
        return match.group(group) if match else None

    dates = [first_match(date_pattern, text, 0) for text in dates_seats]
    seats = [first_match(seat_pattern, text, 1) for text in dates_seats]

    # Put everything into a dataframe (the scalar plane name is repeated on every row)
    df = pd.DataFrame({'Date':dates,
                       'Plane':name_plane,
                       'Seat':seats,
                       'Name':names,
                       'Comment':comments})
    return df

In [21]:
# A helper function to read COMPANY-specific pages
def from_company_soup(soup):
    """Turn one airline page into a DataFrame with one row per aircraft link.

    The result has the columns ``Url`` and ``Category`` plus one boolean
    column per amenity keyword (``Food``, ``Wifi``, ``Tv``, ``Headphones``,
    ``Elec``). A flag is True when the keyword occurs in the string form of
    the matching ``amenities_list`` element. The index comes from
    ``explode`` and therefore repeats for aircraft of the same category.
    """
    # Extract html code for plane categories (the first title is skipped) and seat sections
    category_headers = soup.findAll(class_="chartsTitle")[1:]
    seat_sections = soup.findAll(class_="seats")

    # Clean up what we have above: the category text drops its first three
    # characters, and every seat section contributes a list of aircraft links
    category_names = [header.find("h3").get_text()[3:] for header in category_headers]
    urls_per_section = []
    for section in seat_sections:
        aircraft_rows = section.findAll(class_="aircraft_seats")
        urls_per_section.append([row.find('a', href=True)['href'] for row in aircraft_rows])

    # Flatten the amenity elements of all sections, in page order
    amenity_nodes = []
    for section in seat_sections:
        amenity_nodes.extend(section.findAll(id='amenities_list'))

    # Put everything into a dataframe: one row per category, then one row per url
    df = pd.DataFrame({'Url': urls_per_section,
                       'Category': category_names}).explode('Url')

    # Deal with amenities: park the elements in a scratch column, derive the
    # keyword flags from it, then remove the scratch column again
    df['temp'] = amenity_nodes
    stored_nodes = df['temp'].tolist()
    for amen in ['food', 'wifi', 'tv', 'headphones', 'elec']:
        df[amen.capitalize()] = [amen in str(node) for node in stored_nodes]

    return df.drop('temp', axis=1)

In [22]:
# A helper function to read the base page where all companies are listed
def from_base_soup(soup):
    """Return the ``href`` of every link in the first ``browseAirlines`` element.

    Raises ``IndexError`` when the page has no element of that class.
    """
    # Only the first airline section is used
    airline_section = soup.findAll(class_="browseAirlines")[0]

    # Keep just the link target of every anchor that has one
    return [anchor['href'] for anchor in airline_section.findAll('a', href=True)]

In [26]:
# A helper function to obtain an additional table using COMPANY-specific pages
def table_from_company_soup(soup):
    """Build the seat-comparison table of one airline.

    The company page links to a comparison page ('Compare seat pitch, etc.').
    Every HTML table on that page is read, labelled with the matching
    ``class-of-service`` text and stacked into one frame.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed COMPANY-specific page.

    Returns
    -------
    pandas.DataFrame
        All tables of the comparison page concatenated, with the extra
        columns ``Class`` (label of the table) and ``Company`` (third path
        segment of the comparison link, underscores replaced by spaces).

    Raises
    ------
    TypeError
        When the company page has no 'Compare seat pitch, etc.' link
        (``find`` returns ``None``, which cannot be subscripted).
    IndexError
        When the comparison page has fewer class labels than tables.
    ValueError
        Raised by ``pd.read_html`` when the page contains no table.
    """
    # Find the link of the table
    url_table = soup.find('a', string='Compare seat pitch, etc.')['href']

    # Download the comparison page once and reuse the same bytes for both
    # parsers, instead of requesting the page a second time for read_html
    with uReq('https://www.seatguru.com' + url_table) as response:
        html = response.read()
    soup_table = BeautifulSoup(html, "html.parser")

    # Obtain a list of dfs and its corresponding class types
    list_df = pd.read_html(io.BytesIO(html))
    list_class = [s.get_text() for s in soup_table.find_all(class_='class-of-service')]

    # Every table needs a label; fail with an explicit message instead of a
    # bare "list index out of range" (surplus labels are ignored)
    if len(list_class) < len(list_df):
        raise IndexError(
            'found %d tables but only %d class labels on %s'
            % (len(list_df), len(list_class), url_table)
        )

    # Label each table with its class of service
    for table, class_name in zip(list_df, list_class):
        table['Class'] = class_name

    # Put everything into a dataframe
    df = pd.concat(list_df,axis=0,ignore_index=True)
    df['Company'] = url_table.split('/')[2].replace('_', ' ')

    return df

# Begin scraping comments

In [None]:
# Read the base page and collect the link (href) of every airline listed
# in its first "browseAirlines" section.
# The links are prefixed with 'https://www.seatguru.com' before being
# fetched further down.
url = 'https://www.seatguru.com/browseairlines/browseairlines.php'
list_urls_companies = from_base_soup(return_soup(url))

In [None]:
# Iterate through each company: download its page and keep one DataFrame
# per company (as returned by from_company_soup)
res = []
for url_company in list_urls_companies:
    # Prefix the site root (the collected links are presumably site-relative)
    url_company = 'https://www.seatguru.com' + url_company
    res.append(from_company_soup(return_soup(url_company)))
    
    # Tracker: print a progress line after every 5th company
    if len(res) % 5 == 0:
        print(len(res), 'companies collected')
    
# Stack the per-company frames into one table with a fresh running index
df_company = pd.concat(res,axis=0,ignore_index=True)
# NOTE(review): saving is disabled, yet later cells load 'df_company' from
# disk -- re-enable the line below whenever the pickle has to be (re)created
# df_company.to_pickle('df_company')

In [None]:
# Iterate through each plane for each company

# A helper function to handle the map
def helper_map(url):
    """Scrape the comments of the plane page at the site-relative ``url``.

    Returns the DataFrame produced by ``from_plane_soup`` with an extra
    ``Url`` column holding ``url`` unchanged, or ``None`` when
    ``from_plane_soup`` returns ``None`` for that page.
    """
    df = from_plane_soup(return_soup('https://www.seatguru.com' + url))

    # Tag the comments with their page so they can be joined back later;
    # a missing result (None) is passed through untouched
    if df is not None:
        df['Url'] = url
    return df

# Reload the company table from disk instead of using the in-memory frame.
# NOTE(review): the matching to_pickle call in the company cell is commented
# out, so the file 'df_company' must already exist -- confirm before a full re-run.
# Only load pickle files you created yourself: unpickling can execute code.
df_company = pd.read_pickle('df_company')
# One download per aircraft url; each entry is a DataFrame or None
res = df_company['Url'].map(helper_map)
# Stack all comment frames into one table with a fresh running index
# (presumably pd.concat skips the None entries -- confirm against the pandas docs)
df_plane = pd.concat(res.to_list(),axis=0,ignore_index=True)
# NOTE(review): saving is disabled; the join cell expects the file 'df_plane'
# df_plane.to_pickle('df_plane')

In [11]:
# Join the two dfs together!
# Both tables are read back from their pickle files rather than taken from
# memory, so this cell depends on 'df_company' and 'df_plane' existing on disk.
df_company = pd.read_pickle('df_company')
df_plane = pd.read_pickle('df_plane')
# Attach the company-level columns to every comment, matching rows on 'Url'
# (no `how` is passed, so pandas' default join type applies)
df = df_plane.merge(df_company, on=['Url'])
# NOTE(review): saving the joined table is disabled
#df.to_pickle('df_seatguru')

# Begin scraping class tables

In [28]:
# Read the base page again and collect the link (href) of every airline
# listed in its first "browseAirlines" section.
# NOTE(review): this repeats the base-page cell of the comment-scraping
# part; it rebinds the same `url` and `list_urls_companies` globals.
url = 'https://www.seatguru.com/browseairlines/browseairlines.php'
list_urls_companies = from_base_soup(return_soup(url))

In [29]:
# Iterate through each company: download its page and keep the class table
# that table_from_company_soup builds from the linked comparison page
# NOTE(review): `res` is reused from the comment-scraping part and is reset here
res = []
for url_company in list_urls_companies:
    # Prefix the site root (the collected links are presumably site-relative)
    url_company = 'https://www.seatguru.com' + url_company
    res.append(table_from_company_soup(return_soup(url_company)))
    
    # Tracker: print a progress line after every 5th company
    if len(res) % 5 == 0:
        print(len(res), 'companies collected')
    
# Stack the per-company tables into one frame with a fresh running index
df_class = pd.concat(res,axis=0,ignore_index=True)
# NOTE(review): saving is disabled; re-enable to persist the class table
# df_class.to_pickle('df_class')

5 companies collected
10 companies collected
15 companies collected
20 companies collected
25 companies collected
30 companies collected
35 companies collected
40 companies collected
45 companies collected
50 companies collected
55 companies collected
60 companies collected
65 companies collected
70 companies collected
75 companies collected
80 companies collected
85 companies collected
90 companies collected
95 companies collected
100 companies collected
105 companies collected
110 companies collected
115 companies collected
120 companies collected
125 companies collected
130 companies collected
135 companies collected
140 companies collected
145 companies collected
150 companies collected
155 companies collected
160 companies collected
165 companies collected
170 companies collected
