# WEB SCRAPER AO - AIRLINEQUALITY

In [101]:
# Libraries

import bs4
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd
from functools import reduce
import numpy as np

# Sources

Airline Quality
https://www.airlinequality.com/

In [102]:
# Base URL of the site, and the Air France review listing built on top of it.
ae_url = 'https://www.airlinequality.com/'
ae_url_airfrance = ae_url + 'airline-reviews/air-france/'

# Test Get Reviews

In [103]:
# Request the Air France review listing with a browser-like User-Agent header.
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(ae_url_airfrance, headers=hdr)

# Close the HTTP response as soon as the document is parsed, and name the
# parser explicitly so the resulting tree does not depend on which optional
# parsers happen to be installed.
with urlopen(req) as page:
    soup = BeautifulSoup(page, "html.parser")

# The cells below treat each <div class="body"> as one review container.
bodies = soup.find_all("div", {"class": "body"})

## Get general value

In [104]:
# Airline name: text of the <h1> inside the page's <div class="info"> block.
company = soup.find('div', {'class': 'info'})
company_name = company.h1.text.strip()

# Cells of the first <table> on the page (presumably the airline-wide rating
# summary -- TODO confirm against the page markup).
summary_cells = soup.table.find_all("td")

# Labels: cells whose first CSS class contains 'header'.
l_global_value_header = [
    cell.text for cell in summary_cells if 'header' in cell['class'][0]
]

# Scores: for cells whose first CSS class contains 'stars', the number carried
# by the last filled star.
l_global_value_stars = [
    int(cell.find_all('span', {'class': 'star fill'})[-1].text)
    for cell in summary_cells
    if 'stars' in cell['class'][0]
]

# (key, value) pairs: the company name first, then one pair per rating.
l_global = [('company_name', company_name)]
l_global.extend(zip(l_global_value_header, l_global_value_stars))

In [105]:
# Locate the pagination block and read the highest page number from its
# second-to-last <li> (presumably the last <li> is a "next" link -- TODO confirm).
pagination_attrs = {'class': 'comp comp_reviews-pagination querylist-pagination position-'}
pages = soup.find('article', pagination_attrs)
l_pages = pages.find_all('li')
last_page_label = l_pages[-2].text
max_page = int(last_page_label.strip())
max_page

102

## Get Reviews values

In [106]:
# One entry per review container, in page order.
l_titles = []
l_reviewer_data = []
l_verifications = []
l_reviews = []
l_stats = []

for body in bodies:
    l_titles.append(body.h2.text)

    # Raw reviewer line, still to be processed.
    l_reviewer_data.append(body.h3.text)

    content = body.find("div", {'class': 'text_content'})
    # Text of the first link inside the review content.
    l_verifications.append(content.a.text)
    # Raw review text, still to be processed.
    l_reviews.append(content.text)

    l_review_value = []
    l_review_value_header = []
    l_review_value_stars = []

    for cell in body.table.find_all("td"):
        first_class = cell['class'][0]
        if 'header' in first_class:
            l_review_value_header.append(cell.text)
        if first_class == 'review-value':
            l_review_value.append(cell.text)
        if 'stars' in first_class:
            filled_stars = cell.find_all('span', {'class': 'star fill'})
            l_review_value_stars.append(int(filled_stars[-1].text))

    # Splice the star scores in just before the last textual value. This lines
    # the values up with the headers only when the star rows sit between the
    # textual rows and a single trailing textual row (presumably 'Recommended').
    l_review_value = l_review_value[:-1] + l_review_value_stars + l_review_value[-1:]

    l_stats.append(list(zip(l_review_value_header, l_review_value)))

# Sanity check: every list must hold exactly one entry per review container.
for collected in (l_titles, l_reviewer_data, l_verifications, l_reviews, l_stats):
    print(len(collected) == len(bodies))

True
True
True
True
True


In [108]:
# Build one row per review directly from its (header, value) pairs.
#
# The previous approach chained pd.merge(..., on=0) over one small frame per
# review and transposed the result. From the fourth frame on, the merged value
# columns collide ('1_x', '1_y', '1_x', ...), which recent pandas versions
# reject with a MergeError, and reduce() raises on an empty list.
#
# Headers missing from a review become NaN in that row. dtype=object keeps each
# cell as scraped (str or int) instead of coercing whole columns.
df_reviews = pd.DataFrame([dict(pairs) for pairs in l_stats], dtype=object)

In [109]:
# Preview the first rows of the per-review ratings table built above.
df_reviews.head()

Unnamed: 0,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Value For Money,Recommended,Inflight Entertainment,Aircraft,Wifi & Connectivity
0,Solo Leisure,Economy Class,Prague to Cape Town via Paris,January 2020,1,5,1,1,1,no,,,
1,Solo Leisure,Economy Class,Toronto to Paris,December 2019,5,1,1,5,1,no,2.0,,
2,Business,Business Class,Paris to Chicago,January 2020,4,4,4,2,4,yes,4.0,A330-300,3.0
3,Solo Leisure,Business Class,Paris to Madrid,January 2020,1,2,1,3,1,no,1.0,A320,1.0
4,Solo Leisure,Economy Class,Paris to Prague,December 2019,3,4,4,1,3,yes,,A321,5.0


# Functions

In [121]:
def _star_score(cell):
    """Return the score of a star-rating cell, or NaN when no star is filled.

    The score is the integer text of the last ``<span class="star fill">``
    found inside ``cell``.
    """
    filled = cell.find_all('span', {'class': 'star fill'})
    return int(filled[-1].text) if filled else np.nan


def _rating_pairs(table, prefix='', include_text=True):
    """Pair each header cell of a ratings table with the value cell after it.

    Args:
        table: the ``<table>`` tag to read, or None.
        prefix: string prepended to every header text.
        include_text: when False, plain ``review-value`` cells are ignored and
            only star ratings are returned.

    Returns:
        A list of ``(header, value)`` tuples in document order; empty when
        ``table`` is None.
    """
    pairs = []
    if table is None:
        return pairs

    header = None
    for cell in table.find_all("td"):
        # Cells are classified by their first CSS class; tolerate cells
        # that carry no class attribute at all.
        first_class = (cell.get('class') or [''])[0]
        if 'header' in first_class:
            header = prefix + cell.text
        elif header is None:
            # A value cell with no header before it cannot be labelled.
            continue
        elif first_class == 'review-value':
            if include_text:
                pairs.append((header, cell.text))
            header = None
        elif 'stars' in first_class:
            # An unfilled rating still yields a (header, NaN) pair so that the
            # following headers are never shifted onto the wrong values.
            pairs.append((header, _star_score(cell)))
            header = None
    return pairs


def scraping_airlinequality(url):
    """Scrape one review-listing page into a DataFrame with one row per review.

    Each row holds the airline name, the airline-wide star ratings (columns
    prefixed with ``'global '``), the review's title, date, country,
    verification label and text, followed by the review's own ratings table.

    Args:
        url: address of the review-listing page to download.

    Returns:
        pandas.DataFrame of dtype object. Ratings absent from a review are NaN.
        The frame is empty when the page contains no ``<div class="body">``.

    Raises:
        urllib.error.URLError: if the page cannot be downloaded.
        AttributeError: if the page lacks an expected element such as the
            ``<div class="info">`` header or a review's title.
    """
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url, headers=hdr)
    # Parse once, with an explicit parser, and close the response afterwards.
    with urlopen(req) as page:
        soup = BeautifulSoup(page, "html.parser")

    company = soup.find('div', {'class': 'info'})
    company_name = company.h1.text.strip()

    # Airline-wide values, repeated on every review row of this page.
    l_global = [('company_name', company_name)]
    l_global += _rating_pairs(soup.table, prefix='global ', include_text=False)

    records = []
    for body in soup.find_all("div", {"class": "body"}):
        record = dict(l_global)

        record['title'] = body.h2.text.strip('"')

        # Only the last line of the <h3> text is used.
        reviewer_data = body.h3.text.strip().split('\n')[-1]
        # The date is the last three words of that line.
        record['date'] = ' '.join(reviewer_data.split()[-3:])
        # The country is the text between the last '(' and the next ')'.
        record['country'] = reviewer_data.split('(')[-1].split(')')[0]

        review = body.find("div", {'class': 'text_content'})
        verification = review.a
        record['verification'] = np.nan if verification is None else verification.text
        # Keep only the text after the last '| ' separator.
        record['review'] = review.text.split('| ')[-1]

        record.update(_rating_pairs(body.table))
        records.append(record)

    # One record per review; columns are the union of all keys in first-seen
    # order. dtype=object keeps each cell as scraped (str, int or NaN).
    return pd.DataFrame(records, dtype=object)

# Running code

Scraping: this section sends one HTTP request per review page of every listed airline, so it is probably better to run it on Google Colab.

# Airline slugs, as used in https://www.airlinequality.com/airline-reviews/<slug>/
scraped_companies = [
    'air-france',
    'american-airlines',
    'air-china',
    'lufthansa',
    'emirates',
    'ana-all-nippon-airways',
    'latam-airlines',
    'aeroflot-russian-airlines',
    'air-canada',
    'singapore-airlines',
]

In [None]:
# Scrape every review page of every airline listed in `scraped_companies`
# and stack the per-page frames into `df_reviews`.
results_dfs = []
for companie in scraped_companies:
    global_url = 'https://www.airlinequality.com/airline-reviews/{}/'.format(str(companie))
    print(global_url)

    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(global_url, headers=hdr)
    # Parse with an explicit parser and close the response once parsed.
    with urlopen(req) as page:
        soup = BeautifulSoup(page, "html.parser")

    pages = soup.find('article', {'class':'comp comp_reviews-pagination querylist-pagination position-'})
    if pages is None:
        # Assumes a listing that fits on one page renders no pagination block
        # -- TODO confirm against such an airline.
        max_page = 1
    else:
        l_pages = pages.find_all('li')
        # The second-to-last <li> is read as the highest page number.
        max_page = int(l_pages[-2].text.strip())
    print(max_page)

    company_url = global_url + 'page/{}'
    # Page numbers run from 1 to max_page inclusive. range(max_page) asked for
    # a page 0 that the pagination never links to and skipped the last page.
    for n_page in range(1, max_page + 1):
        url = company_url.format(str(n_page))
        print(url)
        df = scraping_airlinequality(url)
        results_dfs.append(df)

df_reviews = pd.concat(results_dfs, axis=0)

In [123]:
# (rows, columns) of the concatenated reviews frame.
df_reviews.shape

(1020, 24)