In [1]:
# imports
import os
import re

import pandas as pd
import requests
from scrapy import Selector

In [2]:
# get fleet info
# a timeout is passed so the cell fails fast instead of hanging if the site never responds
request = requests.get('https://www.saudia.com/experience/about-us/our-fleet', timeout=30)
request.status_code

200

In [3]:
# extract fleet page html
# keep the response body as text and wrap it in a Selector so the xpath queries below can run on it
html = request.text
sel = Selector(text=html)

In [4]:
# scrape aircraft names, totals, and links to detailed plane configuration data
aircraft_names = sel.xpath('//table[@border=0]//td//a//text()[contains(.," ")]').extract()

# keep only the first run of digits from each "... Aircraft" cell text
num_aircraft = [
    re.search(r'\d+', count_text).group()
    for count_text in sel.xpath('//table[@border=0]//td/text()[contains(.,"Aircraft")]').extract()
]

# de-duplicate the hrefs, make them absolute, and order them alphabetically
# NOTE(review): names are kept in page order while links are sorted -- the two lists only line up
# if both orders agree; confirm against the page
aircraft_links = sorted(
    'https://www.saudia.com' + href
    for href in set(sel.xpath('//table[@border=0]/tbody/tr/td//a/@href').extract())
)

# fleet-level columns; 'number_of_configurations' starts empty and is filled in by a later cell
fleet_data = dict(
    aircraft_name=aircraft_names,
    number_in_fleet=num_aircraft,
    number_of_configurations=[],
    configurations_link=aircraft_links,
)

In [5]:
# get configuration info for each plane from links
# each request carries a timeout so one unresponsive page cannot stall the whole cell
request_list = [requests.get(link, timeout=30) for link in fleet_data['configurations_link']]
[response.status_code for response in request_list]

[200, 200, 200, 200, 200, 200]

In [6]:
# get html text of each page, then wrap each one in a Selector for xpath queries
html_list = [response.text for response in request_list]
sel_list = [Selector(text=page_html) for page_html in html_list]

In [7]:
# extract aircraft configuration names
# the first five pages take every third matching <h3>; any later page reads the <h2> text instead
titles_list = [
    page_sel.xpath('//h3[@class="fs-1-3 vg-0--top"]')[::3].xpath('.//text()').extract()
    if page_index < 5
    else page_sel.xpath('//h2[@class="fs-1-5 vg-0--top"]/text()[1]').extract()
    for page_index, page_sel in enumerate(sel_list)
]

# assign (rather than append) the counts so re-running this cell does not grow the list
# and leave fleet_data with columns of unequal length
fleet_data['number_of_configurations'] = [len(titles) for titles in titles_list]

In [8]:
# extract tables of interest (aircraft & seat specs) on each page
tables_list = [
    page_sel.xpath('//table[@class="full styled"]')
    for page_sel in sel_list
]

In [9]:
# split tables by aircraft configuration:
# the n-th configuration title on a page is mapped to the n-th group of three consecutive tables
aircraft_tables_list = []
for plane_index, plane_tables in enumerate(tables_list):
    plane_titles = titles_list[plane_index]
    aircraft_tables_list.append({
        config_title: plane_tables[3 * position:3 * position + 3]
        for position, config_title in enumerate(plane_titles)
    })

In [10]:
# scrape configuration data for each aircraft in the fleet and capture it in a list of dicts
# (one dict per aircraft page, keyed by configuration title; each value is a flat dict of datapoints)
aircraft_data_list = [{} for _ in aircraft_tables_list]
for plane_index in range(len(aircraft_tables_list)):
    for title in titles_list[plane_index]:
        # all tables of one configuration are flattened into this single dict
        data = {}
        tables = aircraft_tables_list[plane_index][title]
        for table in tables:
            # the caption text decides how the table is parsed
            # NOTE(review): if a table has no caption text, table_title is presumably None and the
            # `in` test below would raise TypeError -- confirm every table carries a caption
            table_title = table.xpath('./caption/text()').extract_first()
            if "Aircraft Specifications" in table_title:
                # specification table: the i-th <th> text is the key for the i-th <td> text
                headers = table.xpath('.//th/text()').extract()
                datapoints = table.xpath('.//td/text()').extract()
                # more cell texts than headers: concatenate the surplus texts onto the last
                # datapoint that has a header, then drop them
                if (len(headers) < len(datapoints)):
                    for extra_data in datapoints[len(headers):]:
                        datapoints[len(headers)-1] += extra_data
                    datapoints = datapoints[:len(headers)]
                for i in range(len(headers)):
                    data[headers[i]] = datapoints[i].strip()
            else:
                # any other table: column headers (all <th> texts except the first) are used as
                # key prefixes, and the first <td> text of each body row is used as the row label
                # (presumably columns are seat classes -- verify against the page)
                headers1 = table.xpath('.//th//text()').extract()[1:]
                headers2 = []
                datapoints = []
                rows = table.xpath('./tbody//tr')
                for row in rows:
                    longheader = row.xpath('.//td//text()').extract_first()
                    headers2.append(longheader)
                    if longheader != 'Amenities':
                        # regular row: keep every <td> text node after the row label
                        datapoints.append(row.xpath('.//td//text()')[1:].extract())
                    else:
                        # amenities row: for every <td> after the first, collect the alt text
                        # of the images it contains
                        amenities = []
                        class_amens = row.xpath('.//td')[1:]
                        for amens in class_amens:
                            amenities.append(amens.xpath('.//img/@alt').extract())
                        datapoints.append(amenities)

                # flatten the column x row grid into "<column header> <something>" keys
                # NOTE(review): datapoints[j][i] assumes each row yields one entry per column
                # header, in the same order -- confirm
                for i in range(len(headers1)):
                    for j in range(len(headers2)):
                        if headers2[j] != 'Amenities':
                            if 'Accommodation' in headers2[j]:
                                # keep only the first run of digits from the cell text
                                data[headers1[i]+" Seats"] = re.search(r'\d+', datapoints[j][i]).group()
                            elif 'Seat width/Pitch' in headers2[j]:
                                # "width/pitch" cell is split into two keys; both stay empty
                                # strings when the cell text has no '/'
                                width_pitch = datapoints[j][i]
                                width = ''
                                pitch = ''
                                if '/' in width_pitch:
                                    width_pitch = width_pitch.split('/')
                                    width = width_pitch[0].strip()
                                    pitch = width_pitch[1].strip()
                                data[headers1[i]+" Seat Width"] = width
                                data[headers1[i]+" Seat Pitch"] = pitch
                            else:
                                # any other row is stored as-is under "<column header> <row label>"
                                data[headers1[i]+" "+headers2[j]] = datapoints[j][i].strip()
                        else:
                            # one boolean flag per amenity image found in this column's cell
                            for amen in datapoints[j][i]:
                                data[headers1[i]+" has "+amen] = True

        aircraft_data_list[plane_index][title] = data

In [11]:
# save fleet level data as a dataframe and export it to a csv
# to_csv does not create missing folders, so make sure the output directory exists first
os.makedirs('./datasets', exist_ok=True)
fleet_df = pd.DataFrame(fleet_data)
fleet_df.to_csv('./datasets/saudia_fleet_data.csv', index=False)
fleet_df

Unnamed: 0,aircraft_name,number_in_fleet,number_of_configurations,configurations_link
0,Airbus A320-214,32,4,https://www.saudia.com/experience/about-us/our...
1,Airbus A321,15,1,https://www.saudia.com/experience/about-us/our...
2,Airbus A330-343,48,3,https://www.saudia.com/experience/about-us/our...
3,Boeing B777-268ER,13,2,https://www.saudia.com/experience/about-us/our...
4,Boeing B777-368ER,33,5,https://www.saudia.com/experience/about-us/our...
5,Boeing B787-9,7,1,https://www.saudia.com/experience/about-us/our...


In [12]:
# convert each aircraft's data into a dataframe and export each to a csv
for i, aircraft_name in enumerate(aircraft_names):
    # rows are configurations, columns are the scraped datapoints (renamed to snake_case)
    df = pd.DataFrame(aircraft_data_list[i]).T
    df.columns = df.columns.map(lambda x: x.lower().replace(' ', '_'))
    df.index.name = 'configuration_name'
    df['aircraft_name'] = aircraft_name
    # reverse the column order, which moves aircraft_name (added last) to the front
    df = df[df.columns[::-1]]
    df = df.reset_index()
    for column in df.columns:
        # columns holding True flags get False for the gaps; every other column gets a
        # 'class not offered' placeholder
        fill_value = False if True in df[column].values else 'class not offered'
        # assign the result back: fillna(inplace=True) on df[column] is chained assignment,
        # which warns in recent pandas and leaves df unchanged under Copy-on-Write
        df[column] = df[column].fillna(fill_value)
    df.to_csv(aircraft_name+'.csv', index=False)