In [10]:
# Airline Data Extraction through Webscraping
# This script contains the code for scraping airline-related data from an authentic website.
# Detailed description available through inline comments.

import re
from decimal import Decimal
from re import sub

import pandas as pd
from bs4 import BeautifulSoup as bs
from requests import RequestException
from requests import get


# Load the airline code table from the Excel workbook 'Carrier_Codes.xlsx' into 'carrier_codes_df'.
carrier_codes_df = pd.read_excel('Carrier_Codes.xlsx')
# Build 'carrier_code_dict', mapping each 'IATA_Carrier_Code' (key) to its 'Carrier_Name' (value).
carrier_code_dict = {
    iata_code: carrier
    for iata_code, carrier in zip(
        carrier_codes_df.IATA_Carrier_Code, carrier_codes_df.Carrier_Name
    )
}



# Accumulators for the scraped airline data; each starts as its own empty list
# and is filled by the scraping loop further down.
airline_id, airline_name = [], []
(
    food_and_beverage_rating,
    inflight_entertainment_rating,
    seat_comfort_rating,
    staff_service_rating,
    value_for_money_rating,
) = ([] for _ in range(5))



# Scrape the rating summary of every airline in 'carrier_code_dict'.
#
# Each pass of the loop appends exactly ONE value to every list (airline ID,
# airline name and the five ratings), so all lists keep the same length and can
# be combined into a dataframe afterwards. A rating that cannot be obtained
# (request failure, non-200 response, no 'review-info' block, category row
# absent, or category not rated) is stored as 'N/A'.

# Category label, matched case-insensitively against the first cell of a rating
# row, paired with the list that collects that category.
rating_targets = (
    ('Food & Beverages', food_and_beverage_rating),
    ('Inflight Entertainment', inflight_entertainment_rating),
    ('Seat Comfort', seat_comfort_rating),
    ('Staff Service', staff_service_rating),
    ('Value for Money', value_for_money_rating),
)

for carrier_key, carrier_value in carrier_code_dict.items():
    # The review page slug is the airline name in lower case with spaces replaced by '-'
    carrier_name = carrier_value.lower().replace(" ", "-")
    url = "https://www.airlinequality.com/airline-reviews/" + carrier_name

    # Adding 'carrier_key' (airline ID) and 'carrier_value' (airline name) to respective lists
    airline_id.append(carrier_key)
    airline_name.append(carrier_value)

    # A timeout keeps one unresponsive page from hanging the whole run; a failed
    # request is treated like an unsuccessful response (all ratings 'N/A').
    try:
        response = get(url, timeout=30)
    except RequestException:
        response = None

    # Ratings found for this airline, keyed by category label
    found_ratings = {}

    if response is not None and response.status_code == 200:
        html_soup = bs(response.text, 'html.parser')
        # Walking every html block that carries the review summary
        for individual_review in html_soup.find_all(class_="review-info"):
            for row in individual_review.find_all('tr'):
                all_values = row.find_all('td')
                # Rows without both a category cell and a rating cell carry no rating
                if len(all_values) < 2:
                    continue
                # First 'td' is the category name, second 'td' holds the stars
                category_name = all_values[0].text
                spans = all_values[1].find_all('span', {'class': 'star fill'})
                lines = [span.get_text() for span in spans]
                # The highest filled star is the rating; an unrated category gets 'N/A'
                star_name = max(lines) if lines else 'N/A'
                for label, _ in rating_targets:
                    if re.search(label, category_name, re.IGNORECASE):
                        # Keep the first value seen so a repeated category
                        # cannot add a second entry for the same airline
                        found_ratings.setdefault(label, star_name)
                        break

    # Exactly one entry per airline in every rating list
    for label, rating_list in rating_targets:
        rating_list.append(found_ratings.get(label, 'N/A'))
                        
                        
                        
# Assemble the collected lists into the 'airline_rating_df' dataframe, one column per list.
airline_rating_columns = {
    "Airline_ID": airline_id,
    "Airline_Name": airline_name,
    "Food_And_Beverage_Rating": food_and_beverage_rating,
    "Inflight_Entertainment_Rating": inflight_entertainment_rating,
    "Seat_Comfort_Rating": seat_comfort_rating,
    "Staff_Service_Rating": staff_service_rating,
    "Value_For_Money_Rating": value_for_money_rating,
}
airline_rating_df = pd.DataFrame(airline_rating_columns)

# Every column holds textual data (a rating may be the placeholder 'N/A'),
# so each one is cast to 'String'.
for column_name in (
    'Airline_ID',
    'Airline_Name',
    'Food_And_Beverage_Rating',
    'Inflight_Entertainment_Rating',
    'Seat_Comfort_Rating',
    'Staff_Service_Rating',
    'Value_For_Money_Rating',
):
    airline_rating_df[column_name] = airline_rating_df[column_name].astype(str)



# Exporting the data from the airline_rating_df dataframe into a csv named
# 'Project_Extracted_Airline_Data.csv', without the index column.
output_csv_name = "Project_Extracted_Airline_Data.csv"
airline_rating_df.to_csv(output_csv_name, index=False)



"""
CONCLUSIONS:

This code scrapes data from an airline information website, stores it in relevant lists, loads those lists into a data frame and exports it to a CSV file.


CONTRIBUTIONS:

RAJENDRA KUMAR RAJKUMAR - 90%
MONISH  HIRISAVE RAGHU - 10%

CITATIONS:

1. https://www.geeksforgeeks.org
2. https://github.com/nikbearbrown/INFO_6210
3. stackoverflow
4. Tutorialspoint

In built functions and methods required for data cleaning, formatting options were referred from above mentioned sites

Percentage of code written - 90%
Percentage of code referred from above mentioned sources - 10%

LICENSE:

Copyright <2019> <RAJENDRA KUMAR RAJKUMAR, MONISH  HIRISAVE RAGHU>

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


"""

'"\nCONCLUSIONS:\n\nThis code scraps data from a airline information website and stores them in relevant lists, stores them in a data frame and exports it to a CSV file.\n\n\nCONTRIBUTIONS:\n\nRAJENDRA KUMAR RAJKUMAR - 100%\n\n\nCITATIONS:\n\n1. https://www.geeksforgeeks.org\n2. https://github.com/nikbearbrown/INFO_6210\n3. stackoverflow\n4. Tutorialspoint\n\nIn built functions and methods required for data cleaning, formatting options were referred from above mentioned sites\n\nPercentage of code written - 90%\nPercentage of code referred from above mentioned scources - 10%\n\nLICENSE:\n\nCopyright <2019> <RAJENDRA KUMAR RAJKUMAR, MONISH  HIRISAVE RAGHU>\n\nPermission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and

24
24
23
23
23
23
23
