### Importations 

In [1]:
import re
import json
import requests
import pandas as pd
from langdetect import detect
from bs4 import BeautifulSoup as bs
from urllib.request import Request, urlopen
from datetime import date, datetime, timedelta

In [2]:
def getAirportLinks(text_airport: str):
    """ Airport Recovery 
    Parmeters: 
        text_airport = description of airport
    Outers:
        airport = name of airport
        link = link of airport
    """ 
    airport = re.findall("\">(.*?)</a></li>", str(text_airport))[0]
    link = re.findall("href\=(.*?)>", str(text_airport))[0].replace("\"", "")
    return airport, link


def createDictionnary():
    """  Creation of the dictionary containing the name of the airport and its URL 
    Outers: 
        dic = dictionnary having the name of airport and link
    """
    dic = {}
    root = "https://www.airlinequality.com"
    url_page = root+"/review-pages/a-z-airport-reviews/"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
    req = Request(url_page, headers=headers)
    webpage = urlopen(req).read()
    soup = bs(webpage, 'html.parser')

    r = soup.find_all('li')
    list_text = [str(val) for val in r if "href=\"/airport-reviews/" in str(val)
                 and "article" not in str(val)]
    for texte in list_text:
        airport, link = getAirportLinks(texte)
        dic[airport.rstrip()] = root+link

    return dic


dic = createDictionnary()

In [110]:
def translate (texte) : 
    try :
        new = str(TextBlob(texte).translate(to='en'))
        return new
    except :
        return texte

In [3]:
def recovTextBetweenTags(texts: str, separator: str):
    """ Retrieving code between two tags
    
    Paramters:
        texts = Part of soup
        separator = Separator of soup
    Outers:
        description = Text wanted
    """ 
    text_clean = []
    lisI = []
    lisS = []

    for i in range(0, len(texts)):
        if str(texts[i]) == "<":
            lisI.append(i)
        if texts[i] == '>':
            lisS.append(i)

    len_lis = len(lisI)
    for h in range(0, len_lis):
        if h < (len_lis-1):
            text_clean.append(texts[lisS[h]:lisI[h+1]])

    if separator != 'non':
        description = str(text_clean).replace('>', '').replace(
            ',', '').replace('\'', '').replace('，', '')
        description = description.split(separator)
    else:
        description = text_clean

    return description

In [116]:
def scrap(dic: dict, nb:int):
    """ Code allows to recup datas
    Paramters: 
        dic = dictionnary having name and link airport
    Outer:
        dataAirline = dataFrame having scrap informations
    
    """
    name_col = ['Airline_Name', 'Airline_Type', 'Region_Operation', 'Aircraft_Type', 'Cabin_Class', 'Type_Of_Lounge',
                'Type_Of_Traveller', 'Date_Visit', 'Date_Flown', 'Airport', 'Route', 'Category', 'Category_Detail',
                'Cabin_Staff_Service', 'Lounge_Staff_Service', 'Bar_And_Beverages', 'Food_And_Beverages', 'Ground_Service', 'Catering', 'Cleanliness',
                'Lounge_Comfort', 'Aisle_Space', 'Wifi_And_Connectivity', 'Inflight_Entertainment', 'Viewing_Tv_Screen', 'Power_Supply',
                'Seat', 'Seat_type', 'Seat_Comfort', 'Seat_Legroom', 'Seat_Storage', 'Seat_Width', 'Seat_Recline', 'Washrooms',
                'Value_For_Money', 'Overall_Customer_Rating', 'Overall_Service_Rating', 'Overall_Airline_Rating',
                'Recommended', 'Departure_city', 'Arrival_city', 'Nb_bus_taken', 'Nb_train_taken',
                'Nb_car_taken', 'Nb_plane_taken', 'Duration', 'Price_min', 'Price_max', 'Nb_sharing', 'Awards', 'Registration', 'Language',
                'Queuing Times', 'Terminal_Seating', 'Terminal Signs', 'Airport_Shopping', 'Experience_At_Airport', 'Date_Review']

    dataAirline = pd.DataFrame(columns=name_col)

    for dic_key, dic_val in dic.items():
        r = requests.get(dic_val)
        page = r.text
        soup = bs(page, 'html.parser')
        nb_page = Nb_pages(soup)

        for j in range(1, nb_page+1):
            r = requests.get(dic_val + '/page/' + str(j) + '/')
            page = r.text
            soup = bs(page, 'html.parser')

            Date_Review = dateReview(soup, nb)
            
            title = title_comm(soup, nb)
            desc = description(soup, nb)
            note = UserNot(soup, nb)
            notGlo = NoteGlobal(soup, nb)
            

            airport = []
            for i in range(0, len(desc)):
                airport.append(dic_key)

            df = pd.DataFrame(data=[title, desc, note, airport])
            df = df.transpose()

            Title = df[0]
            Review = df[1]
            Date_Visit, Terminal_Cleanliness, Food_Beverages, Wifi_Connectivity, Airport_Staff, Recommended, Type_Of_Traveller, Queuing_Times, Terminal_Seating, Terminal_Signs, Airport_Shopping, Experience_At_Airport = transformColInDic(
                df[2])
            Airport = df[3]

            df_template = pd.DataFrame({'Date_Flown': Date_Visit, 'Cleanliness': Terminal_Cleanliness, 'Food_And_Beverages': Food_Beverages,
                                        'Wifi_And_Connectivity': Wifi_Connectivity, 'Cabin_Staff_Service': Airport_Staff, 'Overall_Customer_Rating': notGlo,
                                        'Recommended': Recommended, 'Title': Title, 'Review': Review, 'Airport': Airport, 'Type_Of_Traveller': Type_Of_Traveller,
                                        'Queuing_Times': Queuing_Times, 'Terminal_Seating': Terminal_Seating, 'Terminal_Signs': Terminal_Signs,
                                        'Airport_Shopping': Airport_Shopping, 'Experience_At_Airport': Experience_At_Airport, 'Date_Review': Date_Review})

            dataAirline = pd.concat([dataAirline, df_template])

    return dataAirline

In [5]:
def UserNot(soup: str, nb:int):
    """ Function to retrieve users' notes using the two notes functions. 
        The first one retrieves the maximum notes for each user. 
        The second one retrieves the category 'NA' 
    
    """
    list_not = notation2(soup, nb)
    noteUser = []
    value = []
    list_total = [' 1', '2', '3', '4', '5']
    for z in range(0, len(list_not)):
        dico = {}
        del list_not[z][0]
        for i in range(0, len(list_not[z])-2):
            if len(str(list_not[z][i]).replace(' ', '')) > 1:
                if len(str(list_not[z][i+1]).replace(' ', '')) > 1:
                    if list_not[z][i] not in value:
                        dico[list_not[z][i]] = list_not[z][i+1]
                        value.append(list_not[z][i+1])
                else:
                    j = i
                    while str(list_not[z][j+1]) in list_total:
                        dico[list_not[z][i]] = list_not[z][j+1]
                        j = j + 1
        noteUser.append(dico)

    counter_user = 0
    c_user_not_w_NA = 0
    p = notation(soup, nb)
    for k in noteUser:
        value = []
        t = 0
        for key, val in k.items():
            if val != 'N/A':
                if val == '5':
                    noteUser[counter_user][key] = p[c_user_not_w_NA][t]
                    t = t + 1
                    if t == len(p[c_user_not_w_NA]):
                        c_user_not_w_NA = c_user_not_w_NA + 1

        counter_user = counter_user + 1
    return noteUser

In [6]:
# Transform a date to standard format
def format_date(date):
    # Transform a string date into a standard format by trying each
    # date format. If you want to add a format, add a try/except in the
    # last except
    # date : str : the date to transform
    # return : m : timedata : format is YYYY-MM-DD HH:MM:SS
    date_str = date
    #
    date_str = date_str.replace("st","").replace("th","")        .replace("nd","").replace("rd","").replace(" Augu "," Aug ")
    m = None
    try:
        m = datetime.strptime(date_str, "%d %B %Y")
    except ValueError:
        try:
            m = datetime.strptime(date_str, "%d %b %Y")
        except ValueError:
            try:
                m = datetime.strptime(date_str, "%Y/%m/%d")
            except ValueError:
                try:
                    m = datetime                        .strptime(date_str,"%d/%m/%Y %H:%M:%S")
                except ValueError:
                    try:
                        m = datetime                            .strptime(date_str, "%Y-%m-%d %H:%M:%S")
                    except ValueError:
                        try :
                            m = datetime.strptime(date_str,
                                                       "%d %m %Y")
                        except ValueError:
                            # HERE ADD A FORMAT TO CHECK
                            print("Format not recognised. \nConsider "
                                  "adding a date format "
                                  "in the function \"format_date\".")

    return m

In [7]:
def transformColInDic(col_dic: dict):
    # Prend en paramètre la colonne du DF contenant le dictionnaire des notes
    Date_Visit = []
    Terminal_Cleanliness = []
    Food_Beverages = []
    Wifi_Connectivity = []
    Airport_Staff = []
    Recommended = []
    Type_Of_Traveller = []
    Queuing_Times = []
    Terminal_Seating = []
    Airport_shopping = []
    Terminal_Signs = []
    Experience_At_Airport = []

    for i in range(0, len(col_dic)):
        if 'Date Visit' in (col_dic[i]).keys():
            Date_Visit.append((col_dic[i]['Date Visit']))
        else:
            Date_Visit.append(' ')

        if ' Terminal Cleanliness' in (col_dic[i]).keys():
            Terminal_Cleanliness.append((col_dic[i][' Terminal Cleanliness']))
        else:
            Terminal_Cleanliness.append(' ')

        if ' Food Beverages' in (col_dic[i]).keys():
            Food_Beverages.append((col_dic[i][' Food Beverages']))
        else:
            Food_Beverages.append(' ')

        if ' Wifi Connectivity' in (col_dic[i]).keys():
            Wifi_Connectivity.append((col_dic[i][' Wifi Connectivity']))
        else:
            Wifi_Connectivity.append(' ')

        if ' Airport Staff' in (col_dic[i]).keys():
            Airport_Staff.append((col_dic[i][' Airport Staff']))
        else:
            Airport_Staff.append(' ')

        if ' Recommended' in (col_dic[i]).keys():
            Recommended.append((col_dic[i][' Recommended']))
        else:
            Recommended.append(' ')

        if 'Type Of Traveller' in (col_dic[i]).keys():
            Type_Of_Traveller.append((col_dic[i]['Type Of Traveller']))
        else:
            Type_Of_Traveller.append(' ')

        if 'Queuing Times' in (col_dic[i]).keys():
            Queuing_Times.append((col_dic[i]['Queuing Times']))
        else:
            Queuing_Times.append(' ')

        if ' Terminal Seating' in (col_dic[i]).keys():
            Terminal_Seating.append((col_dic[i][' Terminal Seating']))
        else:
            Terminal_Seating.append(' ')

        if ' Airport Shopping' in (col_dic[i]).keys():
            Airport_shopping.append((col_dic[i][' Airport Shopping']))
        else:
            Airport_shopping.append(' ')

        if ' Terminal Signs' in (col_dic[i]).keys():
            Terminal_Signs.append((col_dic[i][' Terminal Signs']))
        else:
            Terminal_Signs.append(' ')

        if 'Experience At Airport' in (col_dic[i]).keys():
            Experience_At_Airport.append((col_dic[i]['Experience At Airport']))
        else:
            Experience_At_Airport.append(' ')

    return Date_Visit, Terminal_Cleanliness, Food_Beverages, Wifi_Connectivity, Airport_Staff, Recommended, Type_Of_Traveller, Queuing_Times, Terminal_Seating, Terminal_Signs, Airport_shopping, Experience_At_Airport

In [8]:
def notation(soup: str, nb:int):
    note = []
    for span in soup.findAll('article', attrs={'itemprop': 'review'}):
        dat = str(recovTextBetweenTags(str(span.findAll('time', attrs={
                  'itemprop': 'datePublished'})), ',')).replace("['[", '').replace("]']", '')
        dat = (format_date(dat))
        
        if (dat) > (datetime.now() - timedelta(nb)):
            tab_not = span.findAll('span', attrs={'class': 'star fill'})
            notation_categ = re.findall(r'[0-9]', str(tab_not))
            if len(notation_categ) > 0:
                noteUser = []
                len_not_categ = len(notation_categ)
                for i in range(0, len_not_categ-1):
                    if notation_categ[i] >= notation_categ[i+1]:
                        noteUser.append(notation_categ[i])

                noteUser.append(notation_categ[len_not_categ-1])
                note.append(noteUser)
    
    return note

In [109]:
def title_comm(soup: str, nb:int):
    title = []
    for span in soup.findAll('article', attrs={'itemprop': 'review'}):
        dat = str(recovTextBetweenTags(str(span.findAll('time', attrs={
                  'itemprop': 'datePublished'})), ',')).replace("['[", '').replace("]']", '')
        dat = (format_date(dat))
        if (dat) > (datetime.now() - timedelta(nb)):
            top = span.findAll('h2', attrs={'class': 'text_header'})
            top = translate(recovTextBetweenTags(str(top), 'non'))
            title.append(top[0][1:len(top[0])])

    return title

In [10]:
def dateReview(soup: str, nb:int):
    dateR = []
    for span in soup.findAll('article', attrs={'itemprop': 'review'}):
        dat = str(recovTextBetweenTags(str(span.findAll('time', attrs={
                  'itemprop': 'datePublished'})), ',')).replace("['[", '').replace("]']", '')
        dat = (format_date(dat))

        if (dat) > (datetime.now() - timedelta(nb)):
            top = span.findAll('time', attrs={'itemprop': 'datePublished'})
            dateR.append(recovTextBetweenTags(str(top), ','))

    return dateR

In [11]:
def NoteGlobal(soup: str, nb: int):
    notGlo = []
    for span in soup.findAll('article', attrs={'itemprop': 'review'}):
        dat = str(recovTextBetweenTags(str(span.findAll('time', attrs={
                  'itemprop': 'datePublished'})), ',')).replace("['[", '').replace("]']", '')
        dat = (format_date(dat))
        if (dat) > (datetime.now() - timedelta(nb)):
            top = span.findAll('span', attrs={'itemprop': 'ratingValue'})
            notGlo.append(recovTextBetweenTags(str(top), ','))

    return notGlo

In [108]:
def description(soup: str, nb:int):
    desc = []
    for span in soup.findAll('article', attrs={'itemprop': 'review'}):
        dat = str(recovTextBetweenTags(str(span.findAll('time', attrs={
                  'itemprop': 'datePublished'})), ',')).replace("['[", '').replace("]']", '')
        dat = (format_date(dat))
        if (dat) > (datetime.now() - timedelta(nb)):
            top = span.findAll('div', attrs={'class': 'text_content'})
            desc.append(translate(recovTextBetweenTags(str(top), ',')))

    return desc

In [13]:
def notation2(soup: str, nb:int):
    note = []
    for span in soup.findAll('article', attrs={'itemprop': 'review'}):
        
        dat = str(recovTextBetweenTags(str(span.findAll('time', attrs={
                  'itemprop': 'datePublished'})), ',')).replace("['[", '').replace("]']", '')
        dat = (format_date(dat))
        if (dat) > (datetime.now() - timedelta(nb)):
            
            not_tot_tab = span.findAll('table', attrs={'class': 'review-ratings'})
            not_tot_tab = (recovTextBetweenTags(str(not_tot_tab), ','))
            note.append(str(str(not_tot_tab).replace(
                '\\n', '').replace('\\', '')).split('  '))

    Rating = []
    for elem in note:
        if len(elem) != 0:
            Rating.append(elem)

    return Rating

In [14]:
def Nb_pages(soup: str):
    nb_page_total = soup.find('div', attrs={'class': 'pagination-total'})
    if nb_page_total != None:
        nb_page_total = str(nb_page_total)
        nb_pages = int(nb_page_total[41:len(nb_page_total)-14])//10 + 1
    else:
        nb_pages = 1
    return(nb_pages)

### Launching scrapping + json recording

In [104]:
def addJSON(file: str, df, creat: bool):

    if creat is False :
        with open(file) as train_file:
            dict_train = json.load(train_file)
        data = pd.read_json(dict_train, orient="records")
        df = pd.concat([data, df])
    
    js = df.to_json(orient='records').replace(
        "[\\\"[", '').replace("]\\\"]", '')
    
    with open(file, 'w', encoding='utf8') as outfile:
        json.dump(js, outfile, ensure_ascii=False, indent=4)

In [117]:
# Starting the scrap function
df = scrap(dice, 20000)
# Transformation and Complete of Json
addJSON(file, df, True)

Empty DataFrame
Columns: [Aircraft_Type, Airline_Name, Airline_Type, Airport, Airport_Shopping, Aisle_Space, Arrival_city, Awards, Bar_And_Beverages, Cabin_Class, Cabin_Staff_Service, Category, Category_Detail, Catering, Cleanliness, Date_Flown, Date_Review, Date_Visit, Departure_city, Duration, Experience_At_Airport, Food_And_Beverages, Ground_Service, Inflight_Entertainment, Language, Lounge_Comfort, Lounge_Staff_Service, Nb_bus_taken, Nb_car_taken, Nb_plane_taken, Nb_sharing, Nb_train_taken, Overall_Airline_Rating, Overall_Customer_Rating, Overall_Service_Rating, Power_Supply, Price_max, Price_min, Queuing Times, Queuing_Times, Recommended, Region_Operation, Registration, Review, Route, Seat, Seat_Comfort, Seat_Legroom, Seat_Recline, Seat_Storage, Seat_Width, Seat_type, Terminal Signs, Terminal_Seating, Terminal_Signs, Title, Type_Of_Lounge, Type_Of_Traveller, Value_For_Money, Viewing_Tv_Screen, Washrooms, Wifi_And_Connectivity]
Index: []

[0 rows x 62 columns]


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


