<a href="https://colab.research.google.com/github/RitikArora24/BritishAirways-project-with-webScraping/blob/main/Web_scraping_beautifulsoup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Collection

In this phase we will collect the customer ratings data from the airline quality website called [Skytrax](https://www.airlinequality.com/airline-reviews/british-airways). We will collect data about airline ratings, seat ratings and lounge experience ratings from this website.

In [1]:
#imports

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [2]:
#create an empty list to collect all reviews
reviews  = []

#create an empty list to collect rating stars
stars = []

#create an empty list to collect date
date = []

#create an empty list to collect country the reviewer is from
country = []

In [3]:
for i in range(1, 36):
    page = requests.get(f"https://www.airlinequality.com/airline-reviews/british-airways/page/{i}/?sortby=post_date%3ADesc&pagesize=100")

    soup = BeautifulSoup(page.content, "html5")

    for item in soup.find_all("div", class_="text_content"):
        reviews.append(item.text)

    for item in soup.find_all("div", class_ = "rating-10"):
        try:
            stars.append(item.span.text)
        except:
            print(f"Error on page {i}")
            stars.append("None")

    #date
    for item in soup.find_all("time"):
        date.append(item.text)

    #country
    for item in soup.find_all("h3"):
        country.append(item.span.next_sibling.text.strip(" ()"))

Error on page 31
Error on page 33
Error on page 33
Error on page 35


In [4]:
#check the length of total reviews extracted
len(reviews)

3500

In [5]:
len(country)

3500

In [6]:
len(stars)

3535

In [7]:
#check the length
stars = stars[:3500]

In [8]:
#create  a dataframe from these collected lists of data

df = pd.DataFrame({"reviews":reviews,"stars": stars, "date":date, "country": country})

In [9]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | The worst airline I have e...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,7th October 2023,Australia
1,"✅ Trip Verified | Excellent service levels, ...",1,7th October 2023,United Kingdom
2,Not Verified | Booked a very special holiday ...,10,5th October 2023,United Kingdom
3,"Not Verified | Just returned from Chicago, fle...",1,3rd October 2023,United Kingdom
4,✅ Trip Verified | BA standards continue to de...,2,2nd October 2023,United Kingdom


In [10]:
df.shape

(3500, 4)

### Export the data into a csv format

In [11]:
import os

cwd = os.getcwd()
df.to_csv(cwd+ "/BA_reviews.csv")

#Data Cleaning
Now since we have extracted data from the website, it is not cleaned and ready to be analyzed yet. The reviews section will need to be cleaned for punctuations, spellings and other characters.

In [12]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [13]:
#create a dataframe from csv file

cwd = os.getcwd()

df = pd.read_csv(cwd+"/BA_reviews.csv", index_col=0)

In [14]:
df.head()


Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | The worst airline I have e...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,7th October 2023,Australia
1,"✅ Trip Verified | Excellent service levels, ...",1,7th October 2023,United Kingdom
2,Not Verified | Booked a very special holiday ...,10,5th October 2023,United Kingdom
3,"Not Verified | Just returned from Chicago, fle...",1,3rd October 2023,United Kingdom
4,✅ Trip Verified | BA standards continue to de...,2,2nd October 2023,United Kingdom


In [15]:
df['verified'] = df.reviews.str.contains("Trip Verified")
df['verified']

0        True
1        True
2       False
3       False
4        True
        ...  
3495    False
3496    False
3497    False
3498    False
3499    False
Name: verified, Length: 3500, dtype: bool

#Cleaning Reviews
We will extract the column of reviews into a separate dataframe and clean it for semantic analysis

In [18]:
#for lemmatization of words we will use nltk library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
lemma = WordNetLemmatizer()
nltk.download('wordnet')


reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [19]:
# add the corpus to the original dataframe

df['corpus'] = corpus

In [20]:
df.head()


Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | The worst airline I have e...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,7th October 2023,Australia,True,worst airline ever flown allocated back row ne...
1,"✅ Trip Verified | Excellent service levels, ...",1,7th October 2023,United Kingdom,True,excellent service level proactive crew superb ...
2,Not Verified | Booked a very special holiday ...,10,5th October 2023,United Kingdom,False,verified booked special holiday partner septem...
3,"Not Verified | Just returned from Chicago, fle...",1,3rd October 2023,United Kingdom,False,verified returned chicago flew day ago america...
4,✅ Trip Verified | BA standards continue to de...,2,2nd October 2023,United Kingdom,True,ba standard continue decline every time fly ti...


#Cleaning/Fromat date

In [21]:
df.dtypes

reviews     object
stars       object
date        object
country     object
verified      bool
corpus      object
dtype: object

In [22]:
# convert the date to datetime format

df.date = pd.to_datetime(df.date)