## Task 1 : Web Scraping 

In [3]:
#importing necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
#saving the link in URL
url = 'https://www.airlinequality.com/airline-reviews/british-airways'

In [5]:
url

'https://www.airlinequality.com/airline-reviews/british-airways'

In [6]:
#checking if the url is working fine or not
page = requests.get(url)

In [7]:
page

<Response [200]>

In [8]:
soup = BeautifulSoup(page.text,'lxml')
soup

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7 lt-ie10" lang="en-GB"> <![endif]--><!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8 lt-ie10" lang="en-GB"> <![endif]--><!--[if IE 8]>    <html class="no-js lt-ie9 lt-ie10" lang="en-GB"> <![endif]--><!--[if IE 9]>    <html class="no-js lt-ie10" lang="en-GB"> <![endif]--><!--[if gt IE 8]><!--><html lang="en-GB"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<title>British Airways Customer Reviews - SKYTRAX</title>
<!-- Google Chrome Frame for IE -->
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<!-- mobile meta -->
<meta content="True" name="HandheldFriendly"/>
<meta content="320" name="MobileOptimized"/>
<meta content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<!-- icons & favicons -->
<link href="https://www.airlinequality.com/wp-content/themes/airlinequality2014new/library/images/apple-icon-touch.png" rel="apple-touch-ic

In [9]:
reviews = []
for i in range(1,11): #loop will run from 0 to 10, as scraping 10 pages
    url='https://www.airlinequality.com/airline-reviews/british-airways/page/'+str(i)+'/?sortby=post_date%3ADesc&pagesize=100'
    page = requests.get(url)
    parsed_content = BeautifulSoup(page.text, 'html.parser')
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())

In [10]:
len(reviews)

1000

In [11]:
df=pd.DataFrame({'reviews':reviews})

In [12]:
df.head(10)

Unnamed: 0,reviews
0,Not Verified | Food was lousy. Who ever is pl...
1,✅ Trip Verified | Had the worst experience. Th...
2,✅ Trip Verified | The ground staff were not h...
3,✅ Trip Verified | Second time BA Premium Econ...
4,Not Verified | They changed our Flights from ...
5,Not Verified | At Copenhagen the most chaotic...
6,✅ Trip Verified | Worst experience of my life...
7,✅ Trip Verified | Due to code sharing with Ca...
8,✅ Trip Verified | LHR check in was quick at t...
9,✅ Trip Verified | I wouldn't recommend Britis...


In [13]:
df.to_csv(r'C:\Users\PUJA\Desktop\data\BA_reviews1.csv',index=False)

In [14]:
#loading the data now
review_data = pd.read_csv(r'C:\Users\PUJA\Desktop\data\BA_reviews1.csv')
review_data.head()

Unnamed: 0,reviews
0,Not Verified | Food was lousy. Who ever is pl...
1,✅ Trip Verified | Had the worst experience. Th...
2,✅ Trip Verified | The ground staff were not h...
3,✅ Trip Verified | Second time BA Premium Econ...
4,Not Verified | They changed our Flights from ...


In [15]:
#Data preprocessing

df_text = review_data[['reviews']]

df_text['reviews'][0]

'Not Verified |  Food was lousy. Who ever is planning the Asian Hindu Vegetarian meal is clueless as to what this meal includes. The snack was also lousy. It took us 2 hours just to go from T5 to T3 and clear security check. The place was chaotic. We get to our boarding gate and again a long line with 40 minutes wait for someone to check passport and boarding pass. Seats in Premium economy was ok, but when the passenger in front reclines, there is very little space for the window seat passenger to cross over. Overall not a good airline.'

In [31]:

#words_list = ['✅ Trip Verified |','Not Verified |']
#removing unwanted words from all the reviews
#df_text["reviews"] = df_text["reviews"].replace("Not Verified |", " ")

df_text = df_text.drop('split_text', axis = 'columns')

In [32]:
df_text

Unnamed: 0,reviews
0,Not Verified | Food was lousy. Who ever is pl...
1,✅ Trip Verified | Had the worst experience. Th...
2,✅ Trip Verified | The ground staff were not h...
3,✅ Trip Verified | Second time BA Premium Econ...
4,Not Verified | They changed our Flights from ...
...,...
995,✅ Trip Verified | Delhi to London. Having rea...
996,✅ Trip Verified | When you travel British Airw...
997,✅ Trip Verified | British Airways gets plenty...
998,✅ Trip Verified | BA First Class is not even ...


In [35]:
df_text['reviews'] = df_text['reviews'].str.replace('Not Verified','')

In [37]:
df_text['reviews'] = df_text['reviews'].str.replace('✅ Trip Verified','')

  df_text['reviews'] = df_text['reviews'].str.replace('✅ Trip Verified |','')


In [38]:
df_text

Unnamed: 0,reviews
0,| Food was lousy. Who ever is planning the As...
1,| Had the worst experience. The flight from Lo...
2,| The ground staff were not helpful. Felt lik...
3,| Second time BA Premium Economy in a newer a...
4,| They changed our Flights from Brussels to L...
...,...
995,| Delhi to London. Having read many negative ...
996,| When you travel British Airways its like you...
997,| British Airways gets plenty of well deserve...
998,| BA First Class is not even that any more. S...


In [39]:
#Let's work with Sentiment Analysis as it is a review dataset
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [40]:
#creating object

obj = SentimentIntensityAnalyzer()

In [41]:
#creating a lambda function

function = lambda x : obj.polarity_scores(x)['compound']

In [42]:
#lets apply this lambda function on our dataframe

df_text['polarity'] = df_text['reviews'].apply(function)

In [43]:
df_text

Unnamed: 0,reviews,polarity
0,| Food was lousy. Who ever is planning the As...,-0.7881
1,| Had the worst experience. The flight from Lo...,-0.7890
2,| The ground staff were not helpful. Felt lik...,-0.8537
3,| Second time BA Premium Economy in a newer a...,0.9601
4,| They changed our Flights from Brussels to L...,-0.8055
...,...,...
995,| Delhi to London. Having read many negative ...,0.9869
996,| When you travel British Airways its like you...,0.8850
997,| British Airways gets plenty of well deserve...,0.9714
998,| BA First Class is not even that any more. S...,0.7498


In [46]:
#Grouping each review comment into different buckets
import numpy as np

df_text['bucket'] = np.where(df_text['polarity'] > 0.5, 'Positive',
                            np.where((df_text['polarity']>= -0.5) & (df_text['polarity'] <= 0.5),'Neutral',
                                     np.where(df_text['polarity'] <-0.5, 'Negative', 'NA')))

In [47]:
df_text

Unnamed: 0,reviews,polarity,bucket
0,| Food was lousy. Who ever is planning the As...,-0.7881,Negative
1,| Had the worst experience. The flight from Lo...,-0.7890,Negative
2,| The ground staff were not helpful. Felt lik...,-0.8537,Negative
3,| Second time BA Premium Economy in a newer a...,0.9601,Positive
4,| They changed our Flights from Brussels to L...,-0.8055,Negative
...,...,...,...
995,| Delhi to London. Having read many negative ...,0.9869,Positive
996,| When you travel British Airways its like you...,0.8850,Positive
997,| British Airways gets plenty of well deserve...,0.9714,Positive
998,| BA First Class is not even that any more. S...,0.7498,Positive


In [48]:
# finding how many reviews are there in each bucket

df_text.groupby('bucket').size().reset_index()

Unnamed: 0,bucket,0
0,Negative,362
1,Neutral,197
2,Positive,441


In [49]:
# using topic modeling on the extracted data
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation

In [50]:
#loading the actual data again
df = pd.read_csv(r'C:\Users\PUJA\Desktop\data\BA_reviews1.csv')
df.head()

Unnamed: 0,reviews
0,Not Verified | Food was lousy. Who ever is pl...
1,✅ Trip Verified | Had the worst experience. Th...
2,✅ Trip Verified | The ground staff were not h...
3,✅ Trip Verified | Second time BA Premium Econ...
4,Not Verified | They changed our Flights from ...


In [51]:
df['reviews'] = df['reviews'].str.replace('Not Verified','')

In [52]:
df['reviews'] = df['reviews'].str.replace('✅ Trip Verified','')

In [53]:
df.head()

Unnamed: 0,reviews
0,| Food was lousy. Who ever is planning the A...
1,| Had the worst experience. The flight from L...
2,| The ground staff were not helpful. Felt li...
3,| Second time BA Premium Economy in a newer ...
4,| They changed our Flights from Brussels to ...


In [55]:
#data preprocessing
#Converting everything into lower case 
#Removing special characters (anything which is not starting with alphabets)
#Removing punctuation marks
df['clean_text'] = df['reviews'].str.lower().str.replace('[^a-z\']',' ')

  df['clean_text'] = df['reviews'].str.lower().str.replace('[^a-z\']',' ')


In [56]:
df.head()

Unnamed: 0,reviews,clean_text
0,| Food was lousy. Who ever is planning the A...,food was lousy who ever is planning the a...
1,| Had the worst experience. The flight from L...,had the worst experience the flight from l...
2,| The ground staff were not helpful. Felt li...,the ground staff were not helpful felt li...
3,| Second time BA Premium Economy in a newer ...,second time ba premium economy in a newer ...
4,| They changed our Flights from Brussels to ...,they changed our flights from brussels to ...


In [58]:
#removing stop words (E.g. conjunction - i, is, are and so on)
stop = stopwords.words('english')

In [59]:
stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [60]:
#defining user defined function
def sw(x):         # Lets give it a name 'sw', inside we have given an argument (x)
    x = [word for word in x.split() if word not in stop]
    return " ".join(x)     # To get the output as a sentence after removing the stop words

df['split_text'] = df['clean_text'].apply(sw)

In [63]:
df['split_text'][0]

'food lousy ever planning asian hindu vegetarian meal clueless meal includes snack also lousy took us hours go clear security check place chaotic get boarding gate long line minutes wait someone check passport boarding pass seats premium economy ok passenger front reclines little space window seat passenger cross overall good airline'

In [64]:
#Creating the tfidf vectorizer object for our DTM

tfidf_vec = TfidfVectorizer()

# fitting this object on split_text column
tfidf_vec.fit(df['split_text'])

# create DTM
X = tfidf_vec.fit_transform(df['split_text'])
X


<1000x7128 sparse matrix of type '<class 'numpy.float64'>'
	with 60954 stored elements in Compressed Sparse Row format>

In [65]:
#building LDA model which will divide these topics into 5 topics

lda_model = LatentDirichletAllocation(n_components = 5)

#fit the model on our DTM (X)

lda_output = lda_model.fit_transform(X)

lda_output # This is the LDA model

array([[0.02791268, 0.02791242, 0.88834948, 0.02791278, 0.02791265],
       [0.02761532, 0.02761498, 0.88926223, 0.02789219, 0.02761528],
       [0.03082604, 0.03082554, 0.87669712, 0.03082564, 0.03082567],
       ...,
       [0.03200027, 0.03199971, 0.72027998, 0.18329121, 0.03242884],
       [0.02014074, 0.02013987, 0.91943823, 0.02014048, 0.02014068],
       [0.02155839, 0.0215623 , 0.91376256, 0.02155846, 0.02155829]])

In [66]:
#Convert the lda_output (in form of array) into a dataframe
# Rows ---> it represents documents
# columns ---> it represents topics

# we will also round off the probability scores

import numpy as np

df_doc_topic = pd.DataFrame(np.round(lda_output,2))

df_doc_topic

# we would want to add a prefix for our column names and row index

topicname = ['topic '+str(i) for i in range(lda_model.n_components)]
topicname

docname = ['Doc '+str(i) for i in range(len(df))]
docname

# we will use the above created topicname and docname inside our dataframe

df_doc_topic = pd.DataFrame(np.round(lda_output,2), columns = topicname, index=docname)

In [67]:
df_doc_topic

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4
Doc 0,0.03,0.03,0.89,0.03,0.03
Doc 1,0.03,0.03,0.89,0.03,0.03
Doc 2,0.03,0.03,0.88,0.03,0.03
Doc 3,0.03,0.03,0.89,0.03,0.03
Doc 4,0.03,0.03,0.89,0.03,0.03
...,...,...,...,...,...
Doc 995,0.02,0.02,0.92,0.02,0.02
Doc 996,0.02,0.02,0.92,0.02,0.02
Doc 997,0.03,0.03,0.72,0.18,0.03
Doc 998,0.02,0.02,0.92,0.02,0.02


In [68]:
## argmax command will be used to identify the max value of probability for each row/document

dominate_topic = np.argmax(df_doc_topic.values,axis = 1) # axis =1 is used for checking row-wise
dominate_topic

# we will add one more column in our dataframe for storing the dominating topic

df_doc_topic['domin_topic'] = dominate_topic

df_doc_topic

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,domin_topic
Doc 0,0.03,0.03,0.89,0.03,0.03,2
Doc 1,0.03,0.03,0.89,0.03,0.03,2
Doc 2,0.03,0.03,0.88,0.03,0.03,2
Doc 3,0.03,0.03,0.89,0.03,0.03,2
Doc 4,0.03,0.03,0.89,0.03,0.03,2
...,...,...,...,...,...,...
Doc 995,0.02,0.02,0.92,0.02,0.02,2
Doc 996,0.02,0.02,0.92,0.02,0.02,2
Doc 997,0.03,0.03,0.72,0.18,0.03,2
Doc 998,0.02,0.02,0.92,0.02,0.02,2


In [69]:
#Creating 2nd matrix (Topic (5) Vs Words)

df_tw = pd.DataFrame(lda_model.components_,columns = tfidf_vec.get_feature_names(), index = topicname)

df_tw



Unnamed: 0,aa,abandon,abandoned,aberdeen,ability,able,abnormally,aboard,abreakfast,abreast,...,zhr,zip,zone,zones,zoo,zrh,zuletzt,zum,zurich,zusammenschluss
topic 0,0.200031,0.323582,0.200042,0.200034,0.20003,0.20004,0.200052,0.200038,0.20007,0.200031,...,0.200053,0.200047,0.20003,0.200035,0.200042,0.200067,0.200014,0.200014,0.20003,0.200014
topic 1,0.200698,0.200082,0.200041,0.200034,0.200029,0.200022,0.200051,0.200038,0.200068,0.20003,...,0.200052,0.211757,0.200029,0.200035,0.200041,0.698735,0.262005,0.262005,0.200029,0.262005
topic 2,1.369337,0.226492,0.487818,1.06867,0.830526,5.34496,0.339018,1.022024,0.298044,0.519161,...,0.330222,0.574856,0.988339,0.711819,0.290603,0.214296,0.200682,0.200682,1.566048,0.200682
topic 3,0.200031,0.200083,0.200042,0.200034,0.20003,0.200022,0.200052,0.200038,0.200069,0.200031,...,0.200052,0.200046,0.20003,0.200035,0.200042,0.200067,0.200014,0.200014,0.20003,0.200014
topic 4,0.200031,0.200083,0.200042,0.200034,0.20003,0.200073,0.200052,0.200038,0.200069,0.200031,...,0.200052,0.200046,0.20003,0.200035,0.200042,0.200067,0.200014,0.200014,0.20003,0.200014


In [70]:
#creating user defined function for arranging the important keywords across each topic from highest to lowest value
def show_topics(vectorizer, model, n_words):
    #Create a array of words
    keywords = np.array(tfidf_vec.get_feature_names())
    # Create an empty list
    topic_keywords = []
    for  topic_weight in lda_model.components_:
        # for each topic we will arrange the word/terms in descending order of tf-idf score
        top_keyword_loc = (-topic_weight).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_loc))
        
    return topic_keywords

In [71]:
tfidf_vec.get_feature_names()



['aa',
 'abandon',
 'abandoned',
 'aberdeen',
 'ability',
 'able',
 'abnormally',
 'aboard',
 'abreakfast',
 'abreast',
 'abroad',
 'abrupt',
 'absent',
 'absolute',
 'absolutely',
 'absorbed',
 'abu',
 'abuja',
 'abundant',
 'abusive',
 'abysmal',
 'ac',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'accepted',
 'accepting',
 'accepts',
 'access',
 'accessible',
 'accessing',
 'accommodate',
 'accommodating',
 'accommodation',
 'accommodations',
 'accompanied',
 'accompanies',
 'accompany',
 'accomplish',
 'accomplishing',
 'according',
 'accordingly',
 'account',
 'accountability',
 'accountants',
 'accra',
 'accumulated',
 'accurate',
 'accuse',
 'accused',
 'accustomed',
 'ache',
 'achieve',
 'achieved',
 'achievement',
 'achive',
 'acknowledge',
 'acknowledged',
 'acknowledgement',
 'acknowledging',
 'acoustics',
 'across',
 'act',
 'acted',
 'action',
 'active',
 'actively',
 'activity',
 'actors',
 'actual',
 'actually',
 'ad',
 'adamant',
 'adaptive',
 'add',
 'added',
 'ad

In [72]:
topic_keywords = show_topics(vectorizer = tfidf_vec, model=lda_model, n_words=10)

topic_keywords

[array(['stuttgart', 'sardinia', 'strolled', 'stammers', 'applying',
        'lanyard', 'risked', 'julie', 'degree', 'outraged'], dtype='<U18'),
 array(['fco', 'greatly', 'zrh', 'diverted', 'wir', 'alex', 'cruz', 'und',
        'newest', 'cleanest'], dtype='<U18'),
 array(['flight', 'ba', 'service', 'london', 'seat', 'good', 'crew',
        'food', 'class', 'time'], dtype='<U18'),
 array(['gibraltar', 'favorite', 'bubbly', 'aged', 'clubs', 'heating',
        'tikka', 'masala', 'dispensable', 'commodity'], dtype='<U18'),
 array(['nashville', 'gibraltar', 'alaska', 'edreams', 'disappointments',
        'frustratingly', 'unload', 'kahina', 'ladjouze', 'closures'],
       dtype='<U18')]