# Task 1

---

## Web scraping and analysis


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 15
page_size = 200

reviews = []

for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    response = requests.get(url)

    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())

    print(f"   ---> {len(reviews)} total reviews")

Scraping page 1
   ---> 200 total reviews
Scraping page 2
   ---> 400 total reviews
Scraping page 3
   ---> 600 total reviews
Scraping page 4
   ---> 800 total reviews
Scraping page 5
   ---> 1000 total reviews
Scraping page 6
   ---> 1200 total reviews
Scraping page 7
   ---> 1400 total reviews
Scraping page 8
   ---> 1600 total reviews
Scraping page 9
   ---> 1800 total reviews
Scraping page 10
   ---> 2000 total reviews
Scraping page 11
   ---> 2200 total reviews
Scraping page 12
   ---> 2400 total reviews
Scraping page 13
   ---> 2600 total reviews
Scraping page 14
   ---> 2800 total reviews
Scraping page 15
   ---> 3000 total reviews


In [None]:
data = pd.DataFrame()
data["reviews"] = reviews
data.head()

Unnamed: 0,reviews
0,✅ Trip Verified | Boarding was difficult caus...
1,✅ Trip Verified | Boarding started with a del...
2,✅ Trip Verified | Absolutely horrible custome...
3,Not Verified | BA is not what it used to be! ...
4,"✅ Trip Verified | BA First, it's not even the..."


In [None]:
data.to_csv("/content/drive/MyDrive/Forage /British_Airways/Scrapped_data.csv", header = 'true')

## Data Manipulation

In [None]:
data.head(10)

Unnamed: 0,reviews
0,✅ Trip Verified | Boarding was difficult caus...
1,✅ Trip Verified | Boarding started with a del...
2,✅ Trip Verified | Absolutely horrible custome...
3,Not Verified | BA is not what it used to be! ...
4,"✅ Trip Verified | BA First, it's not even the..."
5,✅ Trip Verified | The worst business class ex...
6,Not Verified | Quite possibly the worst busin...
7,Not Verified | I will never be flying with BA...
8,✅ Trip Verified | On the my trip to Mexico Ci...
9,✅ Trip Verified | I upgraded at check in to C...


In [None]:
data.columns

Index(['reviews'], dtype='object')

In [None]:
import nltk
from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.corpus import stopwords
import string

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Tokenize the words and remove stopwords

In [None]:
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string
import matplotlib.pyplot as plt
import seaborn as sns


df = pd.DataFrame(data)

def preprocess_text(review):
    tokens = word_tokenize(review)
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens if (word.isalnum() and word.lower() not in stop_words)]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Apply tokenization to the 'reviews' column
df['Tokenized Reviews'] = df['reviews'].apply(preprocess_text)

# Display the DataFrame with tokenized reviews
print(df[['reviews', 'Tokenized Reviews']])


                                                reviews  \
0     ✅ Trip Verified |  Boarding was difficult caus...   
1     ✅ Trip Verified |  Boarding started with a del...   
2     ✅ Trip Verified |  Absolutely horrible custome...   
3     Not Verified |  BA is not what it used to be! ...   
4     ✅ Trip Verified |  BA First, it's not even the...   
...                                                 ...   
2995  Club World from LHR-ORD. Busy flight on the ag...   
2996  This review refers to a return trip from India...   
2997  This review concerns LHR-BOM in Business Class...   
2998  Family of 3 - 747 out (upper deck) 777 back. F...   
2999  Travelled from Venice to Gatwick on 28th May c...   

                                      Tokenized Reviews  
0     trip verified boarding difficult caused vast m...  
1     trip verified boarding started delay 20 minute...  
2     trip verified absolutely horrible customer ser...  
3     verified ba used much like onboard crew check ...  
4

In [None]:
df.head(2)

Unnamed: 0,reviews,Tokenized Reviews
0,✅ Trip Verified | Boarding was difficult caus...,trip verified boarding difficult caused vast m...
1,✅ Trip Verified | Boarding started with a del...,trip verified boarding started delay 20 minute...


### Tokenized Data for the reviews

In [None]:
df["Tokenized Reviews"][0]

'trip verified boarding difficult caused vast majority passenger carrying much hand luggage fa friendly seat ba european flight extremely narrow choice breakfast surprising champagne castelau european flight better quality brand used club intercontinental flight nothing wrong flight however pleasant due unpleasant seat waiting time brussels luggage 20 minute acceptable'

In [None]:
df["Tokenized Reviews"][1]

'trip verified boarding started delay 20 minute everybody could see member team arrive 5 en 3 minute announced boarding time wait outside nearly 15 minute hot unfortunately flying barbados london club class underwhelming experience ba performing great performance international flight anymore lucky 777 new configuration welcome aboard champagne fruit juice glass filled level one inch far cheaper brand champagne heidsieck monopole rosé champagne available wine also poor french wine cognac also available anymore meal sub par steak replaced beef stew fa made clear dessert cheese 4 hour flight ice available minimal information flight deck breakfast suitable business class fa friendly others far pleasant experience despite high fare'

## SentimentIntensityAnalyzer is inbuilt library for the sentiment analysis <br> gives positive and negative answer

In [None]:
import nltk
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
sid = SentimentIntensityAnalyzer()
df['Sentiment Score'] = df['reviews'].apply(lambda x: sid.polarity_scores(x)['compound'])

# Assign sentiment labels
df['Sentiment Label'] = df['Sentiment Score'].apply(lambda score: 'Positive' if score >= 0.05 else ('Negative' if score <= -0.05 else 'Neutral'))


In [None]:
df["Sentiment Label"].head(20)

0     Positive
1     Positive
2     Negative
3     Positive
4     Negative
5     Negative
6     Positive
7     Negative
8     Positive
9     Positive
10    Negative
11    Negative
12    Negative
13    Negative
14    Negative
15    Negative
16     Neutral
17    Positive
18    Positive
19    Positive
Name: Sentiment Label, dtype: object

In [None]:
df.head(10)

Unnamed: 0,reviews,Tokenized Reviews,Sentiment Score,Sentiment Label
0,✅ Trip Verified | Boarding was difficult caus...,trip verified boarding difficult caused vast m...,0.875,Positive
1,✅ Trip Verified | Boarding started with a del...,trip verified boarding started delay 20 minute...,0.6987,Positive
2,✅ Trip Verified | Absolutely horrible custome...,trip verified absolutely horrible customer ser...,-0.8908,Negative
3,Not Verified | BA is not what it used to be! ...,verified ba used much like onboard crew check ...,0.949,Positive
4,"✅ Trip Verified | BA First, it's not even the...",trip verified ba first even best business clas...,-0.6992,Negative
5,✅ Trip Verified | The worst business class ex...,trip verified worst business class experience ...,-0.5307,Negative
6,Not Verified | Quite possibly the worst busin...,verified quite possibly worst business class e...,0.8799,Positive
7,Not Verified | I will never be flying with BA...,verified never flying ba first last flying sin...,-0.9674,Negative
8,✅ Trip Verified | On the my trip to Mexico Ci...,trip verified trip mexico city opportunity exp...,0.4645,Positive
9,✅ Trip Verified | I upgraded at check in to C...,trip verified upgraded check club europe seat ...,0.9844,Positive


In [None]:
count = (df["Sentiment Label"] == "Negative").sum()

print("Number of Negative Reviews:", count)
print("Number of Positive Reviews:", 3000 - count)



Number of Negative Reviews: 1313
Number of Positive Reviews: 1687


In [None]:
model_df = df[["reviews", "Sentiment Label"]]

In [None]:
model_df.head(10)

Unnamed: 0,reviews,Sentiment Label
0,✅ Trip Verified | Boarding was difficult caus...,Positive
1,✅ Trip Verified | Boarding started with a del...,Positive
2,✅ Trip Verified | Absolutely horrible custome...,Negative
3,Not Verified | BA is not what it used to be! ...,Positive
4,"✅ Trip Verified | BA First, it's not even the...",Negative
5,✅ Trip Verified | The worst business class ex...,Negative
6,Not Verified | Quite possibly the worst busin...,Positive
7,Not Verified | I will never be flying with BA...,Negative
8,✅ Trip Verified | On the my trip to Mexico Ci...,Positive
9,✅ Trip Verified | I upgraded at check in to C...,Positive


# Model Training and Vectorizing and calcualating Accuracy

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:

X_train, X_test, y_train, y_test = train_test_split(df['Tokenized Reviews'], df['Sentiment Label'], test_size=0.2, random_state=42)


In [None]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
X_train_tfidf

<2400x10434 sparse matrix of type '<class 'numpy.float64'>'
	with 167620 stored elements in Compressed Sparse Row format>

In [None]:
X_test_tfidf

<600x10434 sparse matrix of type '<class 'numpy.float64'>'
	with 40158 stored elements in Compressed Sparse Row format>

In [None]:
# Train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

In [None]:
y_pred = classifier.predict(X_test_tfidf)

In [None]:
y_pred

array(['Positive', 'Positive', 'Negative', 'Positive', 'Positive',
       'Positive', 'Negative', 'Positive', 'Positive', 'Positive',
       'Positive', 'Negative', 'Negative', 'Positive', 'Positive',
       'Negative', 'Positive', 'Positive', 'Negative', 'Positive',
       'Positive', 'Negative', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Negative',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Negative', 'Positive', 'Positive', 'Negative', 'Positive',
       'Negative', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Negative', 'Negative', 'Positive', 'Positive',
       'Positive', 'Negative', 'Positive', 'Positive', 'Negative',
       'Negative', 'Positive', 'Positive', 'Negative', 'Positive',
       'Positive', 'Positive', 'Positive', 'Negative', 'Positive',
       'Positive', 'Positive', 'Negative', 'Positive', 'Negati

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred) *100, "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 72.0 %

Classification Report:
               precision    recall  f1-score   support

    Negative       0.80      0.52      0.63       264
     Neutral       0.00      0.00      0.00        13
    Positive       0.69      0.91      0.78       323

    accuracy                           0.72       600
   macro avg       0.50      0.48      0.47       600
weighted avg       0.72      0.72      0.70       600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Testing for a new review and the model gives proper output

In [None]:
test_data = [
    'This flight was excellent! The service was outstanding.',
    'I had a terrible experience. The flight was delayed and the staff was rude.',
    'The journey was okay, nothing bad.'
]

# Preprocess the test data
preprocessed_test_data = [preprocess_text(review) for review in test_data]

X_test_tfidf = vectorizer.transform(preprocessed_test_data)

predicted_labels = classifier.predict(X_test_tfidf)

# Printing the output
for review, label in zip(test_data, predicted_labels):
    print(f"Review: {review} -->    Predicted Label: {label}")
    print("\n")

Review: This flight was excellent! The service was outstanding. -->    Predicted Label: Positive


Review: I had a terrible experience. The flight was delayed and the staff was rude. -->    Predicted Label: Negative


Review: The journey was okay, nothing bad. -->    Predicted Label: Positive


