### Import libraries and load data

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
# import nltk

In [2]:
# Load the raw British Airways review file into a DataFrame.
data = pd.read_csv(filepath_or_buffer="data/BA_reviews.csv")

In [3]:
# Preview the first rows of the raw data to confirm the file loaded and to see
# the "✅ Trip Verified | ..." prefix that the cleaning steps remove later.
data.head()

Unnamed: 0,reviews
0,✅ Trip Verified | Worst experience of my life...
1,✅ Trip Verified | Due to code sharing with Ca...
2,✅ Trip Verified | LHR check in was quick at t...
3,✅ Trip Verified | I wouldn't recommend Britis...
4,✅ Trip Verified | Absolutely horrible experie...


In [4]:
# Summary of the review column: `count` vs `unique` shows whether any review
# text is repeated, and `top`/`freq` show the most frequent review.
data.describe()

Unnamed: 0,reviews
count,3529
unique,3509
top,This was the last flight in a One World around...
freq,2


### Data Cleaning 

In [5]:
# Check for and remove duplicate reviews.
# Duplicates are judged on the review text alone, so the preview printed here
# and the rows dropped below use the same criterion.
duplicate_rows = data[data.duplicated(subset=['reviews'], keep=False)]
print(duplicate_rows)

# With the default keep='first' only the repeat occurrences are flagged, so the
# sum is the number of rows that will be removed.
duplicates = data.duplicated(subset=['reviews'])
print("Number of duplicates: ", duplicates.sum())

# drop_duplicates returns a new frame; rebinding `data` avoids inplace mutation.
# The original index labels are kept (not reset), exactly as before.
data = data.drop_duplicates(subset=['reviews'])

                                                reviews
2357  British Airways from Tampa to Gatwick on Boein...
2358  London Heathrow to Miami on one of British Air...
2385  British Airways from Tampa to Gatwick on Boein...
2387  London Heathrow to Miami on one of British Air...
2797  Travelled with my wife from Barbados to London...
2798  LGW-AGP Club Europe. Ground staff were very ch...
2800  Travelled with my wife from Barbados to London...
2801  LGW-AGP Club Europe. Ground staff were very ch...
2898  This was the last flight in a One World around...
2899  London Gatwick (LGW) to Lanzarote (ACE). We ha...
2900  This was the last flight in a One World around...
2901  London Gatwick (LGW) to Lanzarote (ACE). We ha...
2994  LGW-CUN Club World seat 4A. To be honest I was...
2995  Recently completed a round-trip LGW-Barcelona....
2996  LHR-HKG-LHR in First. A brief trip to celebrat...
2997  BA179 Jan 6 2015 LHR-JFK. First. T5 is great f...
2998  London-Gatwick to Venice on 30 December 20

In [6]:
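# One-time setup: if the NLTK stop-word corpus is not installed locally,
# uncomment `import nltk` in the import cell and run the line below.
# nltk.download('stopwords')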

In [7]:
# Build the cleaned copy of the de-duplicated reviews.
stop = stopwords.words('english')
# Membership tests run once per token, so use a hashed set instead of scanning
# the stop-word list every time.
stop_words = frozenset(stop)


def _normalise_review(text):
    """Return one review lowercased, without stop words and without parentheses.

    `str.split()` with no argument splits on any run of whitespace, so the
    re-join also collapses tabs, line breaks and repeated spaces into single
    spaces.
    """
    # Convert text to lowercase
    tokens = (token.lower() for token in text.split())
    # Remove stop words.
    # NOTE(review): tokens are compared verbatim, so words with attached
    # punctuation, or contractions typed with a curly apostrophe (e.g.
    # "couldn’t"), presumably do not match the stop-word list -- confirm
    # whether that is intended.
    kept = " ".join(token for token in tokens if token not in stop_words)
    # Remove ( and ) -- done after the stop-word filter, as before.
    return kept.replace('(', '').replace(')', '')


df = pd.DataFrame()
df['reviews'] = data['reviews'].apply(_normalise_review)

df.head()

Unnamed: 0,reviews
0,✅ trip verified | worst experience life trying...
1,✅ trip verified | due code sharing cathay paci...
2,✅ trip verified | lhr check quick first wing q...
3,✅ trip verified | recommend british airways al...
4,✅ trip verified | absolutely horrible experien...


#### All Reviews

In [8]:
# Remove the unwanted "✅ trip verified |" style prefix from every review.
# Split on the FIRST "|" only (maxsplit=1): split('|')[-1] would keep just the
# text after the LAST pipe and silently drop the start of any review whose body
# contains a "|". Reviews with no "|" at all are kept whole.
all_reviews = [review.split('|', 1)[-1].strip() for review in df.reviews]

In [9]:
# Preview a few cleaned reviews; echoing the whole list would flood the
# notebook output with every review.
all_reviews[:5]

['worst experience life trying deal customer service british airways. many issues getting one destination other. absolutely horrible time trying get answers reach anyone capable helping navigate uncertainties. wrote formal complaint requesting type refund received generic “we sorry” email 5000 avios attached equivalent maybe $50 $600+ flight. needless say go waste refuse fly untrustworthy incompetent unorganized airline couldn’t even address one single issue expressed them. equivalent handing band aid slapping face. say flight smooth onboard staff kind attentive. frustrating thing customer service trained give runaround heavily point exhaustion give trying take loss.',
 'due code sharing cathay pacific downgraded ba return leg. can’t describe cheated felt. booked paid airline great entertainment, food service, instead, forced go hungry one choice meal inedible. ensure passengers didn’t get funny ideas assume vegetarian option unavailable might given cracker small packet nuts. cabin cre

In [10]:
# Sanity check: list any cleaned review that still contains a line break.
# Expected to be empty, because the whitespace split/join in the cleaning step
# already turned every run of whitespace into a single space.
[review for review in all_reviews if any(ch in review for ch in ("\n", "\r"))]

[]

In [11]:
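# Disabled: the check above returned an empty list, so there are no "\n" or
# "\r" characters left to strip from all_reviews.
# all_reviews = [text.replace('\n', '').replace('\r', '') for text in all_reviews]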

#### Verified Reviews

In [12]:
# Keep only the reviews carrying the "✅ Trip Verified" marker and drop that
# prefix. Split on the FIRST "|" only (maxsplit=1): split('|')[1] would cut the
# review short at any later "|" that appears inside the review text itself.
verified = [
    review.split('|', 1)[1].strip()
    for review in data.reviews
    if '✅ Trip Verified' in review
]

# Preview a few entries rather than echoing the whole list.
verified[:5]

['Worst experience of my life trying to deal with the customer service for British Airways. So many issues with getting from one destination to the other. Had an absolutely horrible time trying to get answers or reach anyone who was capable of helping navigate uncertainties. I wrote them a formal complaint requesting some type of refund and received a generic “we are sorry” email with 5000 avios attached which is equivalent to maybe $50 on a $600+ flight. Needless to say these will go to waste as I refuse to fly with an untrustworthy incompetent unorganized airline that couldn’t even address one single issue expressed to them. It was equivalent to handing me a band aid after slapping me in the face. I will say the flight itself was smooth and the onboard staff was kind and attentive. The most frustrating thing is that the customer service is trained to give you the runaround so heavily to the point of exhaustion that you just give up trying and take the loss.',
 'Due to code sharing wi

In [13]:
# Convert text to lowercase
verified = [review.lower() for review in verified]

# Remove stop words. Membership is tested once per token, so use a hashed set
# instead of scanning the stop-word list every time.
stop = stopwords.words('english')
stop_words = frozenset(stop)
verified = [
    " ".join(word for word in review.split() if word not in stop_words)
    for review in verified
]

# Remove ( and ) so the verified subset gets the same clean-up as the
# all-reviews data; this step was missing from the verified pipeline.
verified = [review.replace('(', '').replace(')', '') for review in verified]

# Preview a few entries rather than echoing the whole list.
verified[:5]

['worst experience life trying deal customer service british airways. many issues getting one destination other. absolutely horrible time trying get answers reach anyone capable helping navigate uncertainties. wrote formal complaint requesting type refund received generic “we sorry” email 5000 avios attached equivalent maybe $50 $600+ flight. needless say go waste refuse fly untrustworthy incompetent unorganized airline couldn’t even address one single issue expressed them. equivalent handing band aid slapping face. say flight smooth onboard staff kind attentive. frustrating thing customer service trained give runaround heavily point exhaustion give trying take loss.',
 'due code sharing cathay pacific downgraded ba return leg. can’t describe cheated felt. booked paid airline great entertainment, food service, instead, forced go hungry one choice meal inedible. ensure passengers didn’t get funny ideas assume vegetarian option unavailable might given cracker small packet nuts. cabin cre

In [14]:
# Sanity check: list any verified review that still contains a line break.
# Expected to be empty, because the split/join used for stop-word removal
# already turned every run of whitespace into a single space.
[review for review in verified if any(ch in review for ch in ("\n", "\r"))]

[]

In [15]:
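# Disabled: the check above returned an empty list, so there are no "\n" or
# "\r" characters left to strip from verified.
# verified = [text.replace('\n', '').replace('\r', '') for text in verified]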

In [16]:
# Persist both cleaned corpora as single-column CSV files (no index column).

# All reviews, verified or not.
clean_all_reviews = pd.DataFrame(data={"reviews": all_reviews})
clean_all_reviews.to_csv(path_or_buf="data/clean_all_reviews.csv", index=False)

# Verified reviews only.
clean_verified_reviews = pd.DataFrame(data={"reviews": verified})
clean_verified_reviews.to_csv(path_or_buf="data/clean_verified_reviews.csv", index=False)