In [1]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [2]:
df = pd.read_csv("BA_airways.csv", index_col=0)

In [3]:

df.head()

Unnamed: 0,reviews,stars,date,country
0,"Not Verified | Before my flight, I was forced ...",5.0,15th August 2024,United Kingdom
1,✅ Trip Verified | British Airways at its bes...,1.0,12th August 2024,United Kingdom
2,✅ Trip Verified | An excellent flight! Despite...,8.0,12th August 2024,Lebanon
3,✅ Trip Verified | I recently traveled with Bri...,8.0,11th August 2024,United States
4,✅ Trip Verified | My family and I were booke...,3.0,9th August 2024,United Kingdom


In [4]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [5]:
df['verified']

0       False
1        True
2        True
3        True
4        True
        ...  
3495    False
3496    False
3497    False
3498    False
3499    False
Name: verified, Length: 3500, dtype: bool

In [8]:
import string
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

In [13]:
# Start by cleaning up the series
reviews_data = df['reviews'].str.strip("✅ Trip Verified |")

corpus = []

# Loop through each review
for review in reviews_data:
    # Perform string replacements directly on the review (which is a string)
    review = review.replace('Trip Verified |', '')
    review = review.replace('✅', '')
    review = review.replace('|', '')
    # Use regular expressions to remove short words (length <= 3)
    review = re.sub(r'\b\w{1,3}\b', '', review)
    # Apply the punctuation removal function
    review = remove_punctuations(review)
    # Clean up extra spaces
    review = " ".join(review.split())
    # Append the cleaned review to the corpus
    corpus.append(review)

# Convert the corpus into a DataFrame or use it further in your analysis
corpus_df = pd.DataFrame(corpus, columns=['cleaned_reviews'])
corpus_df.head()


Unnamed: 0,cleaned_reviews
0,Verified Before flight forced ground staff che...
1,British Airways best Outstanding service from ...
2,excellent flight Despite this being hour fligh...
3,recently traveled with British Airways mixed e...
4,family were booked leave London Hong Kong Augu...


In [15]:

df['cleaned_reviews'] = corpus

In [16]:

df.head()

Unnamed: 0,reviews,stars,date,country,verified,cleaned_reviews
0,"Not Verified | Before my flight, I was forced ...",5.0,15th August 2024,United Kingdom,False,Verified Before flight forced ground staff che...
1,✅ Trip Verified | British Airways at its bes...,1.0,12th August 2024,United Kingdom,True,British Airways best Outstanding service from ...
2,✅ Trip Verified | An excellent flight! Despite...,8.0,12th August 2024,Lebanon,True,excellent flight Despite this being hour fligh...
3,✅ Trip Verified | I recently traveled with Bri...,8.0,11th August 2024,United States,True,recently traveled with British Airways mixed e...
4,✅ Trip Verified | My family and I were booke...,3.0,9th August 2024,United Kingdom,True,family were booked leave London Hong Kong Augu...


In [17]:

df.dtypes

reviews             object
stars              float64
date                object
country             object
verified              bool
cleaned_reviews     object
dtype: object

In [19]:
# Function to remove ordinal indicators (st, nd, rd, th) from the date string
def remove_ordinal_indicators(date_str):
    return re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)

In [20]:
# Apply the function to the date column
df['date'] = df['date'].apply(remove_ordinal_indicators)

In [21]:
# convert the date to datetime format

df.date = pd.to_datetime(df.date)

In [22]:
df.date.head()

0   2024-08-15
1   2024-08-12
2   2024-08-12
3   2024-08-11
4   2024-08-09
Name: date, dtype: datetime64[ns]

In [23]:
#check for unique values
df.stars.unique()

array([ 5.,  1.,  8.,  3.,  2., 10.,  6.,  9.,  7.,  4., nan])

In [24]:

df.stars.value_counts()

stars
1.0     862
2.0     404
3.0     402
8.0     341
10.0    286
7.0     274
9.0     272
5.0     245
4.0     238
6.0     173
Name: count, dtype: int64

In [25]:

df.isnull().value_counts()

reviews  stars  date   country  verified  cleaned_reviews
False    False  False  False    False     False              3496
         True   False  False    False     False                 3
         False  False  True     False     False                 1
Name: count, dtype: int64

In [26]:

df.country.isnull().value_counts()

country
False    3499
True        1
Name: count, dtype: int64

In [27]:
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [28]:

df.shape

(3499, 6)

In [29]:

#resetting the index
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,cleaned_reviews
0,"Not Verified | Before my flight, I was forced ...",5.0,2024-08-15,United Kingdom,False,Verified Before flight forced ground staff che...
1,✅ Trip Verified | British Airways at its bes...,1.0,2024-08-12,United Kingdom,True,British Airways best Outstanding service from ...
2,✅ Trip Verified | An excellent flight! Despite...,8.0,2024-08-12,Lebanon,True,excellent flight Despite this being hour fligh...
3,✅ Trip Verified | I recently traveled with Bri...,8.0,2024-08-11,United States,True,recently traveled with British Airways mixed e...
4,✅ Trip Verified | My family and I were booke...,3.0,2024-08-09,United Kingdom,True,family were booked leave London Hong Kong Augu...
...,...,...,...,...,...,...
3494,I fly the POS to LGW route on a fairly regular...,1.0,2014-10-22,Trinidad & Tobago,False,route fairly regular basis punctuality flight ...
3495,LGW-AGP and back. 737-400s on both journeys. T...,4.0,2014-10-22,United Kingdom,False,back 400s both journeys These aircraft looking...
3496,Aberdeen-Heathrow-Miami and return World Trave...,3.0,2014-10-20,United Kingdom,False,AberdeenHeathrowMiami return World Traveller P...
3497,LHR-DEN DEN-LHR - always use upper class in VA...,8.0,2014-10-20,United Kingdom,False,always upper class when This flight reminded j...


In [30]:

# export the cleaned data

df.to_csv("cleaned-BA-reviews.csv")