In [4]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


import re

In [5]:
df = pd.read_csv("./BA_reviews.csv", index_col=0)
df.head()

Unnamed: 0,reviews,stars,date,country
0,Not Verified | I was excited to fly BA as I'd ...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,18th January 2024,United Kingdom
1,Not Verified | I just want to warn everyone o...,2,17th January 2024,Germany
2,Not Verified | Paid for business class travell...,1,16th January 2024,United Kingdom
3,✅ Trip Verified | The plane was extremely dir...,1,15th January 2024,Ireland
4,Not Verified | Overall journey wasn’t bad howe...,1,12th January 2024,United Kingdom


In [6]:
df['verified'] = df.reviews.str.contains("Trip Verified")
df['verified']

0       False
1       False
2       False
3        True
4       False
        ...  
3495    False
3496    False
3497    False
3498    False
3499    False
Name: verified, Length: 3500, dtype: bool

### Cleaning reviews

In [1]:
!pip show nltk

Name: nltk
Version: 3.7
Summary: Natural Language Toolkit
Home-page: https://www.nltk.org/
Author: NLTK Team
Author-email: nltk.team@gmail.com
License: Apache License, Version 2.0
Location: c:\users\user\anaconda3\lib\site-packages
Requires: click, joblib, regex, tqdm
Required-by: 


In [9]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...


True

In [10]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()

In [11]:
reviews_data = df.reviews.str.strip("✅ Trip Verified |")

corpus =[]

for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [12]:
df['corpus'] = corpus
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | I was excited to fly BA as I'd ...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,18th January 2024,United Kingdom,False,verified excited fly ba travelled long haul yr...
1,Not Verified | I just want to warn everyone o...,2,17th January 2024,Germany,False,verified want warn everyone worst customer ser...
2,Not Verified | Paid for business class travell...,1,16th January 2024,United Kingdom,False,verified paid business class travelling cairo ...
3,✅ Trip Verified | The plane was extremely dir...,1,15th January 2024,Ireland,True,plane extremely dirty chocolate smudged mine c...
4,Not Verified | Overall journey wasn’t bad howe...,1,12th January 2024,United Kingdom,False,verified overall journey bad however end bagga...


In [13]:
df.dtypes

reviews     object
stars       object
date        object
country     object
verified      bool
corpus      object
dtype: object

In [14]:
# converting date in datetime format
df.date = pd.to_datetime(df.date)
df.date

0      2024-01-18
1      2024-01-17
2      2024-01-16
3      2024-01-15
4      2024-01-12
          ...    
3495   2014-09-01
3496   2014-09-01
3497   2014-09-01
3498   2014-09-01
3499   2014-09-01
Name: date, Length: 3500, dtype: datetime64[ns]

### Cleaning ratings with stars

In [15]:
df.stars

0       \n\t\t\t\t\t\t\t\t\t\t\t\t\t5
1                                   2
2                                   1
3                                   1
4                                   1
                    ...              
3495                                1
3496                                7
3497                                9
3498                               10
3499                               10
Name: stars, Length: 3500, dtype: object

In [16]:
df.stars.unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '2', '1', '4', '9', '6', '8', '5',
       '3', '10', '7', 'None'], dtype=object)

In [18]:
df.stars.value_counts()

1                                846
2                                406
3                                391
8                                342
10                               290
9                                283
7                                279
4                                238
5                                214
6                                173
\n\t\t\t\t\t\t\t\t\t\t\t\t\t5     35
None                               3
Name: stars, dtype: int64

In [20]:
#Droping NOne ratings

df.drop(df[df.stars == "None"].index, axis=0, inplace=True)

In [21]:
df.stars.unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '2', '1', '4', '9', '6', '8', '5',
       '3', '10', '7'], dtype=object)

In [22]:
df.stars = df.stars.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")

In [23]:
df.stars.value_counts()

1     846
2     406
3     391
8     342
10    290
9     283
7     279
5     249
4     238
6     173
Name: stars, dtype: int64

### Check null values

In [25]:
df.isnull().value_counts()


reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3495
                       True     False     False        2
dtype: int64

In [26]:
df.country.isnull().value_counts()

False    3495
True        2
Name: country, dtype: int64

In [27]:
#droping the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [29]:
df.shape

(3495, 6)

In [31]:
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | I was excited to fly BA as I'd ...,5,2024-01-18,United Kingdom,False,verified excited fly ba travelled long haul yr...
1,Not Verified | I just want to warn everyone o...,2,2024-01-17,Germany,False,verified want warn everyone worst customer ser...
2,Not Verified | Paid for business class travell...,1,2024-01-16,United Kingdom,False,verified paid business class travelling cairo ...
3,✅ Trip Verified | The plane was extremely dir...,1,2024-01-15,Ireland,True,plane extremely dirty chocolate smudged mine c...
4,Not Verified | Overall journey wasn’t bad howe...,1,2024-01-12,United Kingdom,False,verified overall journey bad however end bagga...
...,...,...,...,...,...,...
3490,Flew LHR to Muscat return in Club World on a 7...,1,2014-09-01,United Kingdom,False,flew lhr muscat return club world g zzza loyal...
3491,Economy class on 28 Aug from London (LHR) to N...,7,2014-09-01,United States,False,economy class aug london lhr new york jfk boar...
3492,BA15 LHR to SIN B777-300ER First Class: nice a...,9,2014-09-01,United Kingdom,False,ba lhr sin b er first class nice four month ol...
3493,Travelled as a family of four to the Maldives....,10,2014-09-01,United Kingdom,False,avelled family four maldives despite numerous ...


In [33]:
df.to_csv('./cleaned_BA_reviews.csv')

In [34]:
df.to_csv('../EDA/cleaned_BA_reviews.csv')