### Importing the Collected Data to Clean it.

In [1]:
import os

# Locate the scraped dataset relative to the working directory:
#   <parent of cwd>/dataCollection/BA_reviews.csv
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)

collected_data_path = os.path.join(
    parent_dir,
    "dataCollection",
    "BA_reviews.csv",
)
print("Path to collected data is :", collected_data_path)

Path to collected data is : c:\Users\nikhi\ba_dataScience\dataCollection\BA_reviews.csv


#### Importing Required libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# REGEX parser
import re


In [3]:
# Load the collected reviews; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer=collected_data_path, index_col=0)

In [4]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,Reviews,Stars,Date,Country
0,✅ Trip Verified | Have no fear when your BA f...,5.0,21st February 2024,United Kingdom
1,Not Verified | We have flown BA five times fr...,10.0,17th February 2024,United States
2,✅ Trip Verified | London Heathrow to Istanbul...,3.0,17th February 2024,United Kingdom
3,"Not Verified | Jan 30th, I booked a last-minut...",3.0,16th February 2024,United States
4,✅ Trip Verified | I am a British Airways Gold ...,2.0,11th February 2024,United States


We add a column that indicates whether the user's trip is verified.

In [5]:
# True for reviews whose text contains the "Trip Verified" marker
df["Verified"] = df["Reviews"].str.contains("Trip Verified")

In [6]:
# Inspect the new boolean column
df["Verified"]

0        True
1       False
2        True
3       False
4        True
        ...  
3495    False
3496    False
3497    False
3498    False
3499    False
Name: Verified, Length: 3500, dtype: bool

In [7]:
# Preview the frame with the Verified column added
df.head(n=5)

Unnamed: 0,Reviews,Stars,Date,Country,Verified
0,✅ Trip Verified | Have no fear when your BA f...,5.0,21st February 2024,United Kingdom,True
1,Not Verified | We have flown BA five times fr...,10.0,17th February 2024,United States,False
2,✅ Trip Verified | London Heathrow to Istanbul...,3.0,17th February 2024,United Kingdom,True
3,"Not Verified | Jan 30th, I booked a last-minut...",3.0,16th February 2024,United States,False
4,✅ Trip Verified | I am a British Airways Gold ...,2.0,11th February 2024,United States,True


### Cleaning the Reviews column

In [11]:
import nltk

# Corpora needed by the review-cleaning step:
#   wordnet, omw-1.4 -> WordNetLemmatizer
#   stopwords        -> nltk.corpus.stopwords; this was never downloaded here,
#                       so on a fresh environment stopwords.words('english')
#                       raised LookupError
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...


True

In [20]:
# Text clean-up for the Reviews column; nltk supplies the lemmatizer and the
# English stop-word list.
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemma = WordNetLemmatizer()

# Build the stop-word set once. The original rebuilt the list and the set for
# every single word of every review, which dominated the cell's run time.
stop_words = set(stopwords.words('english'))

# Strip the verification prefix from the raw text. regex=False makes '|' a
# literal character; as a regex it means "or", and pandas versions before 2.0
# treated the pattern as a regex by default.
df['Reviews'] = df['Reviews'].str.replace('✅ Trip Verified |', '', regex=False)
df['Reviews'] = df['Reviews'].str.replace('Not Verified |', '', regex=False)

reviews_data = df.Reviews.str.strip()

# Cleaned text, one entry per review, in dataframe order
Cleaned_reviews = []

# For each review:
# 1) remove punctuation (anything that is not a word character or whitespace)
# 2) lower-case the text and split it on whitespace
# 3) drop English stop words and lemmatize the remaining words
# 4) re-join with single spaces and append to Cleaned_reviews
#
# NOTE(review): punctuation is removed before the stop-word filter, so
# contractions lose their apostrophe (e.g. "I'm" -> "im", visible in the
# cleaned output) and cannot match apostrophe-bearing stop-word entries.
# Left unchanged here to keep the results stable -- confirm whether intended.
for rev in reviews_data:
    rev = re.sub(r'[^\w\s]', '', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in stop_words]
    rev = " ".join(rev)
    Cleaned_reviews.append(rev)
    

In [21]:
# Sanity check: number of cleaned reviews produced by the loop
len(Cleaned_reviews)

3500

Adding the list to the dataframe

In [22]:
# Attach the cleaned text as a new column (one entry per row)
df = df.assign(Cleaned_reviews=Cleaned_reviews)

In [23]:
# Compare the original and cleaned review text side by side
df.head(n=5)

Unnamed: 0,Reviews,Stars,Date,Country,Verified,Cleaned_reviews
0,Have no fear when your BA flight is operated...,5.0,21st February 2024,United Kingdom,True,fear ba flight operated finnair business class...
1,We have flown BA five times from western USA...,10.0,17th February 2024,United States,False,flown ba five time western usa england always ...
2,London Heathrow to Istanbul at the start of ...,3.0,17th February 2024,United Kingdom,True,london heathrow istanbul start halfterm school...
3,"Jan 30th, I booked a last-minute, transatlant...",3.0,16th February 2024,United States,False,jan 30th booked lastminute transatlantic fligh...
4,I am a British Airways Gold Guest List member...,2.0,11th February 2024,United States,True,british airway gold guest list member lifetime...


In [24]:
# Inspect column dtypes (Date is still an object/string column at this point)
df.dtypes

Reviews             object
Stars              float64
Date                object
Country             object
Verified              bool
Cleaned_reviews     object
dtype: object

In [27]:
# Parse the Date column into datetime64.
# The day carries an ordinal suffix ("21st", "17th", ...); remove the suffix
# that follows the digits so the text matches the '%d %B %Y' format.
date_without_ordinal = df["Date"].str.replace(r'(\d+)(st|nd|rd|th)', r'\1', regex=True)
df["Date"] = pd.to_datetime(date_without_ordinal, format='%d %B %Y')

In [28]:
# Confirm the column now holds datetime values
df["Date"].head(n=5)

0   2024-02-21
1   2024-02-17
2   2024-02-17
3   2024-02-16
4   2024-02-11
Name: Date, dtype: datetime64[ns]

##### Cleaning the Stars Column

In [29]:
# Distinct rating values present, including any NaN
df["Stars"].unique()

array([ 5., 10.,  3.,  2.,  1.,  8.,  6.,  9.,  4.,  7., nan])

In [32]:
# Number of reviews per rating (NaN is excluded by value_counts' default)
df["Stars"].value_counts()


Stars
1.0     844
2.0     406
3.0     395
8.0     343
10.0    288
9.0     281
7.0     277
5.0     249
4.0     239
6.0     175
Name: count, dtype: int64

In [40]:
# Number of reviews with a missing star rating.
# isna() detects every missing-value marker pandas recognises, instead of
# comparing each value's string form to 'nan' in a Python loop.
count = int(df["Stars"].isna().sum())
print(count)

3


In [50]:
# Remove every row that has a missing value in any column. This covers the
# rows with a NaN rating, but it is not limited to the Stars column.
df = df.dropna(axis=0, how="any")

In [51]:
# Verify that no NaN rating remains
df["Stars"].unique()

array([ 5., 10.,  3.,  2.,  1.,  8.,  6.,  9.,  4.,  7.])

In [52]:
# Missing values per column (all should be zero after the drop)
df.isna().sum()

Reviews            0
Stars              0
Date               0
Country            0
Verified           0
Cleaned_reviews    0
dtype: int64

In [53]:
# Rows and columns remaining after dropping rows with missing values
df.shape

(3495, 6)

In [54]:
# Reset the index so it is contiguous again after rows were dropped.
# reset_index returns a new frame; the original cell discarded that result,
# so df (and the exported CSV) kept the gapped index. Assign it back, then
# display the frame.
df = df.reset_index(drop=True)
df

Unnamed: 0,Reviews,Stars,Date,Country,Verified,Cleaned_reviews
0,Have no fear when your BA flight is operated...,5.0,2024-02-21,United Kingdom,True,fear ba flight operated finnair business class...
1,We have flown BA five times from western USA...,10.0,2024-02-17,United States,False,flown ba five time western usa england always ...
2,London Heathrow to Istanbul at the start of ...,3.0,2024-02-17,United Kingdom,True,london heathrow istanbul start halfterm school...
3,"Jan 30th, I booked a last-minute, transatlant...",3.0,2024-02-16,United States,False,jan 30th booked lastminute transatlantic fligh...
4,I am a British Airways Gold Guest List member...,2.0,2024-02-11,United States,True,british airway gold guest list member lifetime...
...,...,...,...,...,...,...
3490,LHR-MCT-LHR. Excellent outbound flight. Tasty ...,10.0,2014-09-09,United Kingdom,False,lhrmctlhr excellent outbound flight tasty food...
3491,Economy class on 28 Aug from London (LHR) to N...,9.0,2014-09-06,United Kingdom,False,economy class 28 aug london lhr new york jfk b...
3492,I'm regularly flying Perth-Hong Kong-London-Po...,2.0,2014-09-06,Australia,False,im regularly flying perthhong konglondonport s...
3493,Travelled LHR to Montreal 21st August 2014 and...,5.0,2014-09-06,United Kingdom,False,travelled lhr montreal 21st august 2014 montre...


Exporting the data to a CSV file

In [55]:
# Write the cleaned data into the current working directory.
# os.path.join builds the path with the platform's separator, the same way
# the input path is built, instead of concatenating a hard-coded "/".
cwd = os.getcwd()
processed_data_path = os.path.join(cwd, "Processed_data.csv")
df.to_csv(processed_data_path)