# **Data Cleaning**

In [85]:
import pandas as pd
import numpy as np
import os

In [86]:
# Load the raw reviews file that sits in the current working directory.
# The path is assembled with os.path.join rather than manual string
# concatenation so the separator is handled by the standard library.
cwd = os.getcwd()
raw_csv_path = os.path.join(cwd, "BA_reviews.csv")

# The first CSV column holds the row labels, so use it as the index.
df = pd.read_csv(raw_csv_path, index_col=0)

In [87]:
# Flag each review whose raw text contains the "Trip Verified" marker.
# This has to happen while the marker is still present in the review text.
is_trip_verified = df['reviews'].str.contains("Trip Verified")
df['verified'] = is_trip_verified

## **Cleaning Reviews**

In [88]:
# Remove the leading 'Trip Verified' / 'Not Verified' tag from each review.
# Split on the FIRST '|' only (n=1): a review whose own text contains a '|'
# then keeps everything after the tag instead of only its last fragment.
# Reviews without any '|' are left as they are (apart from whitespace trimming),
# and the vectorised .str accessor passes missing values through unchanged.
df['reviews'] = df['reviews'].str.split('|', n=1).str[-1].str.strip()

In [89]:
# Preview the first rows to check the review text after the prefix removal.
df.head()

Unnamed: 0,reviews,stars,date,country,verified
0,We are flying Business class for most of our f...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,8th March 2023,United States,False
1,"I am in Australia and on Friday night, went on...",1,6th March 2023,Australia,True
2,At 7.54 am on the day of travel whilst driving...,1,4th March 2023,United Kingdom,True
3,Would happily fly them again. I had a personal...,2,2nd March 2023,United States,True
4,"Flew premium, only worth the extra money for t...",10,2nd March 2023,United Kingdom,False


## **Cleaning Date Format**

In [90]:
# Inspect the column dtypes before converting the date column.
df.dtypes

In [91]:
# Parse the textual review dates into pandas datetime values.
df['date'] = pd.to_datetime(df['date'])

In [92]:
# Re-check the dtypes to confirm the effect of the datetime conversion.
df.dtypes

In [93]:
# Preview the first rows with the converted date column.
df.head()

Unnamed: 0,reviews,stars,date,country,verified
0,We are flying Business class for most of our f...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,2023-03-08,United States,False
1,"I am in Australia and on Friday night, went on...",1,2023-03-06,Australia,True
2,At 7.54 am on the day of travel whilst driving...,1,2023-03-04,United Kingdom,True
3,Would happily fly them again. I had a personal...,2,2023-03-02,United States,True
4,"Flew premium, only worth the extra money for t...",10,2023-03-02,United Kingdom,False


## **Cleaning Rating Stars**

In [94]:
# List the distinct raw values present in the rating column.
df['stars'].unique()

In [95]:
# Count how many reviews carry each raw rating value.
df['stars'].value_counts()

In [96]:
# Strip the stray newline/tab padding around the rating values.
# str.strip() interprets its argument as a SET of characters, so spelling out
# "\n" followed by thirteen "\t" never matched an exact prefix. Calling it
# with no argument removes any surrounding whitespace (including "\n" and
# "\t"), which also covers padding such as spaces or "\r".
df['stars'] = df['stars'].str.strip()

In [97]:
# Remove the rows whose rating is the literal string "None".
# A boolean mask keeps exactly the non-matching rows. Dropping by index label
# would also remove any other rows that happen to share a label with a
# matching row, and rebinding `df` avoids inplace mutation.
df = df[df['stars'] != "None"]

## **Check for null values**

In [99]:
# Tabulate the per-row pattern of missing values across all columns.
df.isna().value_counts()

In [100]:
# Count missing versus present values in the country column.
df['country'].isna().value_counts()

In [101]:
# Keep only the rows that have a value in the 'country' column.
has_country = df['country'].notna()
df = df[has_country]

In [103]:
# Preview the first rows of the cleaned data.
df.head()

Unnamed: 0,reviews,stars,date,country,verified
0,We are flying Business class for most of our f...,5,2023-03-08,United States,False
1,"I am in Australia and on Friday night, went on...",1,2023-03-06,Australia,True
2,At 7.54 am on the day of travel whilst driving...,1,2023-03-04,United Kingdom,True
3,Would happily fly them again. I had a personal...,2,2023-03-02,United States,True
4,"Flew premium, only worth the extra money for t...",10,2023-03-02,United Kingdom,False


In [104]:
# Preview the last rows of the cleaned data.
df.tail()

Unnamed: 0,reviews,stars,date,country,verified
2995,LHR-JNB-CPT. Upper deck on A380. 2x3x2 arrange...,3,2015-01-11,United Kingdom,False
2996,Los Angeles - Heathrow - Los Angeles over the ...,3,2015-01-11,United Kingdom,False
2997,Travelled Club from MIA to LHR last week havin...,6,2015-01-05,United Kingdom,False
2998,Had a good BA flight out to Barcelona but foun...,3,2015-01-05,United Kingdom,False
2999,LIS-LHR-VIE on Dec. 8 both legs on an Airbus 3...,8,2015-01-05,Austria,False


In [106]:
# Persist the cleaned reviews to a CSV file (path is relative, so it is
# resolved against the current working directory).
clean_csv_path = "BA_reviews_clean.csv"
df.to_csv(clean_csv_path)