# DATA CLEANING

In [6]:
import os
from pathlib import Path

import pandas as pd

# Location of the raw airline-review CSV.
# The original machine-specific path is kept as the default so existing runs
# behave the same; set the BA_REVIEWS_CSV environment variable to point the
# notebook at the file on any other machine without editing this cell.
RAW_REVIEWS_CSV = Path(
    os.environ.get(
        'BA_REVIEWS_CSV',
        '/Users/st.amean/Desktop/Portfolio Project/BA_DATA/airline_review.csv',
    )
)

# Load the data
df = pd.read_csv(RAW_REVIEWS_CSV)


In [2]:
# Inspect the data: print, in order,
#   1. the first few rows,
#   2. the data type of every column,
#   3. the number of missing values per column.
for overview in (df.head(), df.dtypes, df.isnull().sum()):
    print(overview)

   Unnamed: 0  rating                            header               author  \
0           0       2      service was mediocre at best          Gary Storer   
1           1       2  BA standards continue to decline             A Jensen   
2           2       2      won the race to the bottom"          John Rockett   
3           3       3            Not a reliable airline  Tatiana Bobrovskaya   
4           4       1         It is a national disgrace             A Dawson   

         date           place  \
0  2023-10-03  United Kingdom   
1  2023-10-02  United Kingdom   
2  2023-10-02  United Kingdom   
3  2023-10-02  United Kingdom   
4  2023-09-30  United Kingdom   

                                             content aircraft  traveller_type  \
0   Just returned from Chicago, flew out 10 days ...     A380  Couple Leisure   
1    BA standards continue to decline every time ...     A320        Business   
2    Awful. Business class check in queue just as...     A320  Couple Leisure

In [7]:
import pandas as pd

# Columns retained for the analysis; every other column of the loaded frame
# (e.g. the 'Unnamed: 0' index column visible in the preview) is discarded.
columns_to_keep = ['header', 'author', 'date', 'place', 'content', 'aircraft',
                   'traveller_type', 'seat_type', 'route', 'date_flown',
                   'recommended', 'trip_verified', 'rating', 'seat_comfort',
                   'cabin_staff_service', 'food_beverages', 'ground_service',
                   'value_for_money', 'entertainment']

# Take an explicit copy of the selection: later cells modify `df` (dropna,
# column assignment), and doing that on a bare column selection can raise
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
df = df[columns_to_keep].copy()


### Clean the data by removing rows with any empty cells

In [4]:
# Remove rows with any empty cells.
# NOTE: this drops every row that has a missing value in any column. If this
# cell runs before the cell that fills 'aircraft', 'route', 'date_flown' and
# 'trip_verified' with defaults, nothing is left for those fills to do.
df = df.dropna()

# Print summary to verify changes.
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1323 entries, 0 to 2315
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   header               1323 non-null   object
 1   author               1323 non-null   object
 2   date                 1323 non-null   object
 3   place                1323 non-null   object
 4   content              1323 non-null   object
 5   aircraft             1323 non-null   object
 6   traveller_type       1323 non-null   object
 7   seat_type            1323 non-null   object
 8   route                1323 non-null   object
 9   date_flown           1323 non-null   object
 10  recommended          1323 non-null   object
 11  trip_verified        1323 non-null   object
 12  rating               1323 non-null   int64 
 13  seat_comfort         1323 non-null   int64 
 14  cabin_staff_service  1323 non-null   int64 
 15  food_beverages       1323 non-null   int64 
 16  ground_serv

### Handling missing values
#### Assuming 'aircraft', 'route', 'date_flown' and 'trip_verified' should have no nulls, fill them with default values:

In [8]:
# Fill specific columns with default values.
# The fills are applied through a single DataFrame.fillna call whose result is
# assigned back to `df`. The previous `df[col].fillna(..., inplace=True)` form
# is chained in-place assignment: deprecated in pandas 2.x and a silent no-op
# once Copy-on-Write is enabled, which would leave the gaps unfilled.
fill_defaults = {
    'aircraft': 'Unknown',
    'route': 'No specific route',
    # Missing flight dates take the most frequent value in the column.
    'date_flown': df['date_flown'].mode()[0],
    'trip_verified': 'Not Verified',
}
df = df.fillna(value=fill_defaults)

# Convert date columns to datetime
df['date'] = pd.to_datetime(df['date'])
df['date_flown'] = pd.to_datetime(df['date_flown'])

# Convert 'trip_verified' to boolean.
# Any value other than the two keys below maps to NaN and is therefore
# removed by the dropna() further down.
df['trip_verified'] = df['trip_verified'].map({'Verified': True, 'Not Verified': False})

# Standardize 'recommended' column to boolean (same NaN behaviour for
# values other than 'yes' / 'no').
df['recommended'] = df['recommended'].map({'yes': True, 'no': False})

# Remove rows with any empty cells: gaps in columns that were not filled
# above, plus any values the two mappings could not translate.
df = df.dropna()

# Print summary to verify changes.
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 2894 entries, 0 to 2897
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   header               2894 non-null   object        
 1   author               2894 non-null   object        
 2   date                 2894 non-null   datetime64[ns]
 3   place                2894 non-null   object        
 4   content              2894 non-null   object        
 5   aircraft             2894 non-null   object        
 6   traveller_type       2894 non-null   object        
 7   seat_type            2894 non-null   object        
 8   route                2894 non-null   object        
 9   date_flown           2894 non-null   datetime64[ns]
 10  recommended          2894 non-null   bool          
 11  trip_verified        2894 non-null   bool          
 12  rating               2894 non-null   int64         
 13  seat_comfort         2894 non-null   i

### Save the cleaned data

In [5]:
# Write whichever cleaned version of `df` you preferred to a CSV file in the
# current working directory, leaving out the row index.
df.to_csv(path_or_buf="ba_review.csv", index=False)