In [5]:
import hashlib
from datetime import datetime

import numpy as np
import pandas as pd

In [6]:
# Read the ETL_DATE setting that controls incremental ingestion
# (incremental mode is enabled when ETL_DATE is set to 'CURRENT_DATE')
import os
from pathlib import Path
from dotenv import load_dotenv

# Populate the process environment from the env file before reading the setting
dotenv_path = Path('db_credentials.env')
load_dotenv(dotenv_path=dotenv_path)

# None when the variable is not defined
ETL_DATE = os.environ.get('ETL_DATE')

In [7]:
# Resolve the ETL date that is embedded in the input/output file names.
# Incremental ingestion is enabled when the ETL_DATE environment variable is
# 'CURRENT_DATE'; any other value (or an unset variable) falls back to the
# fixed date below.
DEFAULT_ETL_DATE = '20250322'

# Test the environment setting rather than the ETL_DATE variable this cell
# overwrites: otherwise re-running the cell would see the already resolved
# date, miss the 'CURRENT_DATE' branch and silently switch to the fallback.
if os.getenv('ETL_DATE') == 'CURRENT_DATE':
    ETL_DATE = datetime.today().strftime('%Y%m%d')
else:
    ETL_DATE = DEFAULT_ETL_DATE

print(ETL_DATE)

20250403


In [8]:
# Load the bronze-layer extract for the resolved ETL date
bronze_path = f"data/bronze_{ETL_DATE}_Airline_Reviews_withImageUrls.csv"
df_raw = pd.read_csv(bronze_path)

# Row count first, then a preview of the first rows
print(df_raw.shape[0])
df_raw.head()

59


Unnamed: 0.1,Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Top Review Image Url,Aircraft,Type Of Traveller,...,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity,Value For Money,Recommended
0,0,Aeromexico,5,"""Return flight had problems""",30th March 2025,True,I booked London - Mexico City - London Busi...,,Boeing 787,Solo Leisure,...,Mexico City to London,March 2025,5.0,4.0,3.0,1.0,2.0,,3,no
1,1,Aeromexico,1,"""Very disappointed""",29th March 2025,True,Flew from Mexico City to Toronto March 2025...,,,Couple Leisure,...,Mexico city to Toronto,March 2025,1.0,3.0,1.0,5.0,3.0,,2,no
2,2,Air Canada rouge,10,"""showing if the bathroom is occupied""",29th March 2025,False,Flight was awesome. Staff was awesome. My is...,https://www.airlinequality.com/wp-content/uplo...,,Couple Leisure,...,Toronto to Dominican Republic,March 2025,5.0,5.0,5.0,5.0,,5.0,5,yes
3,3,Air India,1,"""can’t carry more than 15 kg""",3rd April 2025,True,I spoke to Air India call center before boo...,,,Solo Leisure,...,Chandigarh to New Delhi,April 2025,,,,1.0,,,1,no
4,4,Air Transat,2,"""insists that my carry-on is too big""",31st March 2025,True,I had the most ridiculous experience on my ...,,A330-200,Couple Leisure,...,Lisbon to Toronto,March 2025,2.0,3.0,2.0,1.0,,,2,no


#### Rename "Unnamed: 0" to "RowId"

In [10]:
# rename() returns a new frame, so df_raw is left untouched;
# the "Unnamed: 0" column is exposed as "RowId" from here on
df = df_raw.rename(columns={"Unnamed: 0": "RowId"})

#### Format and Convert Dates

In [12]:
# Parse both date columns; format="mixed" lets pandas infer the format per value
for date_col in ('Review Date', 'Date Flown'):
    df[date_col] = pd.to_datetime(df[date_col], format="mixed")

# Row count should be unchanged by the conversion
print(len(df))
df.head()

59


Unnamed: 0,RowId,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Top Review Image Url,Aircraft,Type Of Traveller,...,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity,Value For Money,Recommended
0,0,Aeromexico,5,"""Return flight had problems""",2025-03-30,True,I booked London - Mexico City - London Busi...,,Boeing 787,Solo Leisure,...,Mexico City to London,2025-03-01,5.0,4.0,3.0,1.0,2.0,,3,no
1,1,Aeromexico,1,"""Very disappointed""",2025-03-29,True,Flew from Mexico City to Toronto March 2025...,,,Couple Leisure,...,Mexico city to Toronto,2025-03-01,1.0,3.0,1.0,5.0,3.0,,2,no
2,2,Air Canada rouge,10,"""showing if the bathroom is occupied""",2025-03-29,False,Flight was awesome. Staff was awesome. My is...,https://www.airlinequality.com/wp-content/uplo...,,Couple Leisure,...,Toronto to Dominican Republic,2025-03-01,5.0,5.0,5.0,5.0,,5.0,5,yes
3,3,Air India,1,"""can’t carry more than 15 kg""",2025-04-03,True,I spoke to Air India call center before boo...,,,Solo Leisure,...,Chandigarh to New Delhi,2025-04-01,,,,1.0,,,1,no
4,4,Air Transat,2,"""insists that my carry-on is too big""",2025-03-31,True,I had the most ridiculous experience on my ...,,A330-200,Couple Leisure,...,Lisbon to Toronto,2025-03-01,2.0,3.0,2.0,1.0,,,2,no


#### EDA & Check for Nulls (especially for needed key fields)

In [14]:
# First-pass null check: df.info() lists the non-null count and dtype of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   RowId                   59 non-null     int64         
 1   Airline Name            59 non-null     object        
 2   Overall_Rating          59 non-null     int64         
 3   Review_Title            59 non-null     object        
 4   Review Date             59 non-null     datetime64[ns]
 5   Verified                59 non-null     bool          
 6   Review                  59 non-null     object        
 7   Top Review Image Url    6 non-null      object        
 8   Aircraft                16 non-null     object        
 9   Type Of Traveller       59 non-null     object        
 10  Seat Type               59 non-null     object        
 11  Route                   59 non-null     object        
 12  Date Flown              59 non-null     datetime64[n

In [15]:
# Replace blank text cells with actual nulls so later null checks are accurate.
# The pattern uses \s* (not \s+) so that zero-length strings are caught as well
# as whitespace-only strings.
df = df.replace(r'^\s*$', np.nan, regex=True)

#### Replace Overall_Rating 'n' with null, Cast as float

In [17]:
# Report how many rows carry the 'n' placeholder before it is nulled out.
# (A bare expression that is not the last statement of a cell is never
# displayed, so the count is printed explicitly.)
n_placeholder_rows = int((df['Overall_Rating'] == 'n').sum())
print(f"Rows with Overall_Rating == 'n': {n_placeholder_rows}")

# Turn the placeholder into a null, then cast the whole column to float
df['Overall_Rating'] = df['Overall_Rating'].replace('n', np.nan).astype(float)

In [18]:
# Confirm no 'n' placeholders remain after the cast (expected: an empty frame)
df.loc[df['Overall_Rating'] == 'n']

Unnamed: 0,RowId,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Top Review Image Url,Aircraft,Type Of Traveller,...,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity,Value For Money,Recommended


#### Add Unique Id

In [20]:
def generate_hash(row) -> str:
    """Build a deterministic identifier for one review row.

    The airline name, review title and review date are joined with '|' and
    the resulting text is hashed with SHA-256.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (e.g. the row passed by
        ``df.apply(..., axis=1)``). Must provide the keys 'Airline Name',
        'Review_Title' and 'Review Date'.

    Returns
    -------
    str
        The 64-character hexadecimal SHA-256 digest.

    Notes
    -----
    Relies on the module-level ``hashlib`` import. Rows that share the same
    airline, title and date yield the same identifier.
    """
    value = f"{row['Airline Name']}|{row['Review_Title']}|{row['Review Date']}"
    # Encoding is spelled out so the digest does not hinge on an implicit default
    return hashlib.sha256(value.encode("utf-8")).hexdigest()

In [21]:
# Hash every row (axis=1 passes one row at a time) and store the digests as the Id column
row_hashes = df.apply(generate_hash, axis=1)
df['Id'] = row_hashes

In [22]:
# Second-pass null check, run after whitespace-only strings were converted to
# nulls and the Id column was added
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   RowId                   59 non-null     int64         
 1   Airline Name            59 non-null     object        
 2   Overall_Rating          59 non-null     float64       
 3   Review_Title            59 non-null     object        
 4   Review Date             59 non-null     datetime64[ns]
 5   Verified                59 non-null     bool          
 6   Review                  59 non-null     object        
 7   Top Review Image Url    6 non-null      object        
 8   Aircraft                16 non-null     object        
 9   Type Of Traveller       59 non-null     object        
 10  Seat Type               59 non-null     object        
 11  Route                   59 non-null     object        
 12  Date Flown              59 non-null     datetime64[n

None of the required key columns contain nulls.

#### Persist to CSV file

In [25]:
# Write the preprocessed frame to the silver layer; dates are serialised as YYYY-MM-DD
silver_path = f"data/silver_{ETL_DATE}_Airline_Reviews_Preprocessed.csv"
df.to_csv(silver_path, index=False, date_format='%Y-%m-%d')