In [2]:
import os
import pandas as pd
from tqdm.notebook import tqdm
from datetime import datetime

In [3]:
#Get ETL_DATE for Incremental Ingestion (enabled if ETL_DATE set to 'CURRENT_DATE')
from dotenv import load_dotenv
from pathlib import Path
import os

# Load the settings stored in the local env file into the process environment.
dotenv_path = Path('db_credentials.env')
load_dotenv(dotenv_path=dotenv_path)

# Raw ETL_DATE setting as configured; None when the variable is not defined.
ETL_DATE = os.environ.get('ETL_DATE')

In [4]:
# For Incremental Ingestion (enabled if ETL_DATE set to 'CURRENT_DATE')
# The decision is taken from the environment value rather than from the
# ETL_DATE variable itself: the variable is overwritten below, so testing it
# would make a second run of this cell fall into the else branch and silently
# switch an incremental run to the fixed date.
if os.getenv('ETL_DATE') == 'CURRENT_DATE':
    # Incremental run: use today's date in YYYYMMDD form.
    ETL_DATE = datetime.today().strftime('%Y%m%d')
else:
    # Any other setting (including unset) uses the fixed snapshot date.
    ETL_DATE = '20250322'

print(ETL_DATE)

20250403


In [5]:
# Read the preprocessed reviews file that belongs to the resolved ETL date.
silver_reviews_path = f"data/silver_{ETL_DATE}_Airline_Reviews_Preprocessed.csv"
data = pd.read_csv(silver_reviews_path)
data.shape

(59, 22)

In [6]:
# Row count first, then a preview of the first five rows.
print(data.shape[0])
data.head(5)

59


Unnamed: 0,RowId,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Top Review Image Url,Aircraft,Type Of Traveller,...,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity,Value For Money,Recommended,Id
0,0,Aeromexico,5.0,"""Return flight had problems""",2025-03-30,True,I booked London - Mexico City - London Busi...,,Boeing 787,Solo Leisure,...,2025-03-01,5.0,4.0,3.0,1.0,2.0,,3,no,4e952d699faaee61d9e384f8d7e5513a4753a56b599f20...
1,1,Aeromexico,1.0,"""Very disappointed""",2025-03-29,True,Flew from Mexico City to Toronto March 2025...,,,Couple Leisure,...,2025-03-01,1.0,3.0,1.0,5.0,3.0,,2,no,934ea944bc1c31893a3eb34b6e9a6a742b254df8c35262...
2,2,Air Canada rouge,10.0,"""showing if the bathroom is occupied""",2025-03-29,False,Flight was awesome. Staff was awesome. My is...,https://www.airlinequality.com/wp-content/uplo...,,Couple Leisure,...,2025-03-01,5.0,5.0,5.0,5.0,,5.0,5,yes,e3d01b5c957953c87a79ff0869dbcd98c2323b0b5a0ca1...
3,3,Air India,1.0,"""can’t carry more than 15 kg""",2025-04-03,True,I spoke to Air India call center before boo...,,,Solo Leisure,...,2025-04-01,,,,1.0,,,1,no,a116ba41fc9a766f5da46bfcee3468d97f3072da451bc9...
4,4,Air Transat,2.0,"""insists that my carry-on is too big""",2025-03-31,True,I had the most ridiculous experience on my ...,,A330-200,Couple Leisure,...,2025-03-01,2.0,3.0,2.0,1.0,,,2,no,87739dd31255231aa004e32dc06a99a9bfeec53bfa3ac0...


### Sentiment classification with Hugging Face - cardiffnlp/twitter-roberta-base-sentiment-latest

In [8]:
# Set the tokenizers parallelism flag explicitly before the tokenizer is
# loaded. Without it the library warns that the process was forked after
# parallelism had been used and disables parallelism on its own.
# setdefault keeps any value already chosen in the environment.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

from transformers import pipeline

# Text-classification pipeline used to label each review's sentiment.
RBSL_PN_MODEL = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

# Smoke test: returns a list with one {'label', 'score'} dict for the sentence.
RBSL_PN_MODEL('I am happy')

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


[{'label': 'positive', 'score': 0.9614965319633484}]

In [9]:

# Make sure reviews are in the right format (list of strings).
# Missing reviews become empty strings so the list keeps one entry per row.
review_texts = data['Review'].fillna("").astype(str).tolist()

# Feed the model in slices of `batch_size` reviews so that a failure only
# affects the slice it happened in.
batch_size = 32
all_results = []

for i in tqdm(range(0, len(review_texts), batch_size), desc="Processing reviews"):
    batch = review_texts[i:i + batch_size]
    try:
        batch_results = RBSL_PN_MODEL(
            batch,
            truncation=True,
            max_length=512
        )
        all_results.extend(batch_results)
    except Exception as e:
        # Best-effort: report the failure and emit one placeholder per review
        # so the results stay aligned row-for-row with `data`.
        print(f"Error in batch {i}-{i+batch_size}: {e}")
        all_results.extend([{'label': 'ERROR', 'score': 0.0} for _ in batch])

# Convert to DataFrame.
# Columns are selected and renamed by key instead of being assigned by
# position: this still works when there are no reviews at all (an empty
# result list has no columns to relabel) and does not depend on the key order
# of the result dicts.
sentiments_df = pd.DataFrame(all_results, columns=['label', 'score']).rename(
    columns={'label': 'sentiment_label', 'score': 'sentiment_scores'}
)

# The results are later joined to `data` by index, so the row counts must match.
assert len(sentiments_df) == len(data), (
    f"Expected {len(data)} sentiment rows, got {len(sentiments_df)}"
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Processing reviews:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Row count first, then a preview of the first five predictions.
print(sentiments_df.shape[0])
sentiments_df.head(5)

59


Unnamed: 0,sentiment_label,sentiment_scores
0,positive,0.648737
1,negative,0.839803
2,neutral,0.525435
3,neutral,0.675186
4,negative,0.787324


#### Merge the sentiment labels and scores back into the original dataframe

In [12]:
# Index-aligned left join: every row of `data` is kept and the sentiment
# columns are appended to it.
df_merged = data.join(other=sentiments_df, how='left')
print(df_merged.shape[0])
df_merged.head(5)

59


Unnamed: 0,RowId,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Top Review Image Url,Aircraft,Type Of Traveller,...,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity,Value For Money,Recommended,Id,sentiment_label,sentiment_scores
0,0,Aeromexico,5.0,"""Return flight had problems""",2025-03-30,True,I booked London - Mexico City - London Busi...,,Boeing 787,Solo Leisure,...,4.0,3.0,1.0,2.0,,3,no,4e952d699faaee61d9e384f8d7e5513a4753a56b599f20...,positive,0.648737
1,1,Aeromexico,1.0,"""Very disappointed""",2025-03-29,True,Flew from Mexico City to Toronto March 2025...,,,Couple Leisure,...,3.0,1.0,5.0,3.0,,2,no,934ea944bc1c31893a3eb34b6e9a6a742b254df8c35262...,negative,0.839803
2,2,Air Canada rouge,10.0,"""showing if the bathroom is occupied""",2025-03-29,False,Flight was awesome. Staff was awesome. My is...,https://www.airlinequality.com/wp-content/uplo...,,Couple Leisure,...,5.0,5.0,5.0,,5.0,5,yes,e3d01b5c957953c87a79ff0869dbcd98c2323b0b5a0ca1...,neutral,0.525435
3,3,Air India,1.0,"""can’t carry more than 15 kg""",2025-04-03,True,I spoke to Air India call center before boo...,,,Solo Leisure,...,,,1.0,,,1,no,a116ba41fc9a766f5da46bfcee3468d97f3072da451bc9...,neutral,0.675186
4,4,Air Transat,2.0,"""insists that my carry-on is too big""",2025-03-31,True,I had the most ridiculous experience on my ...,,A330-200,Couple Leisure,...,3.0,2.0,1.0,,,2,no,87739dd31255231aa004e32dc06a99a9bfeec53bfa3ac0...,negative,0.787324


#### Persist to CSV file

In [14]:
# Write the sentiment-enriched reviews for this ETL date, without the index column.
sentiment_csv_path = f"data/silver_{ETL_DATE}_Airline_Reviews_Sentiment.csv"
df_merged.to_csv(sentiment_csv_path, date_format='%Y-%m-%d', index=False)