In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [3]:
#Get ETL_DATE for Incremental Ingestion (enabled if ETL_DATE set to 'CURRENT_DATE')
from dotenv import load_dotenv
from pathlib import Path
import os

dotenv_path = Path('db_credentials.env')
load_dotenv(dotenv_path=dotenv_path)

ETL_DATE = os.getenv('ETL_DATE')

In [4]:
# For Incremental Ingestion (enabled if ETL_DATE set to 'CURRENT_DATE')
if ETL_DATE == 'CURRENT_DATE':
    ETL_DATE = datetime.today().strftime('%Y%m%d')
else:
   ETL_DATE = '20250322'
    
print(ETL_DATE)

20250403


In [5]:
df = pd.read_csv(f"data/silver_{ETL_DATE}_Airline_Reviews_Sentiment.csv")
print(len(df))
df.head()

59


Unnamed: 0,RowId,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Top Review Image Url,Aircraft,Type Of Traveller,...,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity,Value For Money,Recommended,Id,sentiment_label,sentiment_scores
0,0,Aeromexico,5.0,"""Return flight had problems""",2025-03-30,True,I booked London - Mexico City - London Busi...,,Boeing 787,Solo Leisure,...,4.0,3.0,1.0,2.0,,3,no,4e952d699faaee61d9e384f8d7e5513a4753a56b599f20...,positive,0.648737
1,1,Aeromexico,1.0,"""Very disappointed""",2025-03-29,True,Flew from Mexico City to Toronto March 2025...,,,Couple Leisure,...,3.0,1.0,5.0,3.0,,2,no,934ea944bc1c31893a3eb34b6e9a6a742b254df8c35262...,negative,0.839803
2,2,Air Canada rouge,10.0,"""showing if the bathroom is occupied""",2025-03-29,False,Flight was awesome. Staff was awesome. My is...,https://www.airlinequality.com/wp-content/uplo...,,Couple Leisure,...,5.0,5.0,5.0,,5.0,5,yes,e3d01b5c957953c87a79ff0869dbcd98c2323b0b5a0ca1...,neutral,0.525435
3,3,Air India,1.0,"""can’t carry more than 15 kg""",2025-04-03,True,I spoke to Air India call center before boo...,,,Solo Leisure,...,,,1.0,,,1,no,a116ba41fc9a766f5da46bfcee3468d97f3072da451bc9...,neutral,0.675186
4,4,Air Transat,2.0,"""insists that my carry-on is too big""",2025-03-31,True,I had the most ridiculous experience on my ...,,A330-200,Couple Leisure,...,3.0,2.0,1.0,,,2,no,87739dd31255231aa004e32dc06a99a9bfeec53bfa3ac0...,negative,0.787324


#### Load embeddings from previously generated .npz file

In [7]:
review_text_embeddings = None
review_categories_embeddings = None
EMBEDDING_BACKUP_FILE = f"data/{ETL_DATE}_Airline_Reviews_gte_small.npz"
if os.path.exists(EMBEDDING_BACKUP_FILE):
    # Load the .npz file
    airline_quality_embeddings = np.load(EMBEDDING_BACKUP_FILE)
    
    # List the variables stored in the file
    print(airline_quality_embeddings.files)
    
    # Access the individual arrays
    review_text_embeddings = airline_quality_embeddings['review_text_embeddings']
    review_categories_embeddings = airline_quality_embeddings['review_categories_embeddings']

    print("review_text_embeddings:", review_text_embeddings)
    print("review_categories_embeddings:", review_categories_embeddings)
    
    # Close the file after use
    airline_quality_embeddings.close()
else:
    airline_quality_embeddings = None
    
#airline_quality_embeddings

In [8]:
#review_text_embeddings[0:1]

#### Convert review text to embeddings using thenlper/gte-small

In [10]:
from sentence_transformers import SentenceTransformer

In [11]:
# To fix SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
df_embed = df[['Id', 'Review']].copy()

In [12]:
if review_text_embeddings is None:
    embedding_model = SentenceTransformer("thenlper/gte-small")
    review_text_embeddings = embedding_model.encode(df_embed['Review'])
else:
    review_text_embeddings = review_text_embeddings

review_text_embeddings[0:1]

array([[ 2.76095048e-02, -4.15905118e-02,  5.86969182e-02,
        -1.68393347e-02,  2.24761548e-03,  3.27741653e-02,
         5.65874688e-02,  3.51833291e-02, -8.32355488e-03,
        -2.33900007e-02, -4.04648148e-02,  1.34347379e-02,
         9.78252012e-03,  4.66309600e-02,  5.66584291e-03,
        -7.38564134e-03, -2.54976265e-02, -7.36562833e-02,
        -6.42512217e-02,  4.00423110e-02, -4.24858406e-02,
        -1.40947197e-02, -4.76101488e-02, -3.67927067e-02,
         2.64064912e-02,  2.62129437e-02, -2.77090054e-02,
         4.16991161e-03, -4.67778407e-02, -2.06142217e-01,
        -1.71880405e-02, -6.42185509e-02,  3.30880750e-03,
        -1.65726300e-02,  3.43178846e-02, -4.48502712e-02,
        -1.09934602e-02,  1.92808025e-02,  1.87267754e-02,
         5.35727404e-02,  6.41526580e-02, -7.69293867e-04,
        -4.38310951e-03, -6.75241947e-02, -1.50570329e-02,
        -4.28845696e-02, -7.05798483e-03, -1.61854587e-02,
         5.03493957e-02, -1.51244234e-02,  3.14120539e-0

In [13]:
df_embed['review_gte_small_embeddings'] = review_text_embeddings.tolist()
print(len(df_embed))
df_embed.head()

59


Unnamed: 0,Id,Review,review_gte_small_embeddings
0,4e952d699faaee61d9e384f8d7e5513a4753a56b599f20...,I booked London - Mexico City - London Busi...,"[0.027609504759311676, -0.04159051179885864, 0..."
1,934ea944bc1c31893a3eb34b6e9a6a742b254df8c35262...,Flew from Mexico City to Toronto March 2025...,"[0.02565712295472622, -0.010340875014662743, 0..."
2,e3d01b5c957953c87a79ff0869dbcd98c2323b0b5a0ca1...,Flight was awesome. Staff was awesome. My is...,"[-0.000553521269466728, 0.024896137416362762, ..."
3,a116ba41fc9a766f5da46bfcee3468d97f3072da451bc9...,I spoke to Air India call center before boo...,"[0.003399819601327181, -0.014208048582077026, ..."
4,87739dd31255231aa004e32dc06a99a9bfeec53bfa3ac0...,I had the most ridiculous experience on my ...,"[-0.004671781323850155, -0.0002034426870523020..."


#### Convert review category/ies to embeddings using thenlper/gte-small

In [15]:
review_categories = ['Lost Luggage']
    #, 'Cabin Crew Service', 'Ground Crew Service', 
    #'Seat Comfort', 'In-flight Amenities', 'Safety', 'Cleanliness',
    #'On-time Performance', 'Value for money']

In [16]:
df_review_categories = pd.DataFrame(review_categories, columns=['review_category'])
print(len(df_review_categories))
df_review_categories.head()

1


Unnamed: 0,review_category
0,Lost Luggage


In [17]:
if review_categories_embeddings is None:
    embedding_model = SentenceTransformer("thenlper/gte-small")
    review_categories_embeddings = embedding_model.encode(df_review_categories['review_category'])
else:
    review_categories_embeddings = review_categories_embeddings
    
review_categories_embeddings[0:1]

array([[-1.45721631e-02, -1.47483693e-02,  6.21146150e-02,
        -3.49040627e-02,  2.87266634e-02,  2.60636304e-02,
         8.35142434e-02,  2.47845184e-02, -3.39080244e-02,
        -8.66788160e-03,  2.98559386e-02, -5.27598150e-02,
         7.11026862e-02,  5.20410687e-02,  1.13727460e-02,
         5.30625042e-03,  1.43594220e-02,  1.67635325e-02,
        -8.92897472e-02,  3.19607370e-02,  1.40563874e-02,
        -3.45494412e-02, -7.90021271e-02, -1.59687418e-02,
        -7.97322858e-03,  1.69071723e-02, -2.01837160e-02,
        -3.29448655e-02, -5.25507778e-02, -2.15803638e-01,
        -8.31609964e-03, -7.47866556e-02,  1.21304998e-03,
        -3.39180306e-02,  1.88246649e-02, -3.53139592e-03,
        -1.53122824e-02,  2.21802033e-02,  4.79664514e-03,
         2.83719748e-02,  4.51335721e-02,  2.14884859e-02,
         7.38404412e-03, -3.94356102e-02, -4.01152596e-02,
        -5.14237583e-02, -4.85760951e-03, -1.17152985e-02,
         9.37279314e-02, -8.49065036e-02,  1.74151380e-0

In [18]:
df_review_categories['review_category_gte_small_embeddings'] = review_categories_embeddings.tolist()
df_review_categories[['review_category', 'review_category_gte_small_embeddings']]

Unnamed: 0,review_category,review_category_gte_small_embeddings
0,Lost Luggage,"[-0.014572163112461567, -0.014748369343578815,..."


#### Double check review category embeddings mapping

In [20]:
embedding_model = SentenceTransformer("thenlper/gte-small")
embedding_model.encode(df_review_categories['review_category'][0]).tolist()

[-0.014572163112461567,
 -0.014748369343578815,
 0.062114614993333817,
 -0.03490406274795532,
 0.028726663440465927,
 0.02606363035738468,
 0.0835142433643341,
 0.024784518405795097,
 -0.03390802443027496,
 -0.008667881600558758,
 0.029855938628315926,
 -0.052759815007448196,
 0.07110268622636795,
 0.05204106867313385,
 0.011372745968401432,
 0.005306250415742397,
 0.014359422028064728,
 0.016763532534241676,
 -0.08928974717855453,
 0.03196073696017265,
 0.014056387357413769,
 -0.03454944118857384,
 -0.07900212705135345,
 -0.015968741849064827,
 -0.007973228581249714,
 0.01690717227756977,
 -0.020183715969324112,
 -0.03294486552476883,
 -0.0525507777929306,
 -0.21580363810062408,
 -0.008316099643707275,
 -0.07478665560483932,
 0.0012130499817430973,
 -0.03391803056001663,
 0.018824664875864983,
 -0.003531395923346281,
 -0.015312282368540764,
 0.02218020334839821,
 0.004796645138412714,
 0.02837197482585907,
 0.045133572071790695,
 0.021488485857844353,
 0.0073840441182255745,
 -0.03943

In [21]:
df_review_categories['review_category_gte_small_embeddings'][0]

[-0.014572163112461567,
 -0.014748369343578815,
 0.062114614993333817,
 -0.03490406274795532,
 0.028726663440465927,
 0.02606363035738468,
 0.0835142433643341,
 0.024784518405795097,
 -0.03390802443027496,
 -0.008667881600558758,
 0.029855938628315926,
 -0.052759815007448196,
 0.07110268622636795,
 0.05204106867313385,
 0.011372745968401432,
 0.005306250415742397,
 0.014359422028064728,
 0.016763532534241676,
 -0.08928974717855453,
 0.03196073696017265,
 0.014056387357413769,
 -0.03454944118857384,
 -0.07900212705135345,
 -0.015968741849064827,
 -0.007973228581249714,
 0.01690717227756977,
 -0.020183715969324112,
 -0.03294486552476883,
 -0.0525507777929306,
 -0.21580363810062408,
 -0.008316099643707275,
 -0.07478665560483932,
 0.0012130499817430973,
 -0.03391803056001663,
 0.018824664875864983,
 -0.003531395923346281,
 -0.015312282368540764,
 0.02218020334839821,
 0.004796645138412714,
 0.02837197482585907,
 0.045133572071790695,
 0.021488485857844353,
 0.0073840441182255745,
 -0.03943

In [22]:
#import numpy as np

# Double check review category embeddings mapping
assert np.allclose(
    embedding_model.encode(df_review_categories['review_category'][0]).tolist(), 
    df_review_categories['review_category_gte_small_embeddings'][0],
    rtol=1e-3, atol=1e-4 #increase to allow greater deviation
    #rtol=1e-7, atol=1e-8 #decrease for stricter precision
), "Embeddings do not match within tolerance!"

#### Backup embeddings to a .npz file

In [24]:
np.savez(
    EMBEDDING_BACKUP_FILE,
    review_text_embeddings=review_text_embeddings,
    review_categories_embeddings=review_categories_embeddings
)

#### Compute cosine similarity for review category

In [26]:
review_category = 'Lost Luggage'
review_category_tag = 'lost_luggage'

In [27]:
#from sklearn.metrics.pairwise import cosine_similarity

In [28]:
category_embedding = df_review_categories[df_review_categories['review_category'] \
    == review_category]['review_category_gte_small_embeddings'].values[0]
#category_embedding

In [29]:
#df_embed['review_gte_small_embeddings'][0]

In [30]:
# Compute cosine similarity for each row
df_embed[f'{review_category_tag}_cosim'] = df_embed['review_gte_small_embeddings'].apply(
    lambda row_embedding: cosine_similarity(
        np.array(category_embedding).reshape(1, -1),
        np.array(row_embedding).reshape(1, -1)
    )[0, 0]
)

#### Compute kneighbors distance for review category

In [32]:
#from sklearn.neighbors import NearestNeighbors

In [33]:
# Function to normalize a vector (to remove difference in magnitude of phrase vs paragraph
def normalize_vector(vector):
    norm = np.linalg.norm(vector)  # Compute the norm (magnitude)
    if norm == 0:  # Avoid division by zero
        return vector
    return vector / norm

In [34]:
df_embed['review_gte_small_embeddings']

0     [0.027609504759311676, -0.04159051179885864, 0...
1     [0.02565712295472622, -0.010340875014662743, 0...
2     [-0.000553521269466728, 0.024896137416362762, ...
3     [0.003399819601327181, -0.014208048582077026, ...
4     [-0.004671781323850155, -0.0002034426870523020...
5     [-0.0043205805122852325, 0.0172087624669075, 0...
6     [0.03261007368564606, -0.015512325800955296, 0...
7     [0.02823624201118946, -0.026892907917499542, 0...
8     [-0.008413493633270264, -0.0061997403390705585...
9     [0.004110736772418022, -0.025298554450273514, ...
10    [0.00458001671358943, 0.00043261851533316076, ...
11    [-0.005728758871555328, 0.005572275724261999, ...
12    [-0.011622678488492966, -0.009190471842885017,...
13    [0.0047209965996444225, -0.003211679868400097,...
14    [-0.029618369415402412, 0.008060546591877937, ...
15    [0.009167417883872986, -0.04568776860833168, 0...
16    [0.016673877835273743, 0.007571066729724407, 0...
17    [-0.007122817914932966, 0.0058477255515754

In [35]:
np.vstack(df_embed['review_gte_small_embeddings'].values)

array([[ 0.0276095 , -0.04159051,  0.05869692, ..., -0.00796878,
         0.01650766,  0.03025905],
       [ 0.02565712, -0.01034088,  0.05587865, ..., -0.05538622,
         0.00568719,  0.02984068],
       [-0.00055352,  0.02489614,  0.05799267, ...,  0.01051529,
         0.00315212,  0.04188884],
       ...,
       [-0.04854206, -0.02619073,  0.06042211, ..., -0.0188832 ,
        -0.02563647,  0.05299201],
       [-0.02095315, -0.02636958,  0.03184799, ..., -0.01095588,
         0.06233535,  0.03126774],
       [-0.00306949, -0.00667933,  0.02542065, ..., -0.00643257,
        -0.00318415,  0.03293039]])

In [36]:
normalize_vector(np.vstack(df_embed['review_gte_small_embeddings'].values))

array([[ 3.59445136e-03, -5.41462344e-03,  7.64168786e-03, ...,
        -1.03744626e-03,  2.14911489e-03,  3.93939306e-03],
       [ 3.34027290e-03, -1.34626726e-03,  7.27478064e-03, ...,
        -7.21067107e-03,  7.40409647e-04,  3.88492537e-03],
       [-7.20623313e-05,  3.24120102e-03,  7.55000264e-03, ...,
         1.36897371e-03,  4.10370879e-04,  5.45346301e-03],
       ...,
       [-6.31963792e-03, -3.40974259e-03,  7.86628887e-03, ...,
        -2.45838281e-03, -3.33758435e-03,  6.89897238e-03],
       [-2.72786736e-03, -3.43302679e-03,  4.14625576e-03, ...,
        -1.42633450e-03,  8.11537128e-03,  4.07071251e-03],
       [-3.99613525e-04, -8.69574641e-04,  3.30948636e-03, ...,
        -8.37449593e-04, -4.14541442e-04,  4.28717149e-03]])

In [37]:
# Need to be done only once
nn = NearestNeighbors(n_neighbors=len(df_embed), metric='minkowski')
nn.fit(np.vstack(df_embed['review_gte_small_embeddings'].values)) 
#nn.fit(normalize_vector(np.vstack(df_embed['review_gte_small_embeddings'].values))) 

In [38]:
distances, indices = nn.kneighbors(np.array(category_embedding).reshape(1, -1))
#distances, indices = nn.kneighbors(normalize_vector(np.array(category_embedding).reshape(1, -1)))
print(distances)
print(indices)

[[0.42414954 0.52491443 0.53132516 0.54487148 0.54975254 0.56478796
  0.5700305  0.57040311 0.57187845 0.57196421 0.5749666  0.59097731
  0.59703898 0.60033041 0.6067939  0.60680561 0.60771427 0.60830238
  0.60841293 0.61004087 0.61358878 0.61511324 0.61653661 0.62022707
  0.62049663 0.62255771 0.6232137  0.62389167 0.6267174  0.62915892
  0.63065329 0.63473074 0.63771383 0.63913577 0.64107164 0.6419411
  0.64329588 0.64343914 0.64426272 0.64449265 0.64635877 0.64974951
  0.64994049 0.65315953 0.65451268 0.66042938 0.66225523 0.66291119
  0.66354489 0.66474201 0.66964484 0.67040352 0.67249779 0.67314335
  0.67648764 0.68599336 0.68715068 0.69421141 0.70128861]]
[[29 53 50  6 34 33 36  1 16  5  9 54 58 21  3 42 26 49  4 30 12  0 32 45
  48 10 15 43  8 44 47  7 20 41 17 25 28 52  2 23 18 19 56 31 39 57 11 35
  14 13 38 27 37 51 46 24 22 55 40]]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [39]:
index_kndistance_mapping = dict(zip(indices[0], distances[0]))
#index_kndistance_mapping

In [40]:
# Assign kneighbors distances for each row
df_embed[f'{review_category_tag}_kndist'] = df_embed.index.map(index_kndistance_mapping)

#### Verify threshold for cosine similarity / kneighbors distance for review category

In [42]:
category_threshold = 0.84252

df_embed[df_embed[f'{review_category_tag}_cosim'] >= category_threshold]\
[['Id','Review',f'{review_category_tag}_cosim',f'{review_category_tag}_kndist']]\
    .sort_values(by=[f'{review_category_tag}_cosim'], ascending=True)
#0.843110 cutoff

Unnamed: 0,Id,Review,lost_luggage_cosim,lost_luggage_kndist
34,e4e17b13f946bf0ed53ced145e4e96fa9d3d8a69acb600...,We travelled KLM Glasgow to Amsterdam. Outg...,0.848886,0.549753
6,96b3fe28635ca09da3d4b54b2a2f6c3ebfe0c6e98c11b4...,I am not satisfied with the way they handle...,0.851558,0.544871
50,ad69d0471b664c8518d1379cba789840da5d7a9662e00a...,I took my annual vacation to Cebu Philippine...,0.858847,0.531325
53,b44f7c111f8f74e1f0996e66f17aa01062210b2f9ef3fe...,"The flight was just 2 hours delayed, but th...",0.862232,0.524914
29,f45525eb5ca79f86ce815366c95c0d8af3e1a8c4ab8f53...,Iberia lost luggage department is atrocious...,0.910049,0.42415


In [43]:
# Random check a particular review
df_embed[df_embed.index == 12159]\
[['Id','Review',f'{review_category_tag}_cosim',f'{review_category_tag}_kndist']].to_string()
#false positives: 129, 18850, 7268

'Empty DataFrame\nColumns: [Id, Review, lost_luggage_cosim, lost_luggage_kndist]\nIndex: []'

In [44]:
df_embed[df_embed[f'{review_category_tag}_kndist'] <= 0.560]\
[['Id','Review',f'{review_category_tag}_cosim',f'{review_category_tag}_kndist']]\
    .sort_values(by=[f'{review_category_tag}_kndist'], ascending=False)
#0.56015 cutoff (secondary metric only after cosim)

Unnamed: 0,Id,Review,lost_luggage_cosim,lost_luggage_kndist
34,e4e17b13f946bf0ed53ced145e4e96fa9d3d8a69acb600...,We travelled KLM Glasgow to Amsterdam. Outg...,0.848886,0.549753
6,96b3fe28635ca09da3d4b54b2a2f6c3ebfe0c6e98c11b4...,I am not satisfied with the way they handle...,0.851558,0.544871
50,ad69d0471b664c8518d1379cba789840da5d7a9662e00a...,I took my annual vacation to Cebu Philippine...,0.858847,0.531325
53,b44f7c111f8f74e1f0996e66f17aa01062210b2f9ef3fe...,"The flight was just 2 hours delayed, but th...",0.862232,0.524914
29,f45525eb5ca79f86ce815366c95c0d8af3e1a8c4ab8f53...,Iberia lost luggage department is atrocious...,0.910049,0.42415


#### Add review_category_flag based on threshold

In [46]:
df_embed[f'is_{review_category_tag}_flag'] = df_embed[f'{review_category_tag}_cosim'] >= category_threshold

In [47]:
print(len(df_embed))
df_embed.head()

59


Unnamed: 0,Id,Review,review_gte_small_embeddings,lost_luggage_cosim,lost_luggage_kndist,is_lost_luggage_flag
0,4e952d699faaee61d9e384f8d7e5513a4753a56b599f20...,I booked London - Mexico City - London Busi...,"[0.027609504759311676, -0.04159051179885864, 0...",0.810818,0.615113,False
1,934ea944bc1c31893a3eb34b6e9a6a742b254df8c35262...,Flew from Mexico City to Toronto March 2025...,"[0.02565712295472622, -0.010340875014662743, 0...",0.83732,0.570403,False
2,e3d01b5c957953c87a79ff0869dbcd98c2323b0b5a0ca1...,Flight was awesome. Staff was awesome. My is...,"[-0.000553521269466728, 0.024896137416362762, ...",0.792463,0.644263,False
3,a116ba41fc9a766f5da46bfcee3468d97f3072da451bc9...,I spoke to Air India call center before boo...,"[0.003399819601327181, -0.014208048582077026, ...",0.815901,0.606794,False
4,87739dd31255231aa004e32dc06a99a9bfeec53bfa3ac0...,I had the most ridiculous experience on my ...,"[-0.004671781323850155, -0.0002034426870523020...",0.814917,0.608413,False


#### Merge back review text labels to original dataframe

In [49]:
df_merged = df.merge(df_embed, how='left', on='Id', suffixes=['','_embed']).drop(columns=['Review_embed'])
len(df_merged)
df_merged.head()

Unnamed: 0,RowId,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Top Review Image Url,Aircraft,Type Of Traveller,...,Wifi & Connectivity,Value For Money,Recommended,Id,sentiment_label,sentiment_scores,review_gte_small_embeddings,lost_luggage_cosim,lost_luggage_kndist,is_lost_luggage_flag
0,0,Aeromexico,5.0,"""Return flight had problems""",2025-03-30,True,I booked London - Mexico City - London Busi...,,Boeing 787,Solo Leisure,...,,3,no,4e952d699faaee61d9e384f8d7e5513a4753a56b599f20...,positive,0.648737,"[0.027609504759311676, -0.04159051179885864, 0...",0.810818,0.615113,False
1,1,Aeromexico,1.0,"""Very disappointed""",2025-03-29,True,Flew from Mexico City to Toronto March 2025...,,,Couple Leisure,...,,2,no,934ea944bc1c31893a3eb34b6e9a6a742b254df8c35262...,negative,0.839803,"[0.02565712295472622, -0.010340875014662743, 0...",0.83732,0.570403,False
2,2,Air Canada rouge,10.0,"""showing if the bathroom is occupied""",2025-03-29,False,Flight was awesome. Staff was awesome. My is...,https://www.airlinequality.com/wp-content/uplo...,,Couple Leisure,...,5.0,5,yes,e3d01b5c957953c87a79ff0869dbcd98c2323b0b5a0ca1...,neutral,0.525435,"[-0.000553521269466728, 0.024896137416362762, ...",0.792463,0.644263,False
3,3,Air India,1.0,"""can’t carry more than 15 kg""",2025-04-03,True,I spoke to Air India call center before boo...,,,Solo Leisure,...,,1,no,a116ba41fc9a766f5da46bfcee3468d97f3072da451bc9...,neutral,0.675186,"[0.003399819601327181, -0.014208048582077026, ...",0.815901,0.606794,False
4,4,Air Transat,2.0,"""insists that my carry-on is too big""",2025-03-31,True,I had the most ridiculous experience on my ...,,A330-200,Couple Leisure,...,,2,no,87739dd31255231aa004e32dc06a99a9bfeec53bfa3ac0...,negative,0.787324,"[-0.004671781323850155, -0.0002034426870523020...",0.814917,0.608413,False


#### Persist to CSV file

In [51]:
df_merged.to_csv(f"data/silver_{ETL_DATE}_Airline_Reviews_LostLuggageLabel.csv", index=False, date_format='%Y-%m-%d')