In [89]:
import chardet

# Guess the text encoding from a leading sample of raw bytes instead of the
# whole file; the result therefore only describes the sampled prefix.
sample_size = 100000  # number of bytes to sample (adjust if needed)
with open("datasets/flights_bangladesh.csv", mode="rb") as f:
    raw_data = f.read(sample_size)

# Detection runs on the in-memory sample, so the file is already closed here.
result = chardet.detect(raw_data)
detected_encoding = result["encoding"]
print("Detected encoding:", detected_encoding)

Detected encoding: ascii


In [90]:
import pandas as pd

# Load the flights table.
#
# The chardet check in the previous cell only read the first 100,000 bytes,
# so "ascii" is not guaranteed for the rest of the file. UTF-8 decodes pure
# ASCII identically and also preserves any non-ASCII characters, which
# encoding="ascii" would silently turn into U+FFFD replacement characters.
# Bytes that are not valid UTF-8 are still replaced instead of raising.
df = pd.read_csv(
    "datasets/flights_bangladesh.csv",
    encoding="utf-8",
    encoding_errors="replace",
)

# Display the loaded frame as the cell output.
df

Unnamed: 0,Airline,Source,Source Name,Destination,Destination Name,Departure Date & Time,Arrival Date & Time,Duration (hrs),Stopovers,Aircraft Type,Class,Booking Source,Base Fare (BDT),Tax & Surcharge (BDT),Total Fare (BDT),Seasonality,Days Before Departure
0,Malaysian Airlines,CXB,Cox's Bazar Airport,CCU,Netaji Subhas Chandra Bose International Airpo...,2025-11-17 06:25:00,2025-11-17 07:38:10,1.219526,Direct,Airbus A320,Economy,Online Website,21131.225021,5169.683753,26300.908775,Regular,10
1,Cathay Pacific,BZL,Barisal Airport,CGP,"Shah Amanat International Airport, Chittagong",2025-03-16 00:17:00,2025-03-16 00:53:31,0.608638,Direct,Airbus A320,First Class,Travel Agency,11605.395471,200.000000,11805.395471,Regular,14
2,British Airways,ZYL,"Osmani International Airport, Sylhet",KUL,Kuala Lumpur International Airport,2025-12-13 12:03:00,2025-12-13 14:44:22,2.689651,1 Stop,Boeing 787,Economy,Travel Agency,39882.499349,11982.374902,51864.874251,Winter Holidays,83
3,Singapore Airlines,RJH,"Shah Makhdum Airport, Rajshahi",DAC,"Hazrat Shahjalal International Airport, Dhaka",2025-05-30 03:21:00,2025-05-30 04:02:09,0.686054,Direct,Airbus A320,Economy,Direct Booking,4435.607340,200.000000,4635.607340,Regular,56
4,British Airways,SPD,Saidpur Airport,YYZ,Toronto Pearson International Airport,2025-04-25 09:14:00,2025-04-25 23:17:20,14.055609,1 Stop,Airbus A350,Business,Direct Booking,59243.806146,14886.570922,74130.377068,Regular,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56995,Kuwait Airways,JSR,Jessore Airport,CCU,Netaji Subhas Chandra Bose International Airpo...,2025-08-11 00:10:00,2025-08-11 00:40:00,0.500000,Direct,Airbus A320,Business,Online Website,79974.471748,13996.170762,93970.642511,Regular,51
56996,Kuwait Airways,CGP,"Shah Amanat International Airport, Chittagong",CCU,Netaji Subhas Chandra Bose International Airpo...,2025-09-19 23:53:00,2025-09-20 01:09:30,1.275145,Direct,Airbus A320,First Class,Online Website,193471.364277,31020.704642,224492.068918,Regular,31
56997,Biman Bangladesh Airlines,CXB,Cox's Bazar Airport,JSR,Jessore Airport,2025-11-08 09:23:00,2025-11-08 10:35:59,1.216583,Direct,Airbus A320,Economy,Direct Booking,4375.365554,200.000000,4575.365554,Regular,22
56998,British Airways,SPD,Saidpur Airport,YYZ,Toronto Pearson International Airport,2025-11-25 10:23:00,2025-11-26 00:20:37,13.960502,1 Stop,Airbus A350,Economy,Direct Booking,40903.602688,12135.540403,53039.143091,Regular,20


In [91]:
# Replace the generic Source/Destination headers with explicit
# departure/arrival wording for the airport code and airport name columns.
airport_column_names = {
    "Source": "Departure Airport Code",
    "Source Name": "Dep Airport Name",
    "Destination": "Arrival Airport Code",
    "Destination Name": "Arr Airport Name",
}
df = df.rename(columns=airport_column_names)

In [92]:
# Boolean mask of missing cells, computed once and reused below.
na_mask = df.isna()

# Total number of missing cells across the whole frame.
missing_values = na_mask.sum().sum()
print(f"Total missing values: {missing_values}")

# Rows holding at least one missing value, rendered as a markdown table
# (only the header is printed when no such row exists).
null_rows = df.loc[na_mask.any(axis=1)]
print(null_rows.to_markdown())

Total missing values: 0
| Airline   | Departure Airport Code   | Dep Airport Name   | Arrival Airport Code   | Arr Airport Name   | Departure Date & Time   | Arrival Date & Time   | Duration (hrs)   | Stopovers   | Aircraft Type   | Class   | Booking Source   | Base Fare (BDT)   | Tax & Surcharge (BDT)   | Total Fare (BDT)   | Seasonality   | Days Before Departure   |
|-----------|--------------------------|--------------------|------------------------|--------------------|-------------------------|-----------------------|------------------|-------------|-----------------|---------|------------------|-------------------|-------------------------|--------------------|---------------|-------------------------|


In [93]:
# List the dtype pandas assigned to each column.
column_dtypes = df.dtypes
print(column_dtypes)

Airline                    object
Departure Airport Code     object
Dep Airport Name           object
Arrival Airport Code       object
Arr Airport Name           object
Departure Date & Time      object
Arrival Date & Time        object
Duration (hrs)            float64
Stopovers                  object
Aircraft Type              object
Class                      object
Booking Source             object
Base Fare (BDT)           float64
Tax & Surcharge (BDT)     float64
Total Fare (BDT)          float64
Seasonality                object
Days Before Departure       int64
dtype: object


# Filter the cities to those that appear in the best-cities list

In [94]:
# Load the airport reference table; bytes that cannot be decoded as UTF-8 are
# replaced rather than raising an error.
airports_csv = "datasets/airports.csv"
df_airport = pd.read_csv(airports_csv, encoding="utf-8", encoding_errors="replace")

# Render the full table as markdown text and print it.
airport_table_md = df_airport.to_markdown()
print(airport_table_md)

|      | Name                                                                     | City                             | Country                          | IATA   | ICAO   |    Latitude |   Longitude |
|-----:|:-------------------------------------------------------------------------|:---------------------------------|:---------------------------------|:-------|:-------|------------:|------------:|
|    0 | Goroka Airport                                                           | Goroka                           | Papua New Guinea                 | GKA    | AYGA   |  -6.08169   |  145.392    |
|    1 | Madang Airport                                                           | Madang                           | Papua New Guinea                 | MAG    | AYMD   |  -5.20708   |  145.789    |
|    2 | Mount Hagen Kagamuga Airport                                             | Mount Hagen                      | Papua New Guinea                 | HGU    | AYMH   |  -5.82679   |  144.296    |


In [95]:
def _attach_airport_location(flights, airports, code_column, suffix):
    """Add ``City <suffix>`` and ``Country <suffix>`` columns to ``flights``.

    Parameters
    ----------
    flights : pandas.DataFrame
        Flight rows; must contain ``code_column``.
    airports : pandas.DataFrame
        Airport reference table with ``IATA``, ``City`` and ``Country`` columns.
    code_column : str
        Column in ``flights`` holding the IATA code to look up.
    suffix : str
        Label appended to the new column names (e.g. ``"Arr"`` or ``"Dep"``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as ``flights`` plus the two location
        columns; codes without a match get missing values (left join).
    """
    city_col = f"City {suffix}"
    country_col = f"Country {suffix}"

    # One row per IATA code: a left join against duplicated or missing keys
    # could silently multiply flight rows.
    lookup = (
        airports[["IATA", "City", "Country"]]
        .dropna(subset=["IATA"])
        .drop_duplicates(subset="IATA")
        .rename(columns={"City": city_col, "Country": country_col})
    )

    return (
        flights
        # Drop results of a previous run so re-executing the cell does not
        # produce clashing/duplicated location columns.
        .drop(columns=[city_col, country_col], errors="ignore")
        .merge(
            lookup,
            left_on=code_column,
            right_on="IATA",
            how="left",
            validate="many_to_one",  # guard: each flight matches at most one airport
        )
        .drop(columns="IATA")  # Remove the helper join key
    )


# Merge for Destination
df = _attach_airport_location(df, df_airport, "Arrival Airport Code", "Arr")

# Merge for Origin
df = _attach_airport_location(df, df_airport, "Departure Airport Code", "Dep")

# Display the updated DataFrame
print(df.head())

              Airline Departure Airport Code  \
0  Malaysian Airlines                    CXB   
1      Cathay Pacific                    BZL   
2     British Airways                    ZYL   
3  Singapore Airlines                    RJH   
4     British Airways                    SPD   

                       Dep Airport Name Arrival Airport Code  \
0                   Cox's Bazar Airport                  CCU   
1                       Barisal Airport                  CGP   
2  Osmani International Airport, Sylhet                  KUL   
3        Shah Makhdum Airport, Rajshahi                  DAC   
4                       Saidpur Airport                  YYZ   

                                    Arr Airport Name Departure Date & Time  \
0  Netaji Subhas Chandra Bose International Airpo...   2025-11-17 06:25:00   
1      Shah Amanat International Airport, Chittagong   2025-03-16 00:17:00   
2                 Kuala Lumpur International Airport   2025-12-13 12:03:00   
3      Hazrat 

In [96]:
# BDT -> EUR conversion rate applied to every fare column below.
bdt_to_eur_rate = 0.0085  # Example exchange rate, update as needed

# Fare column stems; each exists as "<stem> (BDT)" and gains a "<stem> (EUR)"
# counterpart rounded to two decimals.
fare_columns = ["Base Fare", "Tax & Surcharge", "Total Fare"]
for fare in fare_columns:
    df[f"{fare} (EUR)"] = df[f"{fare} (BDT)"].mul(bdt_to_eur_rate).round(2)

# The original BDT columns are no longer needed once converted.
df = df.drop(columns=[f"{fare} (BDT)" for fare in fare_columns])

## Germany appears in neither the origin nor the destination countries

In [97]:
# Report, for each country column, whether any row has the value "Germany".
for country_column in ("Country Arr", "Country Dep"):
    has_germany = "Germany" in df[country_column].values
    print(f"Germany in '{country_column}':", has_germany)

Germany in 'Country Arr': False
Germany in 'Country Dep': False


In [98]:
# Distinct arrival countries (missing values excluded), in alphabetical order.
arrival_countries = df["Country Arr"].dropna().unique()
print(sorted(arrival_countries))

['Bangladesh', 'Canada', 'India', 'Malaysia', 'Qatar', 'Saudi Arabia', 'Singapore', 'Thailand', 'Turkey', 'United Arab Emirates', 'United Kingdom', 'United States']


In [99]:
# Display the current state of the frame as the cell output.
df

Unnamed: 0,Airline,Departure Airport Code,Dep Airport Name,Arrival Airport Code,Arr Airport Name,Departure Date & Time,Arrival Date & Time,Duration (hrs),Stopovers,Aircraft Type,...,Booking Source,Seasonality,Days Before Departure,City Arr,Country Arr,City Dep,Country Dep,Base Fare (EUR),Tax & Surcharge (EUR),Total Fare (EUR)
0,Malaysian Airlines,CXB,Cox's Bazar Airport,CCU,Netaji Subhas Chandra Bose International Airpo...,2025-11-17 06:25:00,2025-11-17 07:38:10,1.219526,Direct,Airbus A320,...,Online Website,Regular,10,Kolkata,India,Cox's Bazar,Bangladesh,179.62,43.94,223.56
1,Cathay Pacific,BZL,Barisal Airport,CGP,"Shah Amanat International Airport, Chittagong",2025-03-16 00:17:00,2025-03-16 00:53:31,0.608638,Direct,Airbus A320,...,Travel Agency,Regular,14,Chittagong,Bangladesh,Barisal,Bangladesh,98.65,1.70,100.35
2,British Airways,ZYL,"Osmani International Airport, Sylhet",KUL,Kuala Lumpur International Airport,2025-12-13 12:03:00,2025-12-13 14:44:22,2.689651,1 Stop,Boeing 787,...,Travel Agency,Winter Holidays,83,Kuala Lumpur,Malaysia,Sylhet Osmani,Bangladesh,339.00,101.85,440.85
3,Singapore Airlines,RJH,"Shah Makhdum Airport, Rajshahi",DAC,"Hazrat Shahjalal International Airport, Dhaka",2025-05-30 03:21:00,2025-05-30 04:02:09,0.686054,Direct,Airbus A320,...,Direct Booking,Regular,56,Dhaka,Bangladesh,Rajshahi,Bangladesh,37.70,1.70,39.40
4,British Airways,SPD,Saidpur Airport,YYZ,Toronto Pearson International Airport,2025-04-25 09:14:00,2025-04-25 23:17:20,14.055609,1 Stop,Airbus A350,...,Direct Booking,Regular,90,Toronto,Canada,Saidpur,Bangladesh,503.57,126.54,630.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56995,Kuwait Airways,JSR,Jessore Airport,CCU,Netaji Subhas Chandra Bose International Airpo...,2025-08-11 00:10:00,2025-08-11 00:40:00,0.500000,Direct,Airbus A320,...,Online Website,Regular,51,Kolkata,India,Jessore,Bangladesh,679.78,118.97,798.75
56996,Kuwait Airways,CGP,"Shah Amanat International Airport, Chittagong",CCU,Netaji Subhas Chandra Bose International Airpo...,2025-09-19 23:53:00,2025-09-20 01:09:30,1.275145,Direct,Airbus A320,...,Online Website,Regular,31,Kolkata,India,Chittagong,Bangladesh,1644.51,263.68,1908.18
56997,Biman Bangladesh Airlines,CXB,Cox's Bazar Airport,JSR,Jessore Airport,2025-11-08 09:23:00,2025-11-08 10:35:59,1.216583,Direct,Airbus A320,...,Direct Booking,Regular,22,Jessore,Bangladesh,Cox's Bazar,Bangladesh,37.19,1.70,38.89
56998,British Airways,SPD,Saidpur Airport,YYZ,Toronto Pearson International Airport,2025-11-25 10:23:00,2025-11-26 00:20:37,13.960502,1 Stop,Airbus A350,...,Direct Booking,Regular,20,Toronto,Canada,Saidpur,Bangladesh,347.68,103.15,450.83


In [100]:
# Build a {column: "string"} mapping for every object-dtype column and convert
# them in a single astype call; all other columns keep their dtype.
string_conversions = {
    column: "string"
    for column, dtype in df.dtypes.items()
    if dtype == "object"
}
df = df.astype(string_conversions)

# Print the dtypes to confirm the conversion.
print(df.dtypes)

Airline                   string[python]
Departure Airport Code    string[python]
Dep Airport Name          string[python]
Arrival Airport Code      string[python]
Arr Airport Name          string[python]
Departure Date & Time     string[python]
Arrival Date & Time       string[python]
Duration (hrs)                   float64
Stopovers                 string[python]
Aircraft Type             string[python]
Class                     string[python]
Booking Source            string[python]
Seasonality               string[python]
Days Before Departure              int64
City Arr                  string[python]
Country Arr               string[python]
City Dep                  string[python]
Country Dep               string[python]
Base Fare (EUR)                  float64
Tax & Surcharge (EUR)            float64
Total Fare (EUR)                 float64
dtype: object


In [101]:
import os

# Output location: the target folder and the CSV file inside it.
folder_path = "converted_datasets"
file_path = os.path.join(folder_path, "flights_new.csv")

# Create the target folder if needed; an already existing folder is fine.
os.makedirs(folder_path, exist_ok=True)

# Write the frame as a UTF-8 CSV without the index column.
df.to_csv(file_path, encoding='utf-8', index=False)

print(f"CSV file saved successfully at: {file_path}")


CSV file saved successfully at: converted_datasets\flights_new.csv
