In [1]:
import os
import pandas as pd
import numpy as np

# The cells that actually read and write data in this notebook resolve their
# paths one level above the working directory ("..", "data", ...), so the
# shared constants are anchored on the same root instead of a bare "data".
DATA_DIR = os.path.join("..", "data")

# Same raw CSV file the load cell reads.
RAW_PATH = os.path.join(DATA_DIR, "raw", "Airline Dataset Updated - v2.csv")
# Location reserved for a processed sample file.
SAMPLE_PATH = os.path.join(DATA_DIR, "processed", "sample_200k.csv")

# Raise pandas' display limits so wide frames are not truncated in cell output.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)


In [8]:
# Load the raw airline CSV, reading at most the first 200,000 data rows,
# then show (rows, columns) as the cell result.
df = pd.read_csv(
    os.path.join("..", "data", "raw", "Airline Dataset Updated - v2.csv"),
    nrows=200_000,
)
df.shape



(98619, 15)

In [9]:
# Preview the first 15 rows of the loaded frame.
df.head(n=15)

Unnamed: 0,Passenger ID,First Name,Last Name,Gender,Age,Nationality,Airport Name,Airport Country Code,Country Name,Airport Continent,Continents,Departure Date,Arrival Airport,Pilot Name,Flight Status
0,ABVWIg,Edithe,Leggis,Female,62,Japan,Coldfoot Airport,US,United States,NAM,North America,6/28/2022,CXF,Fransisco Hazeldine,On Time
1,jkXXAX,Elwood,Catt,Male,62,Nicaragua,Kugluktuk Airport,CA,Canada,NAM,North America,12/26/2022,YCO,Marla Parsonage,On Time
2,CdUz2g,Darby,Felgate,Male,67,Russia,Grenoble-Isère Airport,FR,France,EU,Europe,1/18/2022,GNB,Rhonda Amber,On Time
3,BRS38V,Dominica,Pyle,Female,71,China,Ottawa / Gatineau Airport,CA,Canada,NAM,North America,9/16/2022,YND,Kacie Commucci,Delayed
4,9kvTLo,Bay,Pencost,Male,21,China,Gillespie Field,US,United States,NAM,North America,2/25/2022,SEE,Ebonee Tree,On Time
5,nMJKVh,Lora,Durbann,Female,55,Brazil,Coronel Horácio de Mattos Airport,BR,Brazil,SAM,South America,06-10-2022,LEC,Inglis Dolley,On Time
6,8IPFPE,Rand,Bram,Male,73,Ivory Coast,Duxford Aerodrome,GB,United Kingdom,EU,Europe,10/30/2022,QFO,Stanislas Tiffin,Cancelled
7,pqixbY,Perceval,Dallosso,Male,36,Vietnam,Maestro Wilson Fonseca Airport,BR,Brazil,SAM,South America,04-07-2022,STM,Sharyl Eastmead,Cancelled
8,QNAs2R,Aleda,Pigram,Female,35,Palestinian Territory,Venice Marco Polo Airport,IT,Italy,EU,Europe,8/20/2022,VCE,Daryn Bardsley,On Time
9,3jmudz,Burlie,Schustl,Male,13,Thailand,Vermilion Airport,CA,Canada,NAM,North America,04-06-2022,YVG,Alameda Carlyle,On Time


In [10]:
# One row per column of df: its name and its dtype rendered as text,
# ordered by dtype so columns of the same type sit together.
dtype_names = df.dtypes.astype(str)
schema = pd.DataFrame({"column": df.columns, "dtype": dtype_names})
schema = schema.sort_values("dtype")

print("Total columns:", df.shape[1])
schema


Total columns: 15


Unnamed: 0,column,dtype
Age,Age,int64
Passenger ID,Passenger ID,object
First Name,First Name,object
Last Name,Last Name,object
Gender,Gender,object
Nationality,Nationality,object
Airport Name,Airport Name,object
Airport Country Code,Airport Country Code,object
Country Name,Country Name,object
Airport Continent,Airport Continent,object


In [11]:
# Number of missing cells per column, largest first.
missing_count = df.isna().sum().sort_values(ascending=False)
# The same counts as a percentage of all rows, rounded to two decimals.
missing_pct = missing_count.div(len(df)).mul(100).round(2)

# Both measures side by side, indexed by column name.
missing_summary = missing_count.to_frame("missing_count").assign(
    missing_pct=missing_pct
)

print("Top 20 columns with missing values:")
missing_summary.head(n=20)


Top 20 columns with missing values:


Unnamed: 0,missing_count,missing_pct
Passenger ID,0,0.0
First Name,0,0.0
Last Name,0,0.0
Gender,0,0.0
Age,0,0.0
Nationality,0,0.0
Airport Name,0,0.0
Airport Country Code,0,0.0
Country Name,0,0.0
Airport Continent,0,0.0


In [12]:
# Count rows that repeat an earlier row across every column; the first
# occurrence of each row is not counted as a duplicate.
dup_count = df.duplicated(keep="first").sum()
print("Duplicate rows found:", dup_count)


Duplicate rows found: 0


In [13]:
# Descriptive statistics for the numeric columns only, transposed so each
# column of df becomes one row of the summary.
desc_num = df.describe(include="number").transpose()
print("Numeric summary (first 25 rows):")
desc_num.head(n=25)


Numeric summary (first 25 rows):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,98619.0,45.504021,25.929849,1.0,23.0,46.0,68.0,90.0


In [15]:

# Names of every column pandas classifies as numeric.
num_cols = df.select_dtypes(include="number").columns.tolist()
print("Numeric columns found:", num_cols)

# For each numeric column, how many entries are below zero. Anything that
# cannot be parsed as a number is coerced to NaN and so is never counted.
neg_counts = {
    col: int((pd.to_numeric(df[col], errors="coerce") < 0).sum())
    for col in num_cols
}

# Tabulate the counts, columns with the most negative values first.
neg_table = pd.DataFrame({"negative_values_count": neg_counts})
neg_table = neg_table.sort_values("negative_values_count", ascending=False)
print("Columns with negative values (top 15):")
neg_table.head(15)


Numeric columns found: ['Age']
Columns with negative values (top 15):


Unnamed: 0,negative_values_count
Age,0


In [16]:

# Minimum, maximum and mean of each numeric column, one row per column,
# ordered so the column with the largest maximum comes first.
numeric_stats = df[num_cols].describe().T
summary = numeric_stats[["min", "max", "mean"]].sort_values("max", ascending=False)
print("Numeric range summary (top 20 by max):")
summary.head(n=20)


Numeric range summary (top 20 by max):


Unnamed: 0,min,max,mean
Age,1.0,90.0,45.504021


In [17]:
# The ten columns with the most missing cells, largest count first.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

Passenger ID            0
First Name              0
Last Name               0
Gender                  0
Age                     0
Nationality             0
Airport Name            0
Airport Country Code    0
Country Name            0
Airport Continent       0
dtype: int64

In [19]:

# Destination of the processed sample, relative to the working directory.
processed_path = os.path.join("..", "data", "processed", "week1_sample.csv")

# to_csv does not create missing folders, so make sure the target directory
# exists first; exist_ok=True keeps re-runs from failing once it is there.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)

# Save file without the row index, so the CSV holds only the data columns.
df.to_csv(processed_path, index=False)

print("Week 1 processed sample saved at:")
print(processed_path)



Week 1 processed sample saved at:
..\data\processed\week1_sample.csv
