In [6]:
# preprocess_data.ipynb

import re

import numpy as np
import pandas as pd

# ✅ 1. Load tab-separated data
# The separator is passed explicitly: with the default comma split, a
# tab-delimited file yields one column whose label is the whole header line.
df = pd.read_csv('../Data/raw.csv', sep='\t')

# ✅ 2. Date of Journey → Day & Month
# Each source column is parsed exactly once and both derived features are read
# from that single result; parsing per feature doubles the work and the number
# of parser warnings. `pop` hands back the column and removes it from the frame,
# so no separate drop is needed and the final column order is unchanged.
# errors='coerce' turns unparseable dates into NaT, i.e. NaN day/month values.
# NOTE(review): no explicit `format=`/`dayfirst=` is given, so pandas infers the
# day/month order itself — confirm against the raw data and pin the format.
journey_date = pd.to_datetime(df.pop('Date_of_Journey'), errors='coerce')
df['Journey_day'] = journey_date.dt.day
df['Journey_month'] = journey_date.dt.month

# ✅ 3. Departure Time → Hour & Minute
# No errors='coerce' here: an unparseable time raises instead of becoming NaN.
dep_time = pd.to_datetime(df.pop('Dep_Time'))
df['Dep_hour'] = dep_time.dt.hour
df['Dep_min'] = dep_time.dt.minute

# ✅ 4. Arrival Time → Hour & Minute
# Only the clock part is kept; any date component in the value is discarded.
arrival_time = pd.to_datetime(df.pop('Arrival_Time'))
df['Arrival_hour'] = arrival_time.dt.hour
df['Arrival_min'] = arrival_time.dt.minute

# ✅ 5. Duration → Total minutes
# Minutes contributed by one unit of each recognised suffix.
_UNIT_MINUTES = {'h': 60, 'm': 1}
# One "<number><unit>" component, e.g. "2h" or "50 m".
_DURATION_TOKEN = re.compile(r'(\d+)\s*([hm])')
# A whole duration: one or more components and nothing else.
_DURATION_TEXT = re.compile(r'(?:\d+\s*[hm]\s*)+')


def convert_to_minutes(duration):
    """Convert a duration string such as ``"2h 50m"`` to total minutes.

    Each number is bound to the unit suffix that follows it (``h`` or ``m``,
    case-insensitive), so the result does not depend on the order of the
    components and a missing component simply contributes nothing.

    Args:
        duration: Text like ``"2h 50m"``, ``"19h"`` or ``"5m"``. Surrounding
            whitespace and whitespace between number and unit are ignored.

    Returns:
        The duration in whole minutes as an ``int``. An empty or
        whitespace-only string yields ``0``.

    Raises:
        ValueError: If the text contains anything other than
            ``<number>h`` / ``<number>m`` components.
        AttributeError: If ``duration`` is not a string (for example a
            missing value represented as float NaN).
    """
    text = duration.strip().lower()
    if not text:
        # Kept for backward compatibility: blank input counts as zero minutes.
        return 0
    # Validate the whole string first so stray tokens are reported instead of
    # being silently ignored or misread as hours/minutes by position.
    if _DURATION_TEXT.fullmatch(text) is None:
        raise ValueError(f"Unrecognised duration: {duration!r}")
    return sum(
        int(amount) * _UNIT_MINUTES[unit]
        for amount, unit in _DURATION_TOKEN.findall(text)
    )

# Replace each textual duration with its length in minutes.
df['Duration'] = df['Duration'].apply(convert_to_minutes)

# ✅ 6. Total Stops → Numeric
# Labels that are not keys of the mapping come out of `map` as NaN.
stop_map = {
    'non-stop': 0,
    '1 stop': 1,
    '2 stops': 2,
    '3 stops': 3,
    '4 stops': 4,
}
df['Total_Stops'] = df['Total_Stops'].map(stop_map)

# ✅ 7. Drop columns not needed
df = df.drop(columns=['Route', 'Additional_Info'])

# ✅ 8. One-hot encode categorical columns
# drop_first=True omits the first level of every encoded column.
categorical_columns = ['Airline', 'Source', 'Destination']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# ✅ 9. Save cleaned data
# index=False keeps the row index out of the written file.
df.to_csv('../Data/processed_data.csv', index=False)

# ✅ 10. Final Check
final_shape = df.shape
final_columns = df.columns.tolist()
print("✅ Preprocessing Complete. Final shape:", final_shape)
print("📊 Final Columns:", final_columns)


  df['Journey_day'] = pd.to_datetime(df['Date_of_Journey'], errors='coerce').dt.day
  df['Journey_month'] = pd.to_datetime(df['Date_of_Journey'], errors='coerce').dt.month
  df['Dep_hour'] = pd.to_datetime(df['Dep_Time']).dt.hour
  df['Dep_min'] = pd.to_datetime(df['Dep_Time']).dt.minute
  df['Arrival_hour'] = pd.to_datetime(df['Arrival_Time']).dt.hour
  df['Arrival_min'] = pd.to_datetime(df['Arrival_Time']).dt.minute


✅ Preprocessing Complete. Final shape: (5999, 29)
📊 Final Columns: ['Duration', 'Total_Stops', 'Price', 'Journey_day', 'Journey_month', 'Dep_hour', 'Dep_min', 'Arrival_hour', 'Arrival_min', 'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business', 'Airline_Multiple carriers', 'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet', 'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai', 'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata', 'Destination_New Delhi']


In [5]:
# Quick look at the column labels. A single label containing tab characters
# means the file was read with the wrong delimiter.
print(list(df.columns))


['Airline\tDate_of_Journey\tSource\tDestination\tRoute\tDep_Time\tArrival_Time\tDuration\tTotal_Stops\tAdditional_Info\tPrice']
