# India Aviation — Domestic City Preprocessing


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
import seaborn as sns


## 1) Load & Explore

In [3]:
# Path of the input CSV; everything downstream derives from this frame.
FILE_PATH = 'city.csv'
df = pd.read_csv(FILE_PATH)

# Structural summary first (row/column counts and column labels) ...
print('Shape:', df.shape)
print('Columns:', df.columns.tolist())

# ... then a preview of the first rows as the cell's rich output.
print('\nHead:')
df.head()

Shape: (56047, 10)
Columns: ['Year', 'Month', 'City1', 'City2', 'PaxToCity2', 'PaxFromCity2', 'FreightToCity2', 'FreightFromCity2', 'MailToCity2', 'MailFromCity2']

Head:


Unnamed: 0,Year,Month,City1,City2,PaxToCity2,PaxFromCity2,FreightToCity2,FreightFromCity2,MailToCity2,MailFromCity2
0,2021,12,ADAMPUR,AHMEDABAD,30.0,0.0,0.0,0.0,0.0,0.0
1,2018,5,ADAMPUR,DELHI,2258.0,2190.0,0.0,0.86,0.0,0.0
2,2018,6,ADAMPUR,DELHI,1984.0,1981.0,0.0,4.33,0.0,0.0
3,2018,7,ADAMPUR,DELHI,2333.0,2205.0,0.0,1.93,0.0,0.0
4,2018,8,ADAMPUR,DELHI,2146.0,1981.0,0.0,0.48,0.0,0.0


In [4]:
# Per-column count of missing (NaN) cells in the raw frame.
print('\nMissing values:')
df.isna().sum()


Missing values:


Year                    0
Month                   0
City1                   0
City2                   0
PaxToCity2              0
PaxFromCity2            0
FreightToCity2      11695
FreightFromCity2    11694
MailToCity2         11700
MailFromCity2       11697
dtype: int64

In [5]:
# Show the dtype pandas assigned to each column of the raw frame.
print('\nDtypes:')
df.dtypes


Dtypes:


Year                  int64
Month                 int64
City1                object
City2                object
PaxToCity2          float64
PaxFromCity2        float64
FreightToCity2      float64
FreightFromCity2    float64
MailToCity2         float64
MailFromCity2       float64
dtype: object

## 2) Missing Value Imputation


In [None]:
# Work on a copy so the raw frame `df` is left untouched.
df_proc = df.copy()

# Normalise column labels: trim surrounding whitespace and lower-case them.
df_proc.columns = [label.strip().lower() for label in df_proc.columns]

# Columns stored as int64/float64 are the imputation targets.
numeric_cols = list(df_proc.select_dtypes(include=['int64', 'float64']).columns)
print('Numeric columns:',numeric_cols)

# Fill each numeric column's gaps with that column's own median.
column_medians = df_proc[numeric_cols].median()
df_proc[numeric_cols] = df_proc[numeric_cols].fillna(column_medians)

print('Missing after imputation:')
df_proc.isna().sum()


Numeric columns: ['year', 'month', 'paxtocity2', 'paxfromcity2', 'freighttocity2', 'freightfromcity2', 'mailtocity2', 'mailfromcity2']
Missing after imputation:


year                0
month               0
city1               0
city2               0
paxtocity2          0
paxfromcity2        0
freighttocity2      0
freightfromcity2    0
mailtocity2         0
mailfromcity2       0
dtype: int64

## 3) Outlier Detection (Z-score)


In [None]:
# Column-wise z-scores using the population standard deviation (ddof=0).
numeric_frame = df_proc[numeric_cols]
col_mean = numeric_frame.mean()
col_std = numeric_frame.std(ddof=0)
z = numeric_frame.sub(col_mean).div(col_std)

# A cell is flagged when it lies more than 3 standard deviations from the mean.
outlier_mask = z.abs().gt(3)

# Number of flagged cells per column, largest first.
outlier_counts = outlier_mask.sum().sort_values(ascending=False)
print('Outliers (|z|>3) per column:')
outlier_counts


Outliers (|z|>3) per column:


paxtocity2          1333
paxfromcity2        1317
mailtocity2         1037
mailfromcity2        960
freighttocity2       858
freightfromcity2     836
year                   0
month                  0
dtype: int64

## 4) Data Cleaning & Standardization
- Ensure **Year/Month** are integers
- Create a proper **`date`** column (YYYY-MM-01)
- Upper-case cities
- Standardize numeric features with **z-score** (creates `*_scaled` columns) — **TODO:** not implemented in the cell below; no `*_scaled` columns are produced yet

In [8]:
# Cleaning pass over `df_proc`:
#   1. report any expected column that is absent,
#   2. cast year/month to nullable integers,
#   3. derive a first-of-month `date` column,
#   4. trim and upper-case the city names.

# Columns the steps below (and later feature cells) rely on.
expected = {'year','month','city1','city2','paxtocity2','paxfromcity2',
            'freighttocity2','freightfromcity2','mailtocity2','mailfromcity2'}

# Surface absent columns up front; the per-step guards below would otherwise
# skip their work silently. Best-effort: warn, do not abort.
missing_cols = sorted(expected - set(df_proc.columns))
if missing_cols:
    print('Warning: expected columns not found:', missing_cols)

# Year/Month as nullable integers ('Int64').
for col in ['year','month']:
    if col in df_proc.columns:
        df_proc[col] = pd.to_numeric(df_proc[col]).astype('Int64')

# Assemble the date from numeric components (day fixed to 1) rather than
# parsing hand-built strings such as '2018-5-01' with an unpadded month.
if {'year','month'} <= set(df_proc.columns):
    df_proc['date'] = pd.to_datetime(df_proc[['year','month']].assign(day=1))

# Trim and upper-case city names. Missing values are kept missing instead of
# being turned into the literal string 'NAN' by astype(str).
for col in ['city1','city2']:
    if col in df_proc.columns:
        present = df_proc[col].notna()
        cleaned = df_proc[col].astype(str).str.strip().str.upper()
        df_proc[col] = cleaned.where(present)

df_proc.head()

Unnamed: 0,year,month,city1,city2,paxtocity2,paxfromcity2,freighttocity2,freightfromcity2,mailtocity2,mailfromcity2,date
0,2021,12,ADAMPUR,AHMEDABAD,30.0,0.0,0.0,0.0,0.0,0.0,2021-12-01
1,2018,5,ADAMPUR,DELHI,2258.0,2190.0,0.0,0.86,0.0,0.0,2018-05-01
2,2018,6,ADAMPUR,DELHI,1984.0,1981.0,0.0,4.33,0.0,0.0,2018-06-01
3,2018,7,ADAMPUR,DELHI,2333.0,2205.0,0.0,1.93,0.0,0.0,2018-07-01
4,2018,8,ADAMPUR,DELHI,2146.0,1981.0,0.0,0.48,0.0,0.0,2018-08-01


## 5) Feature Extraction
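- total_passengers = PaxToCity2 + PaxFromCity2
- total_freight = FreightToCity2 + FreightFromCity2
- total_mail = MailToCity2 + MailFromCity2
- pax_growth_pct = month-over-month % change of total_passengers within each (City1, City2) route; 0 for a route's first month
- traffic_type = constant label `domestic`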

In [None]:

# Feature extraction on `df_proc`:
#   * total_passengers / total_freight / total_mail: sum of the to/from columns
#   * pax_growth_pct: month-over-month % change of total_passengers per route
#   * traffic_type: constant label for this dataset
# Each step is guarded so a missing input column skips only that feature.

if {'paxtocity2','paxfromcity2'} <= set(df_proc.columns):
    df_proc['total_passengers'] = df_proc['paxtocity2'] + df_proc['paxfromcity2']
if {'freighttocity2','freightfromcity2'} <= set(df_proc.columns):
    df_proc['total_freight'] = df_proc['freighttocity2'] + df_proc['freightfromcity2']
if {'mailtocity2','mailfromcity2'} <= set(df_proc.columns):
    df_proc['total_mail'] = df_proc['mailtocity2'] + df_proc['mailfromcity2']

if {'city1','city2','date','total_passengers'} <= set(df_proc.columns):
    # Chronological order within each route so shift(1) yields the previous month.
    df_proc = df_proc.sort_values(['city1','city2','date'])

    # Previous row's passenger total for the same (city1, city2) route;
    # NaN on the first row of every route.
    prev = df_proc.groupby(['city1','city2'])['total_passengers'].shift(1)

    # Percentage change versus the previous row of the route.
    growth = (df_proc['total_passengers'] - prev) / prev * 100

    # A route's first row (no predecessor) gets 0, matching the saved output.
    # NOTE(review): a zero previous total (division by zero -> inf/NaN) is also
    # mapped to 0 here; confirm that is the intended convention.
    df_proc['pax_growth_pct'] = growth.replace([np.inf, -np.inf], np.nan).fillna(0.0)

df_proc['traffic_type'] = 'domestic'
df_proc.head()

Unnamed: 0,year,month,city1,city2,paxtocity2,paxfromcity2,freighttocity2,freightfromcity2,mailtocity2,mailfromcity2,date,total_passengers,total_freight,total_mail,pax_growth_pct,traffic_type
0,2021,12,ADAMPUR,AHMEDABAD,30.0,0.0,0.0,0.0,0.0,0.0,2021-12-01,30.0,0.0,0.0,0.0,domestic
1,2018,5,ADAMPUR,DELHI,2258.0,2190.0,0.0,0.86,0.0,0.0,2018-05-01,4448.0,0.86,0.0,0.0,domestic
2,2018,6,ADAMPUR,DELHI,1984.0,1981.0,0.0,4.33,0.0,0.0,2018-06-01,3965.0,4.33,0.0,-10.858813,domestic
3,2018,7,ADAMPUR,DELHI,2333.0,2205.0,0.0,1.93,0.0,0.0,2018-07-01,4538.0,1.93,0.0,14.45145,domestic
4,2018,8,ADAMPUR,DELHI,2146.0,1981.0,0.0,0.48,0.0,0.0,2018-08-01,4127.0,0.48,0.0,-9.056853,domestic


In [14]:
# Write the processed frame to CSV (row index omitted), confirm the file name,
# and show the final (rows, columns) shape as the cell output.
OUTPUT_CSV = 'domestic_city_processed.csv'
df_proc.to_csv(OUTPUT_CSV, index=False)
print('Saved:', OUTPUT_CSV)
df_proc.shape

Saved: domestic_city_processed.csv


(56047, 16)

In [15]:
# Read the exported CSV back in to inspect what was written.
# NOTE(review): this rebinds `df`, which earlier held the raw input frame.
# NOTE(review): no parse_dates is passed, so `date` presumably comes back as
# plain strings rather than datetimes — confirm if later cells need dates.
df=pd.read_csv('domestic_city_processed.csv')
df

Unnamed: 0,year,month,city1,city2,paxtocity2,paxfromcity2,freighttocity2,freightfromcity2,mailtocity2,mailfromcity2,date,total_passengers,total_freight,total_mail,pax_growth_pct,traffic_type
0,2021,12,ADAMPUR,AHMEDABAD,30.0,0.0,0.00,0.00,0.0,0.00,2021-12-01,30.0,0.00,0.00,0.000000,domestic
1,2018,5,ADAMPUR,DELHI,2258.0,2190.0,0.00,0.86,0.0,0.00,2018-05-01,4448.0,0.86,0.00,0.000000,domestic
2,2018,6,ADAMPUR,DELHI,1984.0,1981.0,0.00,4.33,0.0,0.00,2018-06-01,3965.0,4.33,0.00,-10.858813,domestic
3,2018,7,ADAMPUR,DELHI,2333.0,2205.0,0.00,1.93,0.0,0.00,2018-07-01,4538.0,1.93,0.00,14.451450,domestic
4,2018,8,ADAMPUR,DELHI,2146.0,1981.0,0.00,0.48,0.0,0.00,2018-08-01,4127.0,0.48,0.00,-9.056853,domestic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56042,2025,2,VIJAYAWADA,VISAKHAPATNAM,5510.0,5304.0,0.00,0.12,0.0,0.80,2025-02-01,10814.0,0.12,0.80,-11.657544,domestic
56043,2025,3,VIJAYAWADA,VISAKHAPATNAM,6011.0,5003.0,0.01,0.18,0.0,1.99,2025-03-01,11014.0,0.19,1.99,1.849454,domestic
56044,2022,10,ZERO AIRPORT,DIBRUGARH,4.0,1.0,0.00,0.00,0.0,0.00,2022-10-01,5.0,0.00,0.00,0.000000,domestic
56045,2022,11,ZIRO,DIBRUGARH,4.0,8.0,0.00,0.00,0.0,0.00,2022-11-01,12.0,0.00,0.00,0.000000,domestic


In [11]:
# List the column labels of the current `df`.
# NOTE(review): this cell's execution count (11) is lower than the preceding
# cell's (15); re-run top to bottom to confirm the output is current.
df.columns

Index(['year', 'month', 'city1', 'city2', 'paxtocity2', 'paxfromcity2',
       'freighttocity2', 'freightfromcity2', 'mailtocity2', 'mailfromcity2',
       'date', 'total_passengers', 'total_freight', 'total_mail',
       'pax_growth_pct', 'traffic_type'],
      dtype='object')