# Importing Libraries

In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Importing training data
# The dataset folder can be overridden through the FLIGHT_PRICE_DATA_DIR
# environment variable so the notebook can run on another machine; when the
# variable is not set, the original local folder is used.
data_dir = os.environ.get("FLIGHT_PRICE_DATA_DIR", r"D:\Nikss\Projects\Flight Price Prediction\Datasets")
path = os.path.join(data_dir, "train_data.csv")
df = pd.read_csv(path)

In [4]:
df.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Spicejet,2019-05-01,Banglore,Delhi,21:10:00,00:05:00,175,0.0,No Info,4991
1,Vistara,2019-06-01,Chennai,Kolkata,17:45:00,20:05:00,140,0.0,No Info,11982
2,Jet Airways,2019-06-09,Delhi,Cochin,17:30:00,12:35:00,1145,1.0,In-flight meal not included,10262
3,Air India,2019-05-18,Kolkata,Banglore,12:00:00,18:30:00,1830,2.0,No Info,10361
4,Jet Airways,2019-03-24,Mumbai,Hyderabad,15:50:00,17:20:00,90,0.0,In-flight meal not included,2228


In [5]:
df.dtypes

airline             object
date_of_journey     object
source              object
destination         object
dep_time            object
arrival_time        object
duration             int64
total_stops        float64
additional_info     object
price                int64
dtype: object

# 2. Fixing data types
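* Convert `date_of_journey` to `datetime64`, and `dep_time` and `arrival_time` to `datetime.time` values
* Convert `total_stops` to a categorical dtype, as it is a categorical column (deferred until training, since it may be needed in its current form during EDA)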

In [6]:
# Converting dtypes of some columns to datetime

# convert function
def convert_datetime(column_name, format = None, time = False, frame = None):
    """Parse a column into datetime values, overwriting it in place.

    Parameters
    ----------
    column_name : str
        Name of the column to convert.
    format : str, optional
        ``strftime``-style pattern forwarded to ``pd.to_datetime``. When
        ``None``, pandas infers the format.
    time : bool, default False
        If True, keep only the time-of-day part (``.dt.time``); otherwise
        keep the full parsed datetime values.
    frame : pandas.DataFrame, optional
        DataFrame to modify. When omitted, the notebook-level ``df`` is used,
        so existing calls keep working unchanged.

    Returns
    -------
    None
        The column is replaced on the target frame.

    Notes
    -----
    Parsing uses ``errors='coerce'``: values that cannot be parsed become
    ``NaT`` instead of raising.
    """
    # Only fall back to the global frame when the caller did not pass one.
    target = df if frame is None else frame

    parsed = pd.to_datetime(target[column_name], errors = 'coerce', format = format)
    target[column_name] = parsed.dt.time if time else parsed


# df['date_of_journey'] = pd.to_datetime(df['date_of_journey'], errors='coerce')

# Journey date: let pandas infer the format and keep the full datetime values.
convert_datetime('date_of_journey')
# Departure / arrival: parse as HH:MM:SS and keep only the time of day.
# NOTE(review): re-running this cell after the conversion would pass
# datetime.time objects (not strings) to the parser; presumably they would be
# coerced to NaT -- confirm before re-running without reloading the data.
convert_datetime('dep_time', format="%H:%M:%S", time=True)
convert_datetime('arrival_time', format="%H:%M:%S", time=True)

# Converting total_stops to a categorical column is deferred for now:
# df['total_stops'] = df['total_stops'].astype('category')

# This will be done at training time, because the column may be needed in its current form during EDA.

In [7]:
df.dtypes

# dep_time and arrival_time now hold datetime.time values, but their dtype is still reported as object because that is how pandas stores them.

airline                    object
date_of_journey    datetime64[ns]
source                     object
destination                object
dep_time                   object
arrival_time               object
duration                    int64
total_stops               float64
additional_info            object
price                       int64
dtype: object

In [8]:
df['dep_time'][0]

# An individual value is shown as a datetime.time object

datetime.time(21, 10)

In [9]:
df[['date_of_journey', 'dep_time', 'arrival_time']]

Unnamed: 0,date_of_journey,dep_time,arrival_time
0,2019-05-01,21:10:00,00:05:00
1,2019-06-01,17:45:00,20:05:00
2,2019-06-09,17:30:00,12:35:00
3,2019-05-18,12:00:00,18:30:00
4,2019-03-24,15:50:00,17:20:00
...,...,...,...
7318,2019-05-15,10:20:00,17:35:00
7319,2019-05-21,10:35:00,21:00:00
7320,2019-05-01,20:25:00,10:55:00
7321,2019-06-03,07:00:00,21:00:00


# 3. Detailed Analysis
* Summary
* Univariate Plots
* Bivariate Plots (w.r.t Target Variables)
* Hypothesis Testing
* Multivariate Plots
* Inspect missing values and extreme values in depth
    * Filter for necessary subsets
    * Inspect values of other features
* Note observations

In [10]:
df.columns

Index(['airline', 'date_of_journey', 'source', 'destination', 'dep_time',
       'arrival_time', 'duration', 'total_stops', 'additional_info', 'price'],
      dtype='object')

In [14]:
def display_html(size=3, content="content"):
    """Render ``content`` as an HTML heading in the cell output.

    Parameters
    ----------
    size : int, default 3
        Heading level; used as the ``N`` in the ``<hN>`` tag.
    content : str, default "content"
        Text placed inside the heading tag. It is inserted as-is, without
        HTML escaping.
    """
    # Imported locally because this helper is defined in a cell that comes
    # before the notebook's IPython.display import; it should not depend on
    # which cells happen to have been executed already.
    from IPython.display import HTML, display

    display(HTML(f"<h{size}>{content}</h{size}>"))

In [27]:
# Airline

from IPython.display import display, HTML

def cat_summary(df, column):
    """Display a quick overview of a single column of ``df``.

    Shows the column name as a heading, the column itself, and then prints
    the number and percentage of missing values, the dtype, and the number
    of distinct non-null values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column.
    column : str
        Name of the column to summarise.

    Returns
    -------
    None
        Everything is written to the cell output.
    """
    data = df[column]

    # title
    display_html(2, column)
    display_html(3, 'Quick Glance:')
    display(data)

    # meta data
    display_html(3, 'Meta Data:')
    missing_values = data.isnull().sum()
    total_values = len(df)
    # Guard against an empty frame: dividing by zero rows would report "nan%".
    if total_values:
        missing_percentage = ((missing_values / total_values) * 100).round(2)
    else:
        missing_percentage = 0.0
    dtype = data.dtype
    unique_values = data.nunique(dropna = True)

    print(f"{'Missing Data':15}: {missing_values} rows ({missing_percentage}%)")
    print(f"{'Data type':15}: {dtype}")
    print(f"{'Unique Values':15}: {unique_values} unique values")

In [28]:
cat_summary(df, 'airline')

0                Spicejet
1                 Vistara
2             Jet Airways
3               Air India
4             Jet Airways
              ...        
7318          Jet Airways
7319               Indigo
7320          Jet Airways
7321    Multiple Carriers
7322          Jet Airways
Name: airline, Length: 7323, dtype: object

Missing Data   : 0 rows (0.0%)
Data type      : object
Unique Values  : 9 unique values


In [16]:
df['airline'].value_counts(normalize=True)

airline
Jet Airways          0.353271
Indigo               0.195139
Air India            0.162365
Multiple Carriers    0.116755
Spicejet             0.077427
Vistara              0.046839
Air Asia             0.030042
Goair                0.018025
Trujet               0.000137
Name: proportion, dtype: float64

80.0