In [1]:
import pandas as pd
import numpy as np
import os 
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw flight-price CSV; the relative path is resolved against the current working directory.
df = pd.read_csv("../Data/flight_price.csv")

In [3]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [4]:
def maxMinLength(df, column):
    """Print and return the longest and shortest value lengths in a column.

    Parameters
    ----------
    df : DataFrame-like
        Any object for which ``df[column]`` yields an iterable of sized
        values (e.g. strings).
    column : hashable
        Label of the column to inspect.

    Returns
    -------
    tuple of (int, int)
        ``(maximum length, minimum length)``. An empty column yields
        ``(0, 0)``.

    Raises
    ------
    TypeError
        If a value does not support ``len()`` (for example a float NaN).
    """
    lengths = [len(value) for value in df[column]]

    # Derive both bounds from the data itself. Seeding the minimum with a
    # fixed 100 would under-report it whenever every value is longer than that.
    maxi = max(lengths, default=0)
    mini = min(lengths, default=0)

    print("MAXIMUM LENGTH :-", maxi)
    print("MINIMUM LENGTH :-", mini)
    return maxi, mini

def toFindUniqueLengthValueCount(df, column):
    """Count how many values of a column share each string length.

    Every value is converted with ``str()`` before measuring, so non-string
    entries are counted by the length of their text representation.

    Parameters
    ----------
    df : DataFrame-like
        Any object for which ``df[column]`` yields an iterable of values.
    column : hashable
        Label of the column to inspect.

    Returns
    -------
    dict
        Maps each observed length to the number of values having it, in
        order of first appearance.
    """
    length_counts = {}
    for value in df[column]:
        size = len(str(value))
        length_counts[size] = length_counts.get(size, 0) + 1
    return length_counts

def toPrintUniqueValue(df, columns):
    """Print the first value encountered for each distinct length in a column.

    Values that do not support ``len()`` (e.g. numbers or NaN) are printed
    as-is, each time they occur.

    Parameters
    ----------
    df : DataFrame-like
        Any object for which ``df[columns]`` yields an iterable of values.
    columns : hashable
        Label of the column to inspect.

    Returns
    -------
    None
        Output is written to stdout only.
    """
    seen_lengths = set()
    for value in df[columns]:
        try:
            size = len(value)
        except TypeError:
            # Only "has no len()" is expected here; anything else is a real
            # error and must not be swallowed by a bare except.
            print(value)
            continue

        if size not in seen_lengths:
            print(size, "->", value)
            seen_lengths.add(size)

In [5]:
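# METHOD CHAINING
# The chained pandas methods below return new objects; they do not modify the original data.
# A dictionary comprehension builds the keyword arguments that are passed to `.assign`.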


def convertToMinutes(ser):
    """Convert duration strings such as ``"2h 50m"`` to total minutes.

    Accepts an hours part, a minutes part, or both: ``"2h 50m"``, ``"19h"``
    and ``"5m"`` are all valid.

    Parameters
    ----------
    ser : pandas.Series
        String series of durations.

    Returns
    -------
    pandas.Series
        Integer total minutes, on the same index as ``ser``.

    Raises
    ------
    ValueError
        If any value contains neither an hours nor a minutes part
        (including missing values).
    """
    # One regex with two optional named groups copes with hour-only and
    # minute-only values, and with series where no value contains a space.
    parts = ser.str.extract(
        r"^\s*(?:(?P<hour>\d+)\s*h)?\s*(?:(?P<minute>\d+)\s*m)?\s*$"
    )

    # Fail loudly rather than silently mapping unparseable input to 0.
    unparsed = parts.isna().all(axis=1)
    if unparsed.any():
        raise ValueError(
            f"Cannot parse duration value(s): {ser[unparsed].unique().tolist()!r}"
        )

    return (
        parts
        .fillna("0")  # a missing part contributes zero
        .astype(int)
        .assign(hour=lambda df_: df_.hour.mul(60))
        .sum(axis=1)
    )

def cleanData(df, drop_index=(6474,)):
    """Return a cleaned copy of the raw flight-price frame.

    The input frame is not modified. The pipeline:

    * drops the row labels in ``drop_index`` and exact duplicate rows,
    * strips surrounding whitespace from every object-dtype column,
    * lower-cases the column names,
    * splits ``airline`` into a title-cased carrier name and a ``class_``
      column (``"Premium economy"``, ``"Business"``, otherwise ``"Economy"``),
    * parses ``date_of_journey`` (day first) and reduces ``dep_time`` /
      ``arrival_time`` to ``datetime.time`` values (any date suffix on the
      arrival time is discarded),
    * converts ``duration`` to minutes via ``convertToMinutes``,
    * converts ``total_stops`` labels to numbers,
    * normalises ``"No info"`` to ``"No Info"`` in ``additional_info``,
    * drops the ``route`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with the columns shown in the notebook preview.
    drop_index : iterable of index labels, default ``(6474,)``
        Row labels removed before cleaning. Labels that are absent are
        ignored. NOTE(review): 6474 is presumably a known malformed record
        in this dataset; confirm against the raw file.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, with ``class_`` appended as the last column.

    Raises
    ------
    ValueError
        If a ``total_stops`` label is not recognised.
    """
    # Captures the carrier name and an optional trailing fare class.
    airline_pattern = r'^(.*?)\s*(Premium economy|Business)?$'

    return (
        df
        .drop(index=list(drop_index), errors="ignore")
        .drop_duplicates()
        .pipe(lambda df_: df_.assign(**{
            col: df_[col].str.strip()
            for col in df_.select_dtypes(include='O').columns
        }))
        .rename(columns=str.lower)
        .assign(
            # class_ must be derived before airline is overwritten: assign
            # evaluates keywords in order and later lambdas see earlier results.
            class_=lambda df_: (
                df_.airline
                .str.extract(airline_pattern)[1]
                .fillna("Economy")
            ),
            airline=lambda df_: (
                df_.airline
                .str.extract(airline_pattern)[0]
                .str.title()
            ),
            date_of_journey=lambda df_: pd.to_datetime(df_.date_of_journey, dayfirst=True),
            # An explicit format avoids per-element format inference.
            dep_time=lambda df_: pd.to_datetime(df_.dep_time, format="%H:%M").dt.time,
            arrival_time=lambda df_: pd.to_datetime(
                df_.arrival_time.str.split(" ", n=1).str[0],  # drop e.g. " 22 Mar"
                format="%H:%M",
            ).dt.time,
            duration=lambda df_: convertToMinutes(df_.duration),
            total_stops=lambda df_: (
                df_.total_stops
                .str.replace("non-stop", "0", regex=False)
                .str.replace(r"^(\d+) stops?$", r"\1", regex=True)
                # Unrecognised labels stay as text, so to_numeric raises.
                .pipe(pd.to_numeric)
            ),
            additional_info=lambda df_: df_.additional_info.replace("No info", "No Info"),
        )
        .drop(columns='route')
    )



In [6]:
# Display the cleaned frame to inspect the result of the pipeline.
cleanData(df)

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price,class_
0,Indigo,2019-03-24,Banglore,New Delhi,22:20:00,01:10:00,170,0.0,No Info,3897,Economy
1,Air India,2019-05-01,Kolkata,Banglore,05:50:00,13:15:00,445,2.0,No Info,7662,Economy
2,Jet Airways,2019-06-09,Delhi,Cochin,09:25:00,04:25:00,1140,2.0,No Info,13882,Economy
3,Indigo,2019-05-12,Kolkata,Banglore,18:05:00,23:30:00,325,1.0,No Info,6218,Economy
4,Indigo,2019-03-01,Banglore,New Delhi,16:50:00,21:35:00,285,1.0,No Info,13302,Economy
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,2019-04-09,Kolkata,Banglore,19:55:00,22:25:00,150,0.0,No Info,4107,Economy
10679,Air India,2019-04-27,Kolkata,Banglore,20:45:00,23:20:00,155,0.0,No Info,4145,Economy
10680,Jet Airways,2019-04-27,Banglore,Delhi,08:20:00,11:20:00,180,0.0,No Info,7229,Economy
10681,Vistara,2019-03-01,Banglore,New Delhi,11:30:00,14:10:00,160,0.0,No Info,12648,Economy


In [7]:
# Keep the cleaned frame under its own name; `df` is not reassigned here.
cleanedData = cleanData(df)