#### Importing Libraries

In [19]:
import pandas as pd
import sklearn 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from feature_engine.datetime import Date
import joblib
import matplotlib.pyplot as plt

#### Display Settings

In [4]:
# Never truncate wide DataFrames: render every column when a frame is displayed.
pd.options.display.max_columns = None

In [6]:
# Have every transformer return a pandas DataFrame instead of a NumPy array.
sklearn.set_config(transform_output="pandas")

#### Import Data

The following datasets are the cleaned versions of the original datasets.

In [8]:
# Read the train / test / validation splits (in that order) from the project repository.
train, test, val = (
    pd.read_csv(url)
    for url in (
        "https://raw.githubusercontent.com/MisbahullahSheriff/sagemaker-flight-prices-prediction/master/data/train.csv",
        "https://raw.githubusercontent.com/MisbahullahSheriff/sagemaker-flight-prices-prediction/master/data/test.csv",
        "https://raw.githubusercontent.com/MisbahullahSheriff/sagemaker-flight-prices-prediction/master/data/val.csv",
    )
)

In [9]:
# Preview the first five rows of the training set.
train.head(n=5)

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-21,Mumbai,Hyderabad,10:20:00,11:50:00,90,0.0,In-flight meal not included,4995
1,Air India,2019-05-18,Delhi,Cochin,09:00:00,07:40:00,1360,1.0,No Info,8372
2,Air India,2019-06-12,Kolkata,Banglore,09:10:00,11:05:00,1555,2.0,No Info,6117
3,Vistara,2019-04-01,Kolkata,Banglore,20:20:00,22:55:00,1595,1.0,No Info,7770
4,Vistara,2019-06-06,Kolkata,Banglore,17:00:00,10:45:00,1065,1.0,No Info,9187


#### Split data

In [10]:
def split_data(df, target="price"):
    """Separate a dataset into its feature matrix and target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset holding the target column alongside the feature columns.
    target : str, default "price"
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the target column and ``y`` is
        an independent copy of the target column.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    X = df.drop(columns=target)
    # Bracket indexing works for any column name (attribute access does not);
    # the copy keeps later edits to y from writing back into df.
    y = df[target].copy()
    return X, y

In [11]:
# Training set: features and target
X_train, y_train = split_data(df=train)

In [12]:
# Test set: features and target
X_test, y_test = split_data(df=test)

In [13]:
# Validation set: features and target
X_val, y_val = split_data(df=val)

#### Meta-info

In [14]:
# Summarise the training features: column names, non-null counts, dtypes and memory usage.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 45.1+ KB


#### Data Preprocessing

In [16]:
# Numerical features
num_cols = ["duration", "total_stops"]

# Date / time features
dt_cols = ["date_of_journey", "dep_time", "arrival_time"]

# Categorical features: every remaining column, kept in X_train's column order
cat_cols = [
    name
    for name in X_train.columns
    if name not in (*dt_cols, *num_cols)
]

In [20]:
# Numerical features: fill missing values with the column median, then standardise.
# The first step is named "imputer" (previously misspelled "impiter") so that
# named_steps / set_params keys use the same name as the categorical pipeline.
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode into a dense output.
# handle_unknown="ignore" encodes a category that was not seen during fit as an
# all-zero row instead of raising at transform time, so validation / test data
# containing values absent from the training set can still be transformed.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

In [21]:
# ColumnTransformer applies a separate pipeline to each group of columns.
# NOTE(review): dt_cols are not listed below, so with remainder="drop" (the
# default, spelled out here) the date/time columns are discarded from the
# output -- confirm that this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),  # numerical columns -> num_pipe
        ("cat", cat_pipe, cat_cols),  # categorical columns -> cat_pipe
    ],
    remainder="drop",
)