# Importing libraries


In [28]:
import sklearn 
import joblib
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt

# Display settings 


In [5]:
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

In [8]:
# scikit-learn transformers generally return NumPy arrays; this setting makes
# them return pandas DataFrames instead.
sklearn.set_config(transform_output="pandas")

# Get the data

In [13]:
# Read the three pre-split datasets; the names on the left unpack in the same
# order as the paths on the right (train, test, validation).
train_df, test_df, val_df = map(
    pd.read_csv,
    ("data/train.csv", "data/test.csv", "data/val.csv"),
)

In [14]:
# Peek at three randomly chosen training rows (no random_state is set, so the
# rows shown differ from run to run).
train_df.sample(3)

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
154,Spicejet,2019-06-06,Banglore,Delhi,05:55:00,08:35:00,160,0.0,No Info,3625
185,Jet Airways,2019-06-09,Banglore,Delhi,07:10:00,10:10:00,180,0.0,In-flight meal not included,7229
181,Vistara,2019-05-06,Kolkata,Banglore,17:00:00,20:20:00,1640,1.0,No Info,8610


### Split the data

In [15]:
def split_data(data, target="price"):
    """Split a DataFrame into a feature frame and a target series.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns together with the ``target`` column.
    target : str, default "price"
        Name of the column to separate out as the prediction target.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` is ``data`` without the ``target`` column and
        ``Y`` is an independent copy of that column.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    X = data.drop(columns=target)
    # Copy so that later edits to the target never write back into ``data``.
    Y = data[target].copy()
    return (X, Y)

In [16]:
# Separate the training features from the price target.
X_train,Y_train = split_data(train_df)

In [17]:
# Spot-check one random row of the training features (the price column should be gone).
X_train.sample()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info
605,Air India,2019-05-09,Delhi,Cochin,09:00:00,19:15:00,615,1.0,No Info


In [18]:
# Spot-check one random value of the training target.
Y_train.sample()

30    8610
Name: price, dtype: int64

In [19]:
# Separate the validation features from the price target.
X_val,Y_val = split_data(val_df)

In [20]:
# Separate the test features from the price target.
X_test,Y_test = split_data(test_df)

### Dataset metadata (column dtypes and non-null counts)

In [21]:
# Column dtypes and non-null counts for the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
 9   price            640 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 50.1+ KB


In [22]:
# Column dtypes and non-null counts for the test split.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          200 non-null    object 
 1   date_of_journey  200 non-null    object 
 2   source           200 non-null    object 
 3   destination      200 non-null    object 
 4   dep_time         200 non-null    object 
 5   arrival_time     200 non-null    object 
 6   duration         200 non-null    int64  
 7   total_stops      200 non-null    float64
 8   additional_info  200 non-null    object 
 9   price            200 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 15.8+ KB


In [23]:
# Column dtypes and non-null counts for the validation split.
val_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          160 non-null    object 
 1   date_of_journey  160 non-null    object 
 2   source           160 non-null    object 
 3   destination      160 non-null    object 
 4   dep_time         160 non-null    object 
 5   arrival_time     160 non-null    object 
 6   duration         160 non-null    int64  
 7   total_stops      160 non-null    float64
 8   additional_info  160 non-null    object 
 9   price            160 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 12.6+ KB


# Data Preprocessing 

In [24]:
# Group the feature columns by how they will be preprocessed: date/time
# columns, numeric columns, and everything else (treated as categorical).
dt_cols = ["date_of_journey", "dep_time", "arrival_time"]
num_cols = ["duration", "total_stops"]
cat_cols = [
    name
    for name in X_train.columns
    if name not in (*dt_cols, *num_cols)
]

In [25]:
# Show the columns that fell into the categorical group.
cat_cols

['airline', 'source', 'destination', 'additional_info']

In [None]:
f