# 1. Importing Modules

In [1]:
import pandas as pd
import numpy as np
import sklearn
import warnings
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from feature_engine.encoding import RareLabelEncoder

# 2. Setting the Config

In [2]:
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [3]:
# Ask scikit-learn transformers to return pandas DataFrames (with column names)
# from transform / fit_transform instead of bare NumPy arrays.
sklearn.set_config(transform_output="pandas")

In [4]:
# Suppress all warnings for the rest of the session.
# NOTE(review): no category is given, so this also hides deprecation and
# data-quality warnings from pandas / scikit-learn — consider narrowing it
# to the specific warning categories that are actually noisy.
warnings.filterwarnings('ignore')

# 3. Getting X and y from training data

In [5]:
# Load the training split from disk, then print its schema
# (column names, non-null counts and dtypes).
train_csv_path = '../data/train_data.csv'
train_df = pd.read_csv(train_csv_path)
train_df.info()

<class 'pandas.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          6695 non-null   str    
 1   date_of_journey  6695 non-null   str    
 2   source           6695 non-null   str    
 3   destination      6695 non-null   str    
 4   dep_time         6695 non-null   str    
 5   arrival_time     6695 non-null   str    
 6   duration         6695 non-null   int64  
 7   total_stops      6694 non-null   float64
 8   additional_info  6695 non-null   str    
 9   price            6695 non-null   int64  
dtypes: float64(1), int64(2), str(7)
memory usage: 523.2 KB


In [6]:
# Text columns that were read with the `str` dtype; cast them to plain `object`.
# NOTE(review): presumably needed by a downstream encoder that only recognises
# object-typed categoricals — confirm.
text_columns = [
    'airline',
    'date_of_journey',
    'source',
    'destination',
    'dep_time',
    'arrival_time',
    'additional_info',
]
train_df[text_columns] = train_df[text_columns].astype('object')

In [7]:
# Re-print the schema to confirm the text columns now report the `object` dtype.
train_df.info()

<class 'pandas.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          6695 non-null   object 
 1   date_of_journey  6695 non-null   object 
 2   source           6695 non-null   object 
 3   destination      6695 non-null   object 
 4   dep_time         6695 non-null   object 
 5   arrival_time     6695 non-null   object 
 6   duration         6695 non-null   int64  
 7   total_stops      6694 non-null   float64
 8   additional_info  6695 non-null   object 
 9   price            6695 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 523.2+ KB


In [8]:
# Separate the target (`price`) from the remaining feature columns.
y = train_df['price']
X = train_df.drop('price', axis=1)

# 4. Feature engineering

## 4.1. airline

In [9]:
# Preview the raw `airline` column before building its transformer.
X['airline']

0       Jet Airways
1       Jet Airways
2             Goair
3         Air India
4       Jet Airways
           ...     
6690    Jet Airways
6691      Air India
6692    Jet Airways
6693       Air Asia
6694      Air India
Name: airline, Length: 6695, dtype: object

In [24]:
# Preprocessing for the `airline` column: fill missing values with the most
# frequent category, then one-hot encode into dense columns. Categories unseen
# during fit are ignored at transform time rather than raising.
airline_steps = [
    ('simple_imputer', SimpleImputer(strategy='most_frequent')),
    # NOTE(review): the rare-label grouping step is currently disabled;
    # re-enable it or delete it once the decision is made.
    # ('group_rare_labels', RareLabelEncoder(tol=0.01, n_categories=3, replace_with='Other', missing_values='ignore')),
    ('one_hot_encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
]
airline_transformer = Pipeline(steps=airline_steps)

In [25]:
# Fit the airline pipeline on the single-column frame and display the encoded
# result (one indicator column per airline category).
airline_transformer.fit_transform(X[['airline']])

Unnamed: 0,airline_Air Asia,airline_Air India,airline_Goair,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Spicejet,airline_Trujet,airline_Vistara
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
6690,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6691,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6692,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6693,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Check whether the `airline` column is stored with the object dtype.
# is_object_dtype inspects a single dtype, so it must receive the Series (or its
# dtype): a one-column DataFrame such as X[['airline']] has no single `.dtype`
# and therefore always yields False. The public `pd.api.types` entry point is
# used instead of the private `pandas.core.dtypes.common` module.
pd.api.types.is_object_dtype(X['airline'])

True

In [19]:
# Double-bracket selection returns a one-column DataFrame, not a Series.
type(X[['airline']])

pandas.DataFrame