This Jupyter notebook is for testing the code in pipeline.py.

In [1]:
#Importing essential libraries
import pandas as pd
from datetime import datetime
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import dill

#Dropping unnecessary columns from the dataset
def drop_unimportant(df : pd.DataFrame) -> pd.DataFrame :
    """Return a new frame without the ``id`` and ``airline`` columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it must contain both ``id`` and ``airline``.

    Returns
    -------
    pd.DataFrame
        A new frame with the two columns removed. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If either column is missing from ``df``.
    """
    unused_columns = ['id', 'airline']
    return df.drop(labels = unused_columns, axis = 'columns')

#Basic feature changer
def change_features(df : pd.DataFrame) -> pd.DataFrame :
    """Recode the ``flight`` and ``stops`` columns without touching the input.

    * ``flight`` is reduced to its first two characters (e.g. ``'UK-810'`` -> ``'UK'``).
    * ``stops`` is mapped to an integer: ``'zero'`` -> 0, ``'one'`` -> 1 and
      every other value -> 2.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing string-valued ``flight`` and ``stops`` columns.

    Returns
    -------
    pd.DataFrame
        A new frame with both columns recoded. The caller's frame is left
        unchanged, so the function can safely be applied to the same raw frame
        more than once.
    """
    stop_codes = {'zero' : 0, 'one' : 1}
    # `assign` builds a new frame, so the caller's data is never overwritten.
    return df.assign(
        flight = df['flight'].str[:2],
        # Anything that is not 'zero' or 'one' falls back to 2.
        stops = df['stops'].map(lambda label : stop_codes.get(label, 2)),
    )

In [2]:
print('Flight cost predictor pipeline !')

# Load the training data (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer = './data/train_data.csv', sep = ",")

# Split the features from the target. `price` is selected with double brackets
# so that it stays a one-column DataFrame: StandardScaler expects 2-D input.
X = df.drop(columns = 'price')
y = df[['price']]

# Standardise the target. `y_scaled` has shape (n_samples, 1); keep `scale_up`
# around, because its inverse_transform is what maps scaled values back to prices.
scale_up = StandardScaler()
y_scaled = scale_up.fit_transform(X = y)

Flight cost predictor pipeline !


In [3]:
# Display the feature frame (all columns of `df` except `price`).
X

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
0,1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21
1,2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7
2,3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5
3,4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28
4,5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.00,4
...,...,...,...,...,...,...,...,...,...,...,...
19995,19996,Indigo,6E-6178,Bangalore,Night,one,Early_Morning,Mumbai,Economy,7.92,45
19996,19997,AirAsia,I5-582,Kolkata,Morning,one,Afternoon,Delhi,Economy,5.83,24
19997,19998,Vistara,UK-832,Chennai,Early_Morning,two_or_more,Evening,Bangalore,Economy,35.33,17
19998,19999,Vistara,UK-996,Mumbai,Evening,one,Morning,Bangalore,Economy,16.33,21


In [35]:
#Feature engineering
# Column groups routed to the branches of the ColumnTransformer below.
ohe_cols = ['source_city', 'departure_time', 'arrival_time', 'destination_city', 'class', 'flight']
std_scaler = ['duration', 'days_left']  # names of the numeric columns to standardise (a list, not a scaler object)
stop_cols = ['stops']

# Stage 1 works on the whole frame: drop `id`/`airline`, then recode `flight` and `stops`.
first_feature_engineering = Pipeline(steps = [
    ('drop_cols', FunctionTransformer(drop_unimportant)),
    ('change_features', FunctionTransformer(change_features))
])
# Stage 2 branches, one per column group.
numerical_transformer = Pipeline(steps = [
    ('scaler', StandardScaler())
])
ohe_transformation = Pipeline(steps = [
    # Categories not seen during fit are ignored instead of raising at predict time.
    ('ohe', OneHotEncoder(handle_unknown = 'ignore'))
])
remaining_transformation = Pipeline(steps = [
        # FunctionTransformer with its default func=None is the identity transform,
        # so `stops` passes through unchanged without an anonymous lambda in the pipeline.
        ('remaining_features', FunctionTransformer())
])
# Only the columns listed here reach the model.
column_transformer = ColumnTransformer(transformers=[
    ('numerical', numerical_transformer, std_scaler),
    ('ohe_transformation', ohe_transformation, ohe_cols),
    ('remaining_features', remaining_transformation, stop_cols)
])
# Complete preprocessing: frame-level clean-up followed by per-column transforms.
preprocessor = Pipeline(steps = [
    ('feature_change', first_feature_engineering),
    ('column_transformer', column_transformer)
])

In [36]:
# Random forest regressor; the fixed random_state seeds the estimator.
model = RandomForestRegressor(
    n_estimators = 100,
    min_samples_split = 20,
    min_samples_leaf = 8,
    bootstrap = True,
    random_state = 1,
)

# End-to-end estimator: preprocessing followed by the regressor.
pipe = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('regressor', model),
])

In [37]:
#Fitting the pipeline on the whole training dataset
# `y_scaled` comes out of StandardScaler as a column vector of shape (n_samples, 1).
# Flatten it so the regressor receives the 1-D target it expects, instead of
# relying on it to convert a column vector itself.
pipe.fit(X = X, y = y_scaled.ravel())

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [38]:
# Predict on the same frame the pipeline was fitted on, so the numbers below
# are training-set scores, measured on the standardised target.
pred = pipe.predict(X = X)

train_mae = mean_absolute_error(y_true = y_scaled, y_pred = pred)
train_mse = mean_squared_error(y_true = y_scaled, y_pred = pred)
train_r2 = r2_score(y_true = y_scaled, y_pred = pred)

print(f"Mean absolute error of Random Forest Regressor algorithm in train dataset is {train_mae}")
print(f"Mean squared error of Random Forest Regressor algorithm in train dataset is {train_mse}")
print(f"R2 score of Random Forest Regressor algorithm in train dataset is {train_r2}")

Mean absolute error of Random Forest Regressor algorithm in train dataset is 0.08368097873696786
Mean squared error of Random Forest Regressor algorithm in train dataset is 0.02374439349701673
R2 score of Random Forest Regressor algorithm in train dataset is 0.9762556065029833


In [41]:
# Persist the fitted pipeline together with descriptive metadata.
model_filename = './models/flight_cost.pkl'
model_bundle = {
    'model' : pipe,
    # The pipeline was trained on the standardised price, so the fitted target
    # scaler is stored as well: scale_up.inverse_transform turns predictions
    # back into prices.
    'target_scaler' : scale_up,
    'metadata' : {
        # NOTE(review): this name does not match the flight-cost task of this notebook; confirm it is intended.
        'name' : 'User action predictor',
        'author' : 'Umidjon Sattorov',
        'version' : 1,
        'date' : datetime.now(),
        'type' : type(pipe.named_steps['regressor']).__name__,
        # R2 computed on the training data (same data the model was fitted on).
        'accuracy' : r2_score(y_true = y_scaled, y_pred = pred)
    }
}
# The context manager closes (and flushes) the file even if serialisation fails.
with open(model_filename, 'wb') as model_file:
    dill.dump(model_bundle, model_file)

print(f'Model is saved as {model_filename} in models directory')

Model is saved as ./models/flight_cost.pkl in models directory
