In [1]:
!pip install feature-engine




In [2]:
!pip install optuna




In [3]:
!pip install xgboost




In [4]:
import os

# import boto3

import pickle

import warnings

import numpy as np

import pandas as pd

import optuna

from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

import xgboost as xgb

from xgboost import XGBRegressor

import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

from sklearn.model_selection import RandomizedSearchCV




### 2. Display Settings

In [5]:
# Remove the column limit so wide DataFrames are displayed without truncation.
pd.set_option('display.max_columns',None)

In [6]:
sklearn.set_config(transform_output='pandas')
# By default sklearn transformers return NumPy arrays even when the input is a
# pandas DataFrame; this setting makes them return pandas DataFrames instead,
# so column names survive every step of the pipelines below.

In [7]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

### 3. Read the Data

In [8]:
# Folder holding train.csv and val.csv. Set the PREDICTION_DATA_DIR environment
# variable to run the notebook on another machine; the fallback is the folder
# the notebook was originally written against.
DATA_DIR = os.environ.get(
    "PREDICTION_DATA_DIR",
    r"C:/Users/shash/OneDrive/Desktop/prediction_project/data",
)
path = f"{DATA_DIR}/train.csv"
path1 = f"{DATA_DIR}/val.csv"

# Training and validation splits; later cells read a `price` column from both.
train = pd.read_csv(path)
val = pd.read_csv(path1)

In [9]:
# Separate the target (price) from the predictor columns of the training split.
# The target is copied so it is independent of the `train` frame.
y_train = train['price'].copy()
X_train = train.drop(columns=['price'])

### 4. Preprocessing Operations

In [10]:
# airline: fill gaps with the most frequent value, pool every carrier that
# covers less than 10% of the rows into a single 'Other' label, then one-hot
# encode. n_categories=2 is the minimum number of distinct categories the
# column must have before any pooling is attempted.
air_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('Grouper', RareLabelEncoder(tol=0.1, replace_with='Other', n_categories=2)),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# date_of_journey: the raw column is left as object dtype on purpose —
# DatetimeFeatures parses it itself — and the extracted calendar parts are
# then rescaled with MinMaxScaler.
feature_to_extract = ['month', 'week', 'day_of_week', 'day_of_year']
doj_transformer = Pipeline(steps=[
    ('dt', DatetimeFeatures(
        features_to_extract=feature_to_extract,
        yearfirst=True,
        format='mixed',
    )),
    ('scaler', MinMaxScaler()),
])

# source / destination: pool rare cities into 'other', replace each label with
# a target-based mean encoding, then apply a power transform.
location_pipe1 = Pipeline(steps=[
    ('grouper', RareLabelEncoder(tol=0.1, replace_with='other', n_categories=2)),
    ('encoding', MeanEncoder()),
    ('scaler', PowerTransformer()),
])

# dep_time / arrival_time, numeric view: hour and minute, min-max scaled.
time_pp1 = Pipeline(steps=[
    ('dt', DatetimeFeatures(features_to_extract=['hour', 'minute'])),
    ('scaler', MinMaxScaler()),
])

def part_of_day(X,morning=4,noon=12,eve=16,night=20):
    """Replace every timestamp column of ``X`` with a part-of-day label.

    Each column is parsed with ``pd.to_datetime`` and reduced to its hour.
    The hour is then bucketed into half-open windows:

    * ``[morning, noon)`` -> ``'morning'``
    * ``[noon, eve)``     -> ``'afternoon'``
    * ``[eve, night)``    -> ``'evening'``
    * anything else       -> ``'night'``

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are all parseable as datetimes.
    morning, noon, eve, night : int
        Hour boundaries of the windows described above.

    Returns
    -------
    pandas.DataFrame
        One ``<column>_part_of_day`` column per input column; the input
        columns themselves are dropped. ``X`` is not modified.
    """
    time_cols = X.columns.to_list()

    # Reduce every column to its hour of day.
    hours = X.assign(**{
        name: pd.to_datetime(X.loc[:, name]).dt.hour
        for name in time_cols
    })

    def _bucket(hour):
        # Windows are closed on the left only, so a boundary hour belongs to
        # the window that starts at it.
        windows = [
            hour.between(morning, noon, inclusive='left'),
            hour.between(noon, eve, inclusive='left'),
            hour.between(eve, night, inclusive='left'),
        ]
        return np.select(windows, ['morning', 'afternoon', 'evening'], default='night')

    labels = {
        f"{name}_part_of_day": _bucket(hours.loc[:, name])
        for name in time_cols
    }
    return hours.assign(**labels).drop(columns=time_cols)
# dep_time / arrival_time, categorical view: bucket each timestamp into a
# part-of-day label, then encode the labels with CountFrequencyEncoder.
time_pipe2 = Pipeline(steps=[
    ('part1', FunctionTransformer(func=part_of_day)),
    ('encoder', CountFrequencyEncoder()),
])

# Concatenate the numeric (hour/minute) and categorical (part-of-day) views.
time_transformer = FeatureUnion(transformer_list=[
    ('part1', time_pp1),
    ('part2', time_pipe2),
])

class RBFPercentileSimilarity(BaseEstimator,TransformerMixin):
    """Encode numeric columns as RBF similarity to their own percentiles.

    ``fit`` records, for every selected column, the values found at the
    requested percentiles. ``transform`` then produces one feature per
    (column, percentile) pair containing
    ``exp(-gamma * (x - reference) ** 2)`` as computed by
    ``sklearn.metrics.pairwise.rbf_kernel``.

    Parameters
    ----------
    variables : list of str or None, default=None
        Columns to encode. When None or empty, every numeric column of the
        frame passed to ``fit`` is used.
    percentiles : list of float, default=[0.25, 0.5, 0.75]
        Quantiles (in ``[0, 1]``) used as reference points.
    gamma : float, default=0.1
        Width parameter forwarded to ``rbf_kernel``.

    Attributes
    ----------
    variables_ : list of str
        Columns resolved during ``fit``.
    reference_values_ : dict of str -> ndarray of shape (n_percentiles, 1)
        Reference values per column.
    """
    def __init__(self,variables=None,percentiles=[0.25,0.5,0.75],gamma=0.1):
        # The default list is only ever read, never mutated, so sharing it
        # between instances is harmless.
        self.variables=variables
        self.percentiles=percentiles
        self.gamma=gamma

    def fit(self,X,y=None):
        """Learn the percentile reference values of each selected column.

        Returns ``self``.
        """
        # Resolve the columns into a fitted attribute instead of overwriting
        # the constructor parameter: this keeps get_params()/clone() faithful
        # and lets a refit on different data re-detect the numeric columns.
        if self.variables:
            self.variables_ = list(self.variables)
        else:
            self.variables_ = X.select_dtypes(include='number').columns.to_list()
        self.reference_values_ = {
            col: (
                X
                .loc[:,col]
                .quantile(self.percentiles)
                .values
                .reshape(-1,1)
                )
                for col in self.variables_
        }
        return self

    def transform(self,X):
        """Return the RBF similarity features for ``X``.

        The result keeps the index of ``X`` and has one
        ``<column>_rdf_<percentile>`` column per (column, percentile) pair.
        """
        objects=[]
        for col in self.variables_:
            # round() before int(): 0.29 * 100 is 28.999..., which int()
            # alone would truncate to 28.
            columns = [
                f"{col}_rdf_{int(round(percentile*100))}"
                for percentile in self.percentiles
            ]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:,[col]],Y=self.reference_values_[col],gamma=self.gamma),
                columns=columns,
                # Keep the caller's index so the output lines up row-for-row
                # with sibling transformers when the results are concatenated.
                index=X.index
            )
            objects.append(obj)
        return pd.concat(objects,axis=1)

# duration, branch 1: RBF similarity to the column's percentiles, followed by
# a power transform.
duration_pipe1=Pipeline([
    ('rbf',RBFPercentileSimilarity()),
    ('scaler',PowerTransformer())
])

# Branch 1 side by side with the plainly standardised duration.
duration_union=FeatureUnion([
    ('PART1',duration_pipe1),
    ('part2',StandardScaler())
])

# Impute BEFORE capping outliers: feature-engine's Winsorizer raises on
# missing values by default, so with the imputer placed after it a single NaN
# would abort the pipeline before it could ever be filled. On data without
# missing values the imputer is a no-op and the output is unchanged.
duration_transformer=Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('outliers',Winsorizer(capping_method='iqr',fold=1.5)),
    ('union',duration_union)
])

# Route each group of raw columns to its dedicated transformer; columns that
# are not listed here are passed through untouched.
column_transformer = ColumnTransformer(
    transformers=[
        ('air', air_transformer, ['airline']),
        ('doj', doj_transformer, ['date_of_journey']),
        ('pip1', location_pipe1, ['source', 'destination']),
        ('time', time_transformer, ['dep_time', 'arrival_time']),
        ('dur', duration_transformer, ['duration']),
    ],
    remainder='passthrough',
)

# Feature selector: scores features one at a time with a small random forest
# (r2) and applies a 0.1 threshold to that score.
estimator = RandomForestRegressor(random_state=42, max_depth=3, n_estimators=10)

selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    threshold=0.1,
    scoring="r2",
)

# Full preprocessing: column-wise transforms followed by feature selection.
preprocessor = Pipeline(steps=[
    ("ct", column_transformer),
    ("selector", selector),
])

In [11]:
# Fit on the feature/target split built right after the data was loaded
# (X_train / y_train) instead of recomputing the same split inline.
preprocessor.fit(X_train, y_train)

In [12]:
# Apply the fitted preprocessor to both splits. The training features reuse
# X_train rather than dropping `price` from `train` a second time.
X_train_pre = preprocessor.transform(X_train)
y_train = train.price
X_val_pre = preprocessor.transform(val.drop(columns="price"))
y_val = val.price

In [13]:
X_val_pre

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,pip1__source,pip1__destination,dur__duration_rdf_25,dur__duration,remainder__total_stops,remainder__additional_info
0,0.0,1.0,0.0,0.705882,0.686441,1.105390,1.107091,-0.378668,0.273609,1.0,In-flight meal not included
1,0.0,1.0,0.0,0.823529,0.847458,1.105390,1.107091,-0.378668,1.890363,1.0,In-flight meal not included
2,0.0,1.0,0.0,0.058824,0.042373,-0.777036,-0.820204,-0.378668,1.365443,1.0,No Info
3,0.0,1.0,0.0,0.823529,0.847458,-0.777036,-1.867911,-0.376017,-0.849721,0.0,In-flight meal not included
4,0.0,1.0,0.0,0.588235,0.559322,-0.187547,-0.188090,-0.378668,1.186970,1.0,No Info
...,...,...,...,...,...,...,...,...,...,...,...
155,0.0,1.0,0.0,0.000000,0.000000,-0.777036,-0.820204,-0.378668,-0.030845,1.0,No Info
156,0.0,0.0,1.0,0.941176,0.923729,-0.187547,-0.188090,-0.378668,-0.912711,0.0,No check-in baggage included
157,0.0,0.0,0.0,0.823529,0.847458,-0.187547,-0.188090,-0.378668,0.609558,2.0,No Info
158,1.0,0.0,0.0,0.235294,0.220339,1.105390,1.107091,-0.378668,-0.597759,1.0,No Info


In [14]:
# Remove the passed-through `additional_info` column from the model input.
# NOTE(review): the X_val_pre preview shows this column still holds raw text,
# which is presumably why it is dropped before fitting — confirm.
X_train_pre1=X_train_pre.drop(columns='remainder__additional_info')

In [19]:


# Base estimator whose hyperparameters are tuned by the search below.
model = XGBRegressor(random_state=42)

# Candidate values per hyperparameter. The search samples combinations from
# these lists; it does not try all of them.
param_grid = {
    'learning_rate': np.linspace(0.05, 0.2, 4),
    'max_depth': np.arange(3, 6),
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

# 50 sampled candidates, 3-fold CV, scored by negative MSE.
# random_state pins which candidates are sampled, so a Restart & Run All
# evaluates the same combinations and reports the same best parameters.
random_search = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

# Tune on the already preprocessed training features.
random_search.fit(X_train_pre1, y_train)

# Best estimator found by the search.
best_model = random_search.best_estimator_
print(f"Best parameters found: {random_search.best_params_}")


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters found: {'subsample': 0.8, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.9}


In [20]:
# Train the final model with the best parameters
# NOTE(review): the search was created without refit=False, and by default
# RandomizedSearchCV already refits best_estimator_ on the full training data,
# so this call presumably repeats that fit — confirm and drop if so.
best_model.fit(X_train_pre1, y_train)


In [30]:
# Same column drop as for the training features, so the validation frame has
# the columns the model was fitted on.
X_val_pre1=X_val_pre.drop(columns='remainder__additional_info')

In [35]:


# Score the tuned model on the held-out validation split.
y_pred = best_model.predict(X_val_pre1)

# R^2 plus RMSE (the square root of the mean squared error, so it is in the
# same unit as the target).
r2 = r2_score(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print(f"RMSE: {rmse}")
print(f"R^2: {r2}")


RMSE: 2296.428553666751
R^2: 0.745334804058075


In [37]:
X_val_pre1.shape

(160, 10)

In [54]:
X_val_pre1


Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,pip1__source,pip1__destination,dur__duration_rdf_25,dur__duration,remainder__total_stops
0,0.0,1.0,0.0,0.705882,0.686441,1.105390,1.107091,-0.378668,0.273609,1.0
1,0.0,1.0,0.0,0.823529,0.847458,1.105390,1.107091,-0.378668,1.890363,1.0
2,0.0,1.0,0.0,0.058824,0.042373,-0.777036,-0.820204,-0.378668,1.365443,1.0
3,0.0,1.0,0.0,0.823529,0.847458,-0.777036,-1.867911,-0.376017,-0.849721,0.0
4,0.0,1.0,0.0,0.588235,0.559322,-0.187547,-0.188090,-0.378668,1.186970,1.0
...,...,...,...,...,...,...,...,...,...,...
155,0.0,1.0,0.0,0.000000,0.000000,-0.777036,-0.820204,-0.378668,-0.030845,1.0
156,0.0,0.0,1.0,0.941176,0.923729,-0.187547,-0.188090,-0.378668,-0.912711,0.0
157,0.0,0.0,0.0,0.823529,0.847458,-0.187547,-0.188090,-0.378668,0.609558,2.0
158,1.0,0.0,0.0,0.235294,0.220339,1.105390,1.107091,-0.378668,-0.597759,1.0


In [52]:
y_val

0      12239
1      10577
2      16736
3       7229
4      13584
       ...  
155    26890
156     3841
157    11201
158     6827
159     3873
Name: price, Length: 160, dtype: int64

In [42]:
# Hand-typed feature vectors with 10 values each, used for ad-hoc predictions.
# XX is 1-D (its shape is printed as (10,) in the next cell); XY wraps its
# values in an extra list, making it a single-row 2-D array.
XX=np.array([0,1,0,0.705883,0.686444,1.105391,1.107091,-0.3788666,0.272309,1])
XY=np.array([[0.,01.0,0.0,0.705882,0.686441,1.105390,1.107091,-0.378668,0.273609,1.0]])

In [39]:
XX.shape

(10,)

In [55]:
# Sanity-check the tuned model on one hand-typed row of preprocessed features.
# XX is an arbitrary single-row probe (kept for ad-hoc experiments); XX1
# repeats the values shown for row 2 in the X_val_pre1 preview.
XX = np.array([[0, 2, 0, 0.705883, 1.686444, 1.105391, 1.107091, 1.3788666, 0.272309, 1]])
XX1 = np.array([[0.0,1.0,0.0,0.058824,0.042373,-0.777036,-0.820204,-0.378668,1.365443,1.0]])

# Label the row with the training column names so each value is matched to its
# feature by name rather than by a hand-maintained position.
y_pred = best_model.predict(pd.DataFrame(XX1, columns=X_train_pre1.columns))

# Print or use y_pred as needed
print(y_pred)

[16427.209]
