In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt

In [11]:
# Read the preprocessed CSV file
dataset = pd.read_csv('flightprice.csv')

# Inspect the DataFrame
print(dataset.head())

   duration  days_left  price  airline  source_city  departure_time  stops  \
0      2.17          1   5953        4            0               3      0   
1      2.33          1   5953        4            0               0      0   
2      2.17          1   5956        1            0               0      0   
3      2.25          1   5955        5            0               1      0   
4      2.33          1   5955        5            0               1      0   

   arrival_time  destination_city  class  
0             4                 5      0  
1             1                 5      0  
2             0                 5      0  
3             2                 5      0  
4             1                 5      0  


In [12]:
dataset.columns

Index(['duration', 'days_left', 'price', 'airline', 'source_city',
       'departure_time', 'stops', 'arrival_time', 'destination_city', 'class'],
      dtype='object')

In [13]:
indep_X=dataset[['duration', 'days_left', 'airline', 'source_city',
       'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']]

In [14]:
dep_Y=dataset[['price']]

# Feature Selection

# Recursive Feature Elimination

In [2]:
def split_scalar(indep_X,dep_Y):
        X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)    
        return X_train, X_test, y_train, y_test

In [3]:
def r2_prediction(regressor,X_test,y_test):
    y_pred = regressor.predict(X_test)
    from sklearn.metrics import r2_score
    r2=r2_score(y_test,y_pred)
    return r2

In [4]:
def Linear(X_train,y_train,X_test):       
        from sklearn.linear_model import LinearRegression
        regressor = LinearRegression()
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2 

In [5]:
def Decision(X_train,y_train,X_test):
        from sklearn.tree import DecisionTreeRegressor
        regressor = DecisionTreeRegressor(random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2 

In [6]:
def random(X_train,y_train,X_test):       
        from sklearn.ensemble import RandomForestRegressor
        regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [7]:
def xgboost(X_train,y_train,X_test):       
        from xgboost import XGBRegressor
        regressor = XGBRegressor(n_jobs=5,learning_rate=0.1,max_depth=10,random_state=1)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return r2

In [15]:
def rfeFeature(indep_X, dep_Y, n):
    rfelist = []
    colnames_list = []  
    # List to store column names for each model
    r2_values = []  
    # List to store R2 values for each model

    from sklearn.linear_model import LinearRegression
    lin = LinearRegression()

    from sklearn.tree import DecisionTreeRegressor
    dec = DecisionTreeRegressor(random_state=0)

    from sklearn.ensemble import RandomForestRegressor
    rf = RandomForestRegressor(n_estimators=10, random_state=0)

    from xgboost import XGBRegressor
    xgb = XGBRegressor(n_jobs=5, learning_rate=0.1, max_depth=10, random_state=1)

    rfemodellist = [lin, dec, rf, xgb]

    for model in rfemodellist:
        log_rfe = RFE(estimator=model, n_features_to_select=n)
        log_fit = log_rfe.fit(indep_X, dep_Y)
        log_rfe_feature = log_fit.transform(indep_X)
        rfelist.append(log_rfe_feature)

        # Get the column names selected by RFE
        selected_columns = [col for col, selected in zip(indep_X.columns, log_rfe.support_) if selected]
        colnames_list.append(selected_columns)

        # Fit the model and calculate and store the R2 value
        X_train, X_test, y_train, y_test = split_scalar(pd.DataFrame(log_rfe_feature), dep_Y)
        model.fit(X_train, y_train)  # Fit the model
        r2 = r2_prediction(model, X_test, y_test)
        r2_values.append(r2)

    return rfelist, colnames_list, r2_values

# Call the function with your data
rfelist, colnames_list, r2_values = rfeFeature(indep_X, dep_Y, 5)

# Print the selected column names and R2 values for each model
for model_name, selected_columns, r2_value in zip(["Linear", "Decision", "Random", "XGBoost"], colnames_list, r2_values):
    print(f"Model: {model_name}")
    print("Selected Columns:", selected_columns)
    print(f"R2 Value: {r2_value}\n")

  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  self.estimator_.fit(X[:, features], y, **fit_params)


Model: Linear
Selected Columns: ['airline', 'source_city', 'stops', 'destination_city', 'class']
R2 Value: 0.9014411925799668

Model: Decision
Selected Columns: ['duration', 'days_left', 'airline', 'source_city', 'class']
R2 Value: 0.9434892102116892

Model: Random
Selected Columns: ['duration', 'days_left', 'airline', 'source_city', 'class']
R2 Value: 0.9575298504156051

Model: XGBoost
Selected Columns: ['duration', 'airline', 'source_city', 'stops', 'class']
R2 Value: 0.9590624618748581



# Model Creation

In [16]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(indep_X, dep_Y, test_size=0.30, random_state=0)
from sklearn.tree import DecisionTreeRegressor
regressor_dt=DecisionTreeRegressor(criterion='squared_error', splitter='random')
regressor_dt=regressor_dt.fit(x_train,y_train)

In [17]:
y_pred=regressor_dt.predict(x_test)

In [18]:
from sklearn.metrics import r2_score
r_score=r2_score(y_test,y_pred)

In [19]:
r_score

0.9726866831886678

In [20]:
from sklearn.ensemble import GradientBoostingRegressor
regressor_gbr = GradientBoostingRegressor(n_estimators=500,max_depth=4,min_samples_split=5,learning_rate=0.01,loss="squared_error")
regressor_gbr.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(learning_rate=0.01, max_depth=4, min_samples_split=5,
                          n_estimators=500)

In [21]:
y_pred=regressor_gbr.predict(x_test)

In [22]:
from sklearn.metrics import r2_score
r_score=r2_score(y_test,y_pred)
r_score

0.9541339069207602

In [23]:
from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor(n_estimators=100,random_state=0)
regressor_rf.fit(x_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(random_state=0)

In [24]:
y_pred=regressor_rf.predict(x_test)

In [25]:
r_score

0.9541339069207602

In [26]:
# When comparing, Decision Tree algorithm gives maximum accuracy of 97%

In [27]:
import pickle
Finalised_Model="Finalized_model.sav"

In [28]:
pickle.dump(regressor_dt,open(Finalised_Model,'wb'))