<a href="https://colab.research.google.com/github/Suresh045/TNSDC/blob/main/Feature_Selection_regression_kbest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import pickle
import matplotlib.pyplot as plt


In [None]:
# Read the flight-price records; the relative path is resolved against the
# kernel's current working directory.
dataset = pd.read_csv(filepath_or_buffer="flightprice.csv")

# Sanity-check the load: leading rows, column labels, then (rows, columns).
print(dataset.head(n=5))
print("\nColumns:", dataset.columns)
print("\nShape:", dataset.shape)

   duration  days_left   airline source_city departure_time  stops  \
0       223          4    IndiGo     Kolkata      Afternoon      0   
1       249         29     GoAir       Delhi          Night      0   
2       119         17     GoAir       Delhi      Afternoon      0   
3       131         26    IndiGo      Mumbai        Evening      0   
4        86          3  SpiceJet       Delhi        Evening      0   

  arrival_time destination_city     class  price  
0    Afternoon        Bangalore   Economy  14087  
1      Morning          Kolkata   Economy   6582  
2        Night          Kolkata  Business  12654  
3      Evening        Hyderabad   Economy   8514  
4      Evening          Chennai  Business  11785  

Columns: Index(['duration', 'days_left', 'airline', 'source_city', 'departure_time',
       'stops', 'arrival_time', 'destination_city', 'class', 'price'],
      dtype='object')

Shape: (100, 10)


In [None]:
# Work on a copy so the raw `dataset` frame stays intact when cells are re-run.
df = dataset.copy()

# Text-valued columns that must be turned into integer codes before modelling.
categorical_cols = ['airline', 'source_city', 'departure_time',
                    'arrival_time', 'destination_city', 'class']

# Fit one LabelEncoder per column and keep every fitted encoder.
# Re-fitting a single shared encoder discards the category -> code mapping of
# all but the last column, which leaves no way to decode the integers or to
# encode new rows consistently when the trained model is reused later.
# NOTE(review): LabelEncoder numbers categories in sorted order, which imposes
# an arbitrary ordering on nominal features - confirm this is acceptable.
encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

print(df.head())


   duration  days_left  airline  source_city  departure_time  stops  \
0       223          4        2            4               0      0   
1       249         29        1            2               3      0   
2       119         17        1            2               0      0   
3       131         26        2            5               1      0   
4        86          3        3            2               1      0   

   arrival_time  destination_city  class  price  
0             0                 0      1  14087  
1             2                 4      1   6582  
2             3                 4      0  12654  
3             1                 3      1   8514  
4             1                 1      0  11785  


In [None]:
# Predictor columns, listed explicitly so their order is fixed.
feature_columns = ['duration', 'days_left', 'airline', 'source_city',
                   'departure_time', 'stops', 'arrival_time',
                   'destination_city', 'class']
indep_X = df.loc[:, feature_columns]

# Target kept as a single-column DataFrame (2-D) rather than a Series.
dep_Y = df.loc[:, ['price']]


In [None]:
def select_kbest_features(indep_X, dep_Y, k):
    """Keep the ``k`` predictors that score highest under ``f_regression``.

    The predictors are min-max scaled first, then scored against the target
    through ``SelectKBest``.

    Parameters
    ----------
    indep_X : DataFrame
        Predictors; ``indep_X.columns`` supplies the feature names.
    dep_Y : object exposing ``.values`` (e.g. a DataFrame)
        Target; flattened to 1-D before scoring.
    k : passed unchanged to ``SelectKBest``.

    Returns
    -------
    tuple
        ``(X_new, selected_columns)`` - the min-max-scaled values of the
        retained columns, and the list of their names in original order.

    NOTE(review): the scaler and the selector are fitted on every row passed
    in. If the caller splits into train/test afterwards, the held-out rows
    have already influenced both - consider fitting on the training rows only.
    """
    scaled_features = MinMaxScaler().fit_transform(indep_X)
    target = dep_Y.values.ravel()

    kbest = SelectKBest(score_func=f_regression, k=k)
    X_new = kbest.fit_transform(scaled_features, target)

    # Boolean mask aligned with indep_X.columns: True where the column was kept.
    keep_mask = kbest.get_support()
    selected_columns = [name for name, keep in zip(indep_X.columns, keep_mask) if keep]
    return X_new, selected_columns

# Keep the 5 best-scoring predictors; X_new holds their values as returned by
# select_kbest_features, selected_columns their names.
X_new, selected_columns = select_kbest_features(indep_X, dep_Y, k=5)
print("Selected Columns using SelectKBest:", selected_columns)


Selected Columns using SelectKBest: ['duration', 'airline', 'source_city', 'departure_time', 'stops']


In [None]:
def split_scalar(X, y, test_size=0.25, random_state=0):
    """Split into train/test sets and standardise the features.

    The ``StandardScaler`` is fitted on the training features only and then
    applied to the test features, so no test-set statistics reach training.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values; split alongside ``X`` and returned unscaled.
    test_size : float, default 0.25
        Fraction of rows held out for testing (forwarded to
        ``train_test_split``).
    random_state : int, default 0
        Seed for the shuffle, so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with both feature arrays
        standardised.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    # transform only: reuse the mean/variance learned from the training rows.
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test

# Split the selected features and the target into train/test sets; the
# features come back standardised, the targets unchanged.
X_train, X_test, y_train, y_test = split_scalar(X_new, dep_Y)


In [None]:
def r2_prediction(regressor, X_test, y_test):
    """Score a fitted regressor: R2 of its predictions on ``X_test`` vs ``y_test``."""
    predictions = regressor.predict(X_test)
    score = r2_score(y_test, predictions)
    return score

def Linear(X_train, y_train, X_test, y_test):
    """Fit a linear regression on the training split; return its test-split R2."""
    # fit() returns the estimator itself, so construction and fitting chain.
    model = LinearRegression().fit(X_train, y_train)
    return r2_prediction(model, X_test, y_test)

def Decision(X_train, y_train, X_test, y_test):
    """Fit a seeded decision-tree regressor; return its test-split R2."""
    # fit() returns the estimator itself, so construction and fitting chain.
    model = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
    return r2_prediction(model, X_test, y_test)

def Random(X_train, y_train, X_test, y_test):
    """Fit a 10-tree seeded random forest; return its test-split R2.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and scoring.
    y_train, y_test : array-like
        Targets. A single-column 2-D ``y_train`` (such as a one-column
        DataFrame) is flattened to 1-D before fitting.

    Returns
    -------
    float
        R2 of the forest's predictions on ``X_test`` against ``y_test``.
    """
    regressor = RandomForestRegressor(n_estimators=10, random_state=0)

    # RandomForestRegressor.fit emits a DataConversionWarning when handed a
    # column vector of shape (n, 1); pass (n,) instead. Genuinely multi-output
    # targets (more than one column) are left as they are.
    target = np.asarray(y_train)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    regressor.fit(X_train, target)
    return r2_prediction(regressor, X_test, y_test)

def XGBoost(X_train, y_train, X_test, y_test):
    """Fit an XGBoost regressor with fixed hyper-parameters; return its test-split R2."""
    # Hyper-parameters gathered in one place, then unpacked into the constructor.
    hyper_params = dict(n_jobs=5, learning_rate=0.1, max_depth=10, random_state=1)
    model = XGBRegressor(**hyper_params)
    model.fit(X_train, y_train)
    return r2_prediction(model, X_test, y_test)


In [None]:
# Train and score every candidate on the same split. The generator is consumed
# left to right during unpacking, so the models still run in this order.
r2_linear, r2_decision, r2_random, r2_xgb = (
    fit_and_score(X_train, y_train, X_test, y_test)
    for fit_and_score in (Linear, Decision, Random, XGBoost)
)

# Report the test-set R2 of each model.
print("\nR2 Values with SelectKBest:")
print(f"Linear Regression: {r2_linear}")
print(f"Decision Tree: {r2_decision}")
print(f"Random Forest: {r2_random}")
print(f"XGBoost: {r2_xgb}")


  return fit_method(estimator, *args, **kwargs)



R2 Values with SelectKBest:
Linear Regression: 0.013949837548421273
Decision Tree: -0.413233779836373
Random Forest: 0.12450025088115269
XGBoost: 0.029514074325561523


In [None]:
# Final model: a decision tree fitted on a fresh 70/30 split of the selected
# features.
# NOTE: y_train / y_test are rebound here to the 70/30 split, so they no longer
# line up with the X_train / X_test produced by split_scalar; re-run that cell
# before re-running the model comparison.
# NOTE(review): X_new was min-max scaled inside select_kbest_features and that
# scaler is not kept, so the saved model expects scaled inputs that cannot be
# reproduced for new data - confirm how inference is meant to work.
x_train, x_test, y_train, y_test = train_test_split(X_new, dep_Y, test_size=0.30, random_state=0)

# splitter='random' picks split thresholds at random; without a fixed
# random_state the reported R2 and the pickled model change on every run.
regressor_dt = DecisionTreeRegressor(criterion='squared_error', splitter='random', random_state=0)
regressor_dt.fit(x_train, y_train)

# Score on the held-out 30%.
y_pred = regressor_dt.predict(x_test)
r_score = r2_score(y_test, y_pred)
print(f"\nFinal Decision Tree R2 Score: {r_score}")



Final Decision Tree R2 Score: -1.8929022649616143


In [None]:
# Persist the fitted decision tree to disk.
Finalised_Model = "Finalized_model.sav"

# The context manager flushes and closes the file even if pickling raises;
# a bare open(...) passed straight to pickle.dump is never explicitly closed.
with open(Finalised_Model, 'wb') as model_file:
    pickle.dump(regressor_dt, model_file)

print("Final model saved as Finalized_model.sav")


Final model saved as Finalized_model.sav
