<a href="https://colab.research.google.com/github/Suresh045/TNSDC/blob/main/Feature_Selection_classification__rfe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import RFE

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier


In [None]:
# Step 2: Load dataset
# The CSV is read from the current working directory.
DATA_FILE = "flightprice.csv"
dataset = pd.read_csv(DATA_FILE)

# Preview dataset: first rows, then the column index (one per line)
print(dataset.head(), dataset.columns, sep="\n")


   duration  days_left   airline source_city departure_time  stops  \
0       223          4    IndiGo     Kolkata      Afternoon      0   
1       249         29     GoAir       Delhi          Night      0   
2       119         17     GoAir       Delhi      Afternoon      0   
3       131         26    IndiGo      Mumbai        Evening      0   
4        86          3  SpiceJet       Delhi        Evening      0   

  arrival_time destination_city     class  price  
0    Afternoon        Bangalore   Economy  14087  
1      Morning          Kolkata   Economy   6582  
2        Night          Kolkata  Business  12654  
3      Evening        Hyderabad   Economy   8514  
4      Evening          Chennai  Business  11785  
Index(['duration', 'days_left', 'airline', 'source_city', 'departure_time',
       'stops', 'arrival_time', 'destination_city', 'class', 'price'],
      dtype='object')


In [None]:
# Step 3: Define Independent (X) and Dependent (Y)

# Features: every column except the target "class" and the "price" column,
# listed explicitly so the column order is fixed.
feature_columns = [
    'duration', 'days_left', 'airline', 'source_city',
    'departure_time', 'stops', 'arrival_time', 'destination_city',
]
indep_X = dataset[feature_columns]

# Target: the flight class label to predict (Economy / Business)
dep_Y = dataset['class']


In [None]:
# One-hot encode the categorical feature columns; drop_first=True drops the
# first level of each encoded column so k categories become k-1 indicators.
indep_X_encoded = pd.get_dummies(data=indep_X, drop_first=True)

# Show the first rows of the encoded frame
encoded_preview = indep_X_encoded.head()
display(encoded_preview)

Unnamed: 0,duration,days_left,stops,airline_GoAir,airline_IndiGo,airline_SpiceJet,airline_Vistara,source_city_Chennai,source_city_Delhi,source_city_Hyderabad,...,departure_time_Morning,departure_time_Night,arrival_time_Evening,arrival_time_Morning,arrival_time_Night,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai
0,223,4,0,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,249,29,0,True,False,False,False,False,True,False,...,False,True,False,True,False,False,False,False,True,False
2,119,17,0,True,False,False,False,False,True,False,...,False,False,False,False,True,False,False,False,True,False
3,131,26,0,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False
4,86,3,0,False,False,True,False,False,True,False,...,False,False,True,False,False,True,False,False,False,False


In [None]:
# Step 4: Encode Target Variable
# The classifiers below are given integer labels instead of the raw strings.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
# LabelEncoder numbers the classes in sorted order: 'Business' -> 0, 'Economy' -> 1
dep_Y_encoded = label_encoder.fit_transform(dep_Y)

# Step 5: One-hot encode categorical features
# The categorical columns are named explicitly; the numeric columns
# (duration, days_left, stops) pass through unchanged.
categorical_columns = [
    'airline',
    'source_city',
    'departure_time',
    'arrival_time',
    'destination_city',
]
indep_X_encoded = pd.get_dummies(
    indep_X,
    columns=categorical_columns,
    drop_first=True,
)

# Step 6: Feature Selection using RFE
def rfeFeature(indep_X, dep_Y, n, test_size=0.25, random_state=0):
    """Select `n` features per classifier with RFE and score each on held-out rows.

    The data is split into train/test *before* feature selection. RFE and the
    final model are both fitted on the training rows only, so the reported
    accuracy is measured on rows that influenced neither the selected columns
    nor the fitted model (no selection leakage).

    Parameters
    ----------
    indep_X : pandas.DataFrame
        Already-encoded feature matrix. Its column labels are used to report
        which features were selected.
    dep_Y : array-like
        Encoded class labels, one per row of `indep_X`.
    n : int
        Number of features RFE should keep for each classifier.
    test_size : float, default 0.25
        Fraction of rows held out for scoring.
    random_state : int, default 0
        Seed for the train/test split.

    Returns
    -------
    colnames_list : list of list
        Selected column names, one list per classifier.
    acc_values : list of float
        Hold-out accuracy per classifier.

    Both lists follow the classifier order: LogisticRegression,
    DecisionTreeClassifier, RandomForestClassifier, XGBClassifier.
    """
    classifiers = [
        LogisticRegression(max_iter=500),
        DecisionTreeClassifier(random_state=0),
        RandomForestClassifier(n_estimators=50, random_state=0),
        # `use_label_encoder` is omitted: XGBoost reports it as an unused
        # parameter and only emits a warning on every fit.
        XGBClassifier(eval_metric='mlogloss', random_state=1),
    ]

    # Split once, up front. The same seed yields the same row partition for
    # every classifier, so their accuracies are comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        indep_X, dep_Y, test_size=test_size, random_state=random_state
    )

    colnames_list = []
    acc_values = []

    for model in classifiers:
        # Rank/eliminate features using the training rows only.
        selector = RFE(estimator=model, n_features_to_select=n)
        selector.fit(X_train, y_train)
        selected_columns = list(X_train.columns[selector.support_])
        colnames_list.append(selected_columns)

        # Refit on the selected training columns and score on the untouched test rows.
        model.fit(X_train[selected_columns], y_train)
        y_pred = model.predict(X_test[selected_columns])
        acc_values.append(accuracy_score(y_test, y_pred))

    return colnames_list, acc_values

# Step 7: Run RFE for top 5 features (with encoded features & target)
N_SELECTED_FEATURES = 5
colnames_list, acc_values = rfeFeature(indep_X_encoded, dep_Y_encoded, N_SELECTED_FEATURES)

# Step 8: Print results
# The labels follow the order of the classifiers inside rfeFeature.
rfe_model_labels = ["Logistic", "DecisionTree", "RandomForest", "XGBoost"]
rfe_results = zip(rfe_model_labels, colnames_list, acc_values)
for label, chosen_features, accuracy in rfe_results:
    print(f"Model: {label}")
    print("Selected Features:", chosen_features)
    print(f"Accuracy: {accuracy}\n")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Model: Logistic
Selected Features: ['stops', 'airline_Vistara', 'source_city_Chennai', 'source_city_Mumbai', 'destination_city_Mumbai']
Accuracy: 0.56

Model: DecisionTree
Selected Features: ['duration', 'days_left', 'stops', 'source_city_Mumbai', 'destination_city_Kolkata']
Accuracy: 0.6

Model: RandomForest
Selected Features: ['duration', 'days_left', 'stops', 'airline_GoAir', 'destination_city_Kolkata']
Accuracy: 0.4

Model: XGBoost
Selected Features: ['airline_IndiGo', 'source_city_Hyderabad', 'source_city_Mumbai', 'departure_time_Night', 'destination_city_Chennai']
Accuracy: 0.48



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Step 6: Train-Test Split
# Hold out 30% of the encoded rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    indep_X_encoded,
    dep_Y_encoded,
    test_size=0.30,
    random_state=0,
)

# Scaling (good practice for Logistic/Boosting models)
# The scaler learns mean/variance from the training rows only and the same
# transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Step 7: Train multiple classification models

def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model` on (X_fit, y_fit); return (predictions on X_eval, accuracy vs y_eval)."""
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return predictions, accuracy_score(y_eval, predictions)

# Each model keeps its own notebook-level names (<name>_clf, y_pred_<name>,
# acc_<name>) because later cells refer to them.

# Logistic Regression
log_clf = LogisticRegression(max_iter=500)
y_pred_log, acc_log = _fit_and_score(log_clf, X_train, y_train, X_test, y_test)

# Decision Tree
dt_clf = DecisionTreeClassifier(random_state=0)
y_pred_dt, acc_dt = _fit_and_score(dt_clf, X_train, y_train, X_test, y_test)

# Random Forest
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
y_pred_rf, acc_rf = _fit_and_score(rf_clf, X_train, y_train, X_test, y_test)

# Gradient Boosting
gbr_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=0)
y_pred_gbr, acc_gbr = _fit_and_score(gbr_clf, X_train, y_train, X_test, y_test)

# XGBoost
# `use_label_encoder` is omitted: XGBoost reports it as an unused parameter
# and only emits a warning when fitting.
# NOTE(review): 'mlogloss' is the multiclass log-loss metric while the target
# here has two classes; 'logloss' may be the intended metric — confirm.
xgb_clf = XGBClassifier(eval_metric='mlogloss', random_state=1)
y_pred_xgb, acc_xgb = _fit_and_score(xgb_clf, X_train, y_train, X_test, y_test)

print("Accuracy Scores:")
print(f"Logistic Regression: {acc_log}")
print(f"Decision Tree: {acc_dt}")
print(f"Random Forest: {acc_rf}")
print(f"Gradient Boosting: {acc_gbr}")
print(f"XGBoost: {acc_xgb}")


Accuracy Scores:
Logistic Regression: 0.4
Decision Tree: 0.6
Random Forest: 0.4666666666666667
Gradient Boosting: 0.43333333333333335
XGBoost: 0.4666666666666667


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Step 8: Model Evaluation with Classification Report
def evaluate_model(y_test, y_pred, model_name):
    """Print an evaluation summary for one model's test-set predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with `y_test`.
    model_name : str
        Label shown in the section header.

    Prints a header followed by the accuracy, the classification report and
    the confusion matrix. Returns nothing.
    """
    print(f"\n📊 {model_name} Evaluation")

    # Each metric is computed immediately before it is printed, in this order.
    sections = (
        ("Accuracy:", accuracy_score),
        ("Classification Report:\n", classification_report),
        ("Confusion Matrix:\n", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading, metric(y_test, y_pred))

# Report every trained model against the same held-out labels, in training order.
evaluation_runs = (
    (y_pred_log, "Logistic Regression"),
    (y_pred_dt, "Decision Tree"),
    (y_pred_rf, "Random Forest"),
    (y_pred_gbr, "Gradient Boosting"),
    (y_pred_xgb, "XGBoost"),
)
for model_predictions, display_name in evaluation_runs:
    evaluate_model(y_test, model_predictions, display_name)



📊 Logistic Regression Evaluation
Accuracy: 0.4
Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.64      0.50        14
           1       0.38      0.19      0.25        16

    accuracy                           0.40        30
   macro avg       0.39      0.42      0.38        30
weighted avg       0.39      0.40      0.37        30

Confusion Matrix:
 [[ 9  5]
 [13  3]]

📊 Decision Tree Evaluation
Accuracy: 0.6
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.71      0.62        14
           1       0.67      0.50      0.57        16

    accuracy                           0.60        30
   macro avg       0.61      0.61      0.60        30
weighted avg       0.61      0.60      0.60        30

Confusion Matrix:
 [[10  4]
 [ 8  8]]

📊 Random Forest Evaluation
Accuracy: 0.4666666666666667
Classification Report:
               precision    recall  f1-score   suppo

In [None]:
# Step 9: Save a trained model (Random Forest as example)
# The model saved here is hard-coded; it is not chosen by comparing the
# accuracy scores computed above.
# NOTE(review): rf_clf was fitted on StandardScaler-transformed, one-hot
# encoded features; only the classifier is saved, so the fitted `scaler` and
# the encoded column order must be reproduced before calling predict on new data.
Finalised_Model = "Finalized_classifier.sav"

# The context manager closes the file handle even if pickling fails.
with open(Finalised_Model, 'wb') as model_file:
    pickle.dump(rf_clf, model_file)

print("✅ Classification model saved as Finalized_classifier.sav")


✅ Classification model saved as Finalized_classifier.sav
