<a href="https://colab.research.google.com/github/Suresh045/TNSDC/blob/main/Feature_Selection_classification_kbest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier


In [None]:
# Step 2: Load dataset
# The CSV path is relative, so it is resolved against the kernel's working directory.
dataset = pd.read_csv("flightprice.csv")

# Preview dataset: first five rows, then the column index, one per line
print(dataset.head(), dataset.columns, sep="\n")


   duration  days_left   airline source_city departure_time  stops  \
0       223          4    IndiGo     Kolkata      Afternoon      0   
1       249         29     GoAir       Delhi          Night      0   
2       119         17     GoAir       Delhi      Afternoon      0   
3       131         26    IndiGo      Mumbai        Evening      0   
4        86          3  SpiceJet       Delhi        Evening      0   

  arrival_time destination_city     class  price  
0    Afternoon        Bangalore   Economy  14087  
1      Morning          Kolkata   Economy   6582  
2        Night          Kolkata  Business  12654  
3      Evening        Hyderabad   Economy   8514  
4      Evening          Chennai  Business  11785  
Index(['duration', 'days_left', 'airline', 'source_city', 'departure_time',
       'stops', 'arrival_time', 'destination_city', 'class', 'price'],
      dtype='object')


In [None]:
# Step 3: Define Independent (X) and Dependent (Y)
# Predictors are picked by name; 'class' (the target) and 'price' are left out.
indep_X = dataset.loc[:, ['duration', 'days_left', 'airline', 'source_city',
                          'departure_time', 'stops', 'arrival_time', 'destination_city']]

# Target variable (class: Economy / Business)
dep_Y = dataset.loc[:, 'class']


In [None]:
# Step 4: Encode target as integers
# LabelEncoder numbers the labels in sorted order, so with the two labels seen
# in the preview this gives Business -> 0 and Economy -> 1
# (label_encoder.classes_ holds the actual mapping).
label_encoder = LabelEncoder().fit(dep_Y)
dep_Y_encoded = label_encoder.transform(dep_Y)

# One-hot encode categorical features; drop_first=True drops the first level
# of each column so the dummies are not perfectly collinear.
indep_X_encoded = pd.get_dummies(
    indep_X,
    columns=['airline', 'source_city', 'departure_time',
             'arrival_time', 'destination_city'],
    drop_first=True,
)

# Show only the first ten encoded column names to keep the output short
print("Encoded Feature Columns:", indep_X_encoded.columns[:10].tolist(), "...")


Encoded Feature Columns: ['duration', 'days_left', 'stops', 'airline_GoAir', 'airline_IndiGo', 'airline_SpiceJet', 'airline_Vistara', 'source_city_Chennai', 'source_city_Delhi', 'source_city_Hyderabad'] ...


In [None]:
# Step 5: Feature Selection using SelectKBest (top 5 features)
def selectKBest_features(indep_X, dep_Y, k):
    """Keep the ``k`` highest-scoring features according to the ANOVA F-test.

    Fits ``SelectKBest(f_classif)`` on the given data, prints the names of the
    selected columns and a score table, and returns the reduced feature matrix.

    Parameters
    ----------
    indep_X : pandas.DataFrame
        Feature matrix. Must expose ``.columns`` so the selected feature
        names can be reported.
    dep_Y : array-like
        Class labels, one per row of ``indep_X``.
    k : int or "all"
        Number of features to keep; passed unchanged to ``SelectKBest``.

    Returns
    -------
    X_new
        Result of ``selector.transform(indep_X)`` containing only the
        selected columns.
    selected_columns : pandas.Index
        Names of the selected columns, in their original column order.
    """
    selector = SelectKBest(score_func=f_classif, k=k)
    # Fit the selector and reduce the dataset to the selected features
    X_new = selector.fit_transform(indep_X, dep_Y)

    selected_columns = indep_X.columns[selector.get_support()]

    print("🎯 Top", k, "selected features (SelectKBest):")
    print(selected_columns.tolist())
    print("\nFeature Scores:")
    feature_scores = pd.DataFrame({
        "Feature": indep_X.columns,
        "Score": selector.scores_
    }).sort_values(by="Score", ascending=False)
    # Size the preview by the number of features actually kept: head(k) would
    # raise for k="all", which SelectKBest accepts.
    print(feature_scores.head(len(selected_columns)))

    return X_new, selected_columns

# Run SelectKBest on the one-hot encoded features, keeping the top 5
X_new, selected_features = selectKBest_features(
    indep_X=indep_X_encoded,
    dep_Y=dep_Y_encoded,
    k=5,
)


🎯 Top 5 selected features (SelectKBest):
['duration', 'days_left', 'stops', 'source_city_Delhi', 'destination_city_Mumbai']

Feature Scores:
                    Feature     Score
2                     stops  3.851276
1                 days_left  3.223006
0                  duration  1.523757
22  destination_city_Mumbai  1.409482
8         source_city_Delhi  1.319681


In [None]:
# Step 6: Train-Test Split & Scaling
# NOTE(review): X_new comes from a selector that was fit on all rows before
# this split, so the held-out rows already influenced feature selection.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    dep_Y_encoded,
    test_size=0.3,   # 30% of the rows are held out for evaluation
    random_state=0,  # fixed seed so the split is reproducible
)

# Learn mean/variance from the training rows only, then apply to both splits
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [None]:
# Step 7: Train Classification Models
# Each model is fit on the scaled training split and then used to predict the
# held-out split; the y_pred_* arrays are consumed by the evaluation step.

# Logistic Regression
log_clf = LogisticRegression(max_iter=500)
log_clf.fit(X_train, y_train)
y_pred_log = log_clf.predict(X_test)

# Decision Tree
dt_clf = DecisionTreeClassifier(random_state=0)
dt_clf.fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_test)

# Random Forest
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

# Gradient Boosting
gbr_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=0)
gbr_clf.fit(X_train, y_train)
y_pred_gbr = gbr_clf.predict(X_test)

# XGBoost
# `use_label_encoder` is no longer passed: the recorded run reports it as an
# unused parameter, and the target is already integer-encoded.
# 'logloss' is the binary log-loss metric ('mlogloss' is the multiclass one).
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=1)
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Step 8: Evaluate Models
def evaluate_model(y_test, y_pred, model_name):
    """Print accuracy, classification report and confusion matrix for a model.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluation rows.
    y_pred : array-like
        Labels predicted for the same rows.
    model_name : str
        Name shown in the header line of the printed report.
    """
    print(f"\n📊 {model_name} Evaluation")

    # (label, metric) pairs, printed in this order
    report_sections = (
        ("Accuracy:", accuracy_score),
        ("Classification Report:\n", classification_report),
        ("Confusion Matrix:\n", confusion_matrix),
    )
    for label, metric in report_sections:
        print(label, metric(y_test, y_pred))

# Report every trained model against the same held-out labels, in training order
for model_name, y_pred in (
    ("Logistic Regression", y_pred_log),
    ("Decision Tree", y_pred_dt),
    ("Random Forest", y_pred_rf),
    ("Gradient Boosting", y_pred_gbr),
    ("XGBoost", y_pred_xgb),
):
    evaluate_model(y_test, y_pred, model_name)



📊 Logistic Regression Evaluation
Accuracy: 0.5666666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.86      0.65        14
           1       0.71      0.31      0.43        16

    accuracy                           0.57        30
   macro avg       0.62      0.58      0.54        30
weighted avg       0.62      0.57      0.53        30

Confusion Matrix:
 [[12  2]
 [11  5]]

📊 Decision Tree Evaluation
Accuracy: 0.6
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.64      0.60        14
           1       0.64      0.56      0.60        16

    accuracy                           0.60        30
   macro avg       0.60      0.60      0.60        30
weighted avg       0.61      0.60      0.60        30

Confusion Matrix:
 [[9 5]
 [7 9]]

📊 Random Forest Evaluation
Accuracy: 0.5666666666666667
Classification Report:
               precision    recall  f1-sc

In [None]:
# Step 9: Save a trained model (example: Random Forest)
# NOTE(review): only the classifier is persisted here; the fitted `scaler` and
# the list of selected features are not written to disk by this cell.
Finalised_Model = "Finalized_classifier_selectkbest.sav"

# The context manager closes the file handle even if pickling fails;
# the previous bare open() call never closed it.
with open(Finalised_Model, 'wb') as model_file:
    pickle.dump(rf_clf, model_file)

print(f"✅ Classification model saved as {Finalised_Model}")


✅ Classification model saved as Finalized_classifier_selectkbest.sav
