In [104]:
pip install ucimlrepo



In [105]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [106]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

def get_top_mutual_info_features(X, y, top_k=5, discrete_features='auto', random_state=None):
    """
    Rank the features in X by mutual information with the target y and return the best ones.

    Parameters:
    - X (pd.DataFrame): Feature dataset; its column labels are used to label the scores
    - y (array-like): Target variable (class labels)
    - top_k (int): Number of top features to return based on MI scores
    - discrete_features (bool, array-like, or 'auto'): Forwarded unchanged to mutual_info_classif
    - random_state (int, RandomState instance or None): Forwarded unchanged to
      mutual_info_classif. Pass an int to get the same ranking on every run;
      the default (None) keeps the previous, unseeded behaviour.

    Returns:
    - top_features (pd.Index): Labels of the top_k features with the highest MI scores,
      ordered from highest to lowest score. If top_k exceeds the number of columns,
      all column labels are returned.
    """
    # Calculate mutual information scores (one score per column of X)
    mi_scores = mutual_info_classif(
        X, y, discrete_features=discrete_features, random_state=random_state
    )

    # Label the scores with the column names and sort best-first
    mi_series = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

    # Only the labels are returned; the scores themselves are not exposed to callers
    return mi_series.index[:top_k]


In [107]:
from sklearn.feature_selection import f_classif
'''
F-score: how well each feature separates the classes.
A p-value is the probability of observing the data (or something more extreme) if the null hypothesis is true.
Here the null hypothesis is that the feature does not help to distinguish the classes, so the p-value tells us how plausible it is that a feature is NOT useful. The higher the p-value, the lower the significance of that feature; a useful feature has a high F-score and a low p-value.'''
def get_top_fisher_score_features(X,y,top_k=5):
    """
    Rank the columns of X by the first array returned from f_classif(X, y)
    and return the labels of the top_k highest-scoring columns.

    Parameters:
    - X (pd.DataFrame): Feature dataset; its column labels name the scores
    - y (array-like): Target variable (class labels)
    - top_k (int): Number of feature labels to return

    Returns:
    - pd.Index: The top_k column labels, ordered from highest to lowest score
    """
    # f_classif yields a pair of arrays; only the first (the F-statistics) drives
    # the ranking, the second (the p-values) is not used here.
    f_statistics, _p_values = f_classif(X, y)

    ranked = pd.Series(f_statistics, index=X.columns)
    ranked = ranked.sort_values(ascending=False)
    return ranked.index[:top_k]






In [108]:
pip install skrebate



In [109]:
# Pearson correlation feature selection method
def get_top_pearson_features(X,y,top_k=5):
    """
    Rank the columns of X by the absolute value of their correlation with y
    (as computed by DataFrame.corrwith) and return the strongest ones.

    Parameters:
    - X (pd.DataFrame): Feature dataset
    - y (pd.Series): Target values; corrwith pairs rows of X and y by index label
    - top_k (int): Number of feature labels to return

    Returns:
    - pd.Index: The top_k column labels, ordered from strongest to weakest
      absolute correlation
    """
    # The sign is discarded: a strong negative correlation ranks as high as a strong positive one
    strength = X.corrwith(y).abs()
    ranked = strength.sort_values(ascending=False)
    return ranked.index[:top_k]

In [110]:
#ReliefF feature selection method
from skrebate import ReliefF

def get_top_relief_feature(X,y,top_k=5,n_neighbors=100):
    """
    Rank the columns of X by ReliefF feature importance and return the best ones.

    Parameters:
    - X (pd.DataFrame): Feature dataset; its column labels name the scores
    - y (array-like): Target variable (class labels); a pandas Series, NumPy
      array or plain sequence is accepted
    - top_k (int): Number of feature labels to return
    - n_neighbors (int or float): Forwarded to ReliefF as ``n_neighbors``;
      defaults to 100, the value that was previously hard-coded

    Returns:
    - pd.Index: The top_k column labels, ordered from highest to lowest
      ReliefF importance
    """
    # n_features_to_select is set to the full column count so that an importance
    # is obtained for every feature; the top_k cut is applied afterwards.
    relief = ReliefF(n_neighbors=n_neighbors, n_features_to_select=X.shape[1])

    # ReliefF is handed plain NumPy arrays rather than pandas objects
    relief.fit(X.to_numpy(), np.asarray(y))

    relief_score = pd.Series(relief.feature_importances_, index=X.columns)
    relief_score = relief_score.sort_values(ascending=False)
    return relief_score.index[:top_k]

In [111]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# Baseline models compared on every dataset, keyed by the label shown in the
# results table. Estimators that accept a seed are pinned to 42.
classifiers = dict(
    GaussianNB=GaussianNB(),
    DecisionTree=DecisionTreeClassifier(random_state=42),
    GradientBoosting=GradientBoostingClassifier(random_state=42),
    RandomForest=RandomForestClassifier(random_state=42),
    # max_iter raised to 5000 for the two iterative solvers below
    SVM=SVC(max_iter=5000, random_state=42),
    KNN=KNeighborsClassifier(),
    LogisticRegression=LogisticRegression(max_iter=5000, random_state=42),
)


In [112]:
#Function to evaluate accuracy,precision,recall and f1-score using above classifiers and then using voting classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

def _score_predictions(clf_name, y_true, y_pred):
    """Build one results row: accuracy plus weighted precision, recall and F1 for y_pred."""
    return {
        "Classifier": clf_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average="weighted", zero_division=0),
        "Recall": recall_score(y_true, y_pred, average="weighted", zero_division=0),
        "F1_Score": f1_score(y_true, y_pred, average="weighted", zero_division=0),
    }


def evaluate_with_voting(X, y, classifiers, test_size=0.2, random_state=42):
    """
    Evaluates multiple classifiers + a hard-voting VotingClassifier on given dataset.

    The data is split once (stratified on y); every model is trained on the
    training part and scored on the held-out part. The estimators passed in
    are never fitted themselves: each one is cloned first, so repeated calls
    with the same `classifiers` dict do not carry fitted state from one
    dataset to the next.

    Parameters:
        X (pd.DataFrame): Features
        y (pd.Series): Target labels
        classifiers (dict): Mapping of display name -> unfitted estimator
        test_size (float): Test split ratio
        random_state (int): Random seed used for the train/test split

    Returns:
        pd.DataFrame: One row per classifier, followed by a final
        "VotingClassifier" row, with columns Classifier, Accuracy,
        Precision, Recall and F1_Score
    """
    # Imported locally so this function does not depend on an extra top-level import
    from sklearn.base import clone

    # Split into train/test, keeping the class proportions of y in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    results = []

    # --- Evaluate individual classifiers ---
    for clf_name, clf in classifiers.items():
        # Fit a fresh copy so the caller's estimator object stays unfitted
        model = clone(clf)
        model.fit(X_train, y_train)
        results.append(_score_predictions(clf_name, y_test, model.predict(X_test)))

    # --- Voting Classifier (combine all models) ---
    voting_clf = VotingClassifier(
        estimators=list(classifiers.items()),
        voting='hard'   # could also try 'soft' if probabilities supported
    )
    voting_clf.fit(X_train, y_train)
    results.append(
        _score_predictions("VotingClassifier", y_test, voting_clf.predict(X_test))
    )

    return pd.DataFrame(results)


In [113]:
#Wine Dataset
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler

wine=load_wine()

# Assemble features and label into one frame, then separate them again
df=pd.DataFrame(wine.data,columns=wine.feature_names)
df["target"]=wine.target

X=df.drop(['target'],axis=1)
wine_y=df['target']

# Standardise every feature column; fit_transform returns a bare array,
# so the original column labels are re-attached afterwards.
# NOTE(review): the scaler and the feature selectors below are fit on the full
# dataset, before evaluate_with_voting makes its train/test split.
original_columns=X.columns
scalar=StandardScaler()
X=pd.DataFrame(scalar.fit_transform(X),columns=original_columns)

#Feature selection from different methods
wine_main_feature=get_top_mutual_info_features(X,wine_y,5)
wine_fisher_feature=get_top_fisher_score_features(X,wine_y,5)
wine_pearson_feature=get_top_pearson_features(X,wine_y,5)
wine_relief_feature=get_top_relief_feature(X,wine_y,5)

# Column subsets of the standardised data, one per selection method
wine_main_feature_data=X[wine_main_feature]
wine_fisher_feature_data=X[wine_fisher_feature]
wine_pearson_feature_data=X[wine_pearson_feature]
wine_relief_feature_data=X[wine_relief_feature]


In [118]:
#Breast Cancer Dataset
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Fetch the bundled dataset and put features plus label into a single frame
cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df["target"] = cancer.target  # 0 = malignant, 1 = benign

# Quick sanity check of size and class balance
print("Shape:", df.shape)
print("Target distribution:\n", df["target"].value_counts())

# Separate the label from the predictors
cancer_y = df["target"]
X = df.drop(columns="target")

# Standardise the predictors (matters for SVM, Logistic Regression, KNN, Relief, etc.)
# and restore the column labels that fit_transform drops.
# NOTE(review): the scaler and the selectors below are fit on the full dataset,
# before evaluate_with_voting makes its train/test split.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

# Top-8 feature labels according to each selection method
cancer_mutual_feature = get_top_mutual_info_features(X, cancer_y, 8)
cancer_fisher_feature = get_top_fisher_score_features(X, cancer_y, 8)
cancer_pearson_feature = get_top_pearson_features(X, cancer_y, 8)
cancer_relief_feature = get_top_relief_feature(X, cancer_y, 8)

# Matching column subsets of the standardised data
cancer_mutual_feature_data = X.loc[:, cancer_mutual_feature]
cancer_fisher_feature_data = X.loc[:, cancer_fisher_feature]
cancer_pearson_feature_data = X.loc[:, cancer_pearson_feature]
cancer_relief_feature_data = X.loc[:, cancer_relief_feature]

Shape: (569, 31)
Target distribution:
 target
1    357
0    212
Name: count, dtype: int64


In [115]:
#Spambase Dataset
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import StandardScaler

# Dataset id 94 in the UCI repository
spambase = fetch_ucirepo(id=94)

featuresData = spambase.data.features
targetData = spambase.data.targets

# Put features and label side by side, then separate the 'Class' column as the target
spambase_full = pd.concat([featuresData, targetData], axis=1)
spambase_y = spambase_full['Class']
X = spambase_full.drop(columns=['Class'])

# Standardise every feature column and re-attach the original column labels
# NOTE(review): the scaler and the selectors below are fit on the full dataset,
# before evaluate_with_voting makes its train/test split.
original_columns = X.columns
scalar = StandardScaler()
X = pd.DataFrame(scalar.fit_transform(X), columns=original_columns)

# Top-5 feature labels according to each selection method
spambase_main_feature = get_top_mutual_info_features(X, spambase_y, 5)
spambase_fisher_feature = get_top_fisher_score_features(X, spambase_y, 5)
spambase_pearson_feature = get_top_pearson_features(X, spambase_y, 5)
spambase_relief_feature = get_top_relief_feature(X, spambase_y, 5)

# Matching column subsets of the standardised data
spambase_main_feature_data = X.loc[:, spambase_main_feature]
spambase_fisher_feature_data = X.loc[:, spambase_fisher_feature]
spambase_pearson_feature_data = X.loc[:, spambase_pearson_feature]
spambase_relief_feature_data = X.loc[:, spambase_relief_feature]

In [119]:
# Breast cancer: score every classifier (and the voting ensemble) on each feature subset.
# evaluate_with_voting performs its own stratified train/test split, so no
# separate split is made here.
cancer_feature_sets = [
    ("For Cancer Dataset - Mutual Information: \n\n", cancer_mutual_feature_data),
    ("For Cancer Dataset - Fisher Score: \n\n", cancer_fisher_feature_data),
    ("For Cancer Dataset - Pearson Correlation : \n\n", cancer_pearson_feature_data),
    ("For Cancer Dataset - Relief : \n\n", cancer_relief_feature_data),
]
for heading, feature_data in cancer_feature_sets:
    cancer_results = evaluate_with_voting(feature_data, cancer_y, classifiers)
    print(heading, cancer_results)


For Cancer Dataset - Mutual Information: 

            Classifier  Accuracy  Precision    Recall  F1_Score
0          GaussianNB  0.938596   0.939042  0.938596  0.938743
1        DecisionTree  0.912281   0.913671  0.912281  0.912683
2    GradientBoosting  0.947368   0.947440  0.947368  0.947087
3        RandomForest  0.964912   0.964912  0.964912  0.964912
4                 SVM  0.964912   0.965185  0.964912  0.964725
5                 KNN  0.947368   0.947368  0.947368  0.947368
6  LogisticRegression  0.947368   0.948462  0.947368  0.947610
7    VotingClassifier  0.947368   0.947368  0.947368  0.947368
For Cancer Dataset - Fisher Score: 

            Classifier  Accuracy  Precision    Recall  F1_Score
0          GaussianNB  0.929825   0.931066  0.929825  0.930146
1        DecisionTree  0.921053   0.923417  0.921053  0.921574
2    GradientBoosting  0.956140   0.956073  0.956140  0.956027
3        RandomForest  0.947368   0.947368  0.947368  0.947368
4                 SVM  0.956140   0.

In [120]:
# Wine: score every classifier (and the voting ensemble) on each feature subset.
# evaluate_with_voting performs its own stratified train/test split, so no
# separate split is made here.
wine_feature_sets = [
    ("For Wine Dataset - Mutual Information : \n\n", wine_main_feature_data),
    ("For Wine Dataset - Fisher Score: \n\n", wine_fisher_feature_data),
    ("For Wine Dataset - Pearson Correlation: \n\n", wine_pearson_feature_data),
    ("For Wine Dataset - Relief: \n\n", wine_relief_feature_data),
]
for heading, feature_data in wine_feature_sets:
    wine_results = evaluate_with_voting(feature_data, wine_y, classifiers)
    print(heading, wine_results)



For Wine Dataset - Mutual Information : 

            Classifier  Accuracy  Precision    Recall  F1_Score
0          GaussianNB  0.944444   0.949106  0.944444  0.943525
1        DecisionTree  1.000000   1.000000  1.000000  1.000000
2    GradientBoosting  1.000000   1.000000  1.000000  1.000000
3        RandomForest  1.000000   1.000000  1.000000  1.000000
4                 SVM  0.972222   0.974359  0.972222  0.972263
5                 KNN  0.972222   0.974359  0.972222  0.972263
6  LogisticRegression  1.000000   1.000000  1.000000  1.000000
7    VotingClassifier  1.000000   1.000000  1.000000  1.000000
For Wine Dataset - Fisher Score: 

            Classifier  Accuracy  Precision    Recall  F1_Score
0          GaussianNB  0.944444   0.949106  0.944444  0.943525
1        DecisionTree  1.000000   1.000000  1.000000  1.000000
2    GradientBoosting  1.000000   1.000000  1.000000  1.000000
3        RandomForest  0.972222   0.974359  0.972222  0.972263
4                 SVM  0.972222   0.974

In [121]:
# Spambase: score every classifier (and the voting ensemble) on each feature subset.
# evaluate_with_voting performs its own stratified train/test split, so no
# separate split is made here.
spambase_feature_sets = [
    ("For Spambase Dataset - Mutual Information : \n\n", spambase_main_feature_data),
    ("For Spambase Dataset - Fisher Score: \n\n", spambase_fisher_feature_data),
    ("For Spambase Dataset - Pearson Correlation: \n\n", spambase_pearson_feature_data),
    ("For Spambase Dataset - Relief: \n\n", spambase_relief_feature_data),
]
for heading, feature_data in spambase_feature_sets:
    spambase_results = evaluate_with_voting(feature_data, spambase_y, classifiers)
    print(heading, spambase_results)

For Spambase Dataset - Mutual Information : 

            Classifier  Accuracy  Precision    Recall  F1_Score
0          GaussianNB  0.751357   0.791085  0.751357  0.723713
1        DecisionTree  0.861021   0.861021  0.861021  0.861021
2    GradientBoosting  0.869707   0.869684  0.869707  0.868432
3        RandomForest  0.890337   0.890429  0.890337  0.889444
4                 SVM  0.849077   0.850383  0.849077  0.846436
5                 KNN  0.853420   0.853074  0.853420  0.851937
6  LogisticRegression  0.827362   0.830180  0.827362  0.823011
7    VotingClassifier  0.878393   0.881508  0.878393  0.876118
For Spambase Dataset - Fisher Score: 

            Classifier  Accuracy  Precision    Recall  F1_Score
0          GaussianNB  0.796960   0.818970  0.796960  0.783297
1        DecisionTree  0.832790   0.832073  0.832790  0.830926
2    GradientBoosting  0.839305   0.840808  0.839305  0.836166
3        RandomForest  0.843648   0.842892  0.843648  0.842325
4                 SVM  0.832790