#DS Exp 7,8
apply filter feature selection techniques using dataset obtained from dataset in UCI ML

In [16]:
# Initial imports and data loading
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("grapes_new.csv")

# Assuming 'df' contains the features and labels, we separate the features and target
X = df.drop('CLASS', axis=1)
y = df['CLASS']

Data preprocessing and handling missing values

In [17]:
X_train_balanced = X.dropna()
y_train_balanced = y[X_train_balanced.index]

Mutual information based feature selection

In [21]:
from sklearn.feature_selection import mutual_info_classif

# Ensuring X_train_balanced has only numeric data
X_train_balanced = pd.get_dummies(X_train_balanced, drop_first=True)

# Ensuring y_train_balanced is numeric (if it is categorical)
if y_train_balanced.dtype == 'object':
    y_train_balanced = y_train_balanced.astype('category').cat.codes

# Compute Mutual Information between features and labels
mi_scores = mutual_info_classif(X_train_balanced, y_train_balanced)
mi_df = pd.DataFrame({'Feature': X_train_balanced.columns, 'Mutual_Info_Score': mi_scores})

# Setting a threshold to select important features
mi_threshold = 0.01
selected_mi_features = mi_df[mi_df['Mutual_Info_Score'] > mi_threshold]

# Count and display the selected features
n_selected_mi_features = len(selected_mi_features)
print(f"Total features selected: {n_selected_mi_features}")
print(f"Features chosen:\n{selected_mi_features}")


Total features selected: 17
Features chosen:
              Feature  Mutual_Info_Score
0     COLOR_INTENSITY           0.439667
1        RIPENESS_PER           0.698114
2         ALCOHOL_PER           0.364263
3      MALIC_ACID_PER           0.215712
5   ALCALINITY_OF_ASH           0.232628
6   MAGNESIUM_MEASURE           0.167803
7       TOTAL_PHENOLS           0.256008
8          FLAVANOIDS           0.502041
9     PROANTHOCYANINS           0.179042
10                HUE           0.286395
11            PROLINE           0.407579
12        TEMPERATURE           0.863644
13       AVG_HUMIDITY           0.249677
14     FERT_NITRO_PER           0.805711
15       WATER_O2_PER           0.813405
16           WATER_PH           0.545935
17       FER_P2O5_PER           0.258683


Chi Square

In [22]:
from sklearn.feature_selection import chi2

chi2_scores, chi2_pvals = chi2(X_train_balanced, y_train_balanced)
chi2_df = pd.DataFrame({'Feature': X_train_balanced.columns, 'Chi2_Score': chi2_scores, 'P-Value': chi2_pvals})

chi2_threshold = 10
p_value_limit = 0.05

selected_chi2_features = chi2_df[(chi2_df['Chi2_Score'] > chi2_threshold) & (chi2_df['P-Value'] < p_value_limit)]

n_selected_chi2_features = len(selected_chi2_features)
print(f"Features chosen via Chi-Square and p-value filtering: {n_selected_chi2_features}")
print(f"Selected features:\n{selected_chi2_features}")


Features chosen via Chi-Square and p-value filtering: 11
Selected features:
              Feature    Chi2_Score       P-Value
0     COLOR_INTENSITY    148.100611  6.924060e-33
1        RIPENESS_PER     28.974040  5.109368e-07
3      MALIC_ACID_PER     20.083382  4.354608e-05
5   ALCALINITY_OF_ASH     28.475270  6.556523e-07
6   MAGNESIUM_MEASURE     36.703588  1.071315e-08
7       TOTAL_PHENOLS     12.413487  2.015792e-03
8          FLAVANOIDS     50.201153  1.255909e-11
11            PROLINE  13021.687652  0.000000e+00
12        TEMPERATURE     52.146437  4.748376e-12
15       WATER_O2_PER     46.198502  9.292290e-11
17       FER_P2O5_PER    126.341988  3.674396e-28


Pearson coefficient

In [23]:
# Step 1: Calculate Pearson correlation of each feature with the target
correlation_vals = X_train_balanced.corrwith(y_train_balanced)

# Step 2: Create a DataFrame to map feature names to their correlation values
corr_df = pd.DataFrame({'Feature': X_train_balanced.columns, 'Pearson_Correlation': correlation_vals})

# Step 3: Set a minimum correlation threshold
correlation_cutoff = 0.1

# Step 4: Retain only features whose absolute correlation exceeds the threshold
selected_corr_features = corr_df[abs(corr_df['Pearson_Correlation']) > correlation_cutoff]

# Step 5: Sort the selected features by their absolute correlation values
selected_corr_features_sorted = selected_corr_features.sort_values(by='Pearson_Correlation', ascending=False)

# Step 6: Display the final selected features and their Pearson correlation values
n_selected_corr_features = len(selected_corr_features_sorted)
print(f"Total features chosen based on Pearson correlation: {n_selected_corr_features}")
print(f"Features selected:\n{selected_corr_features_sorted}")

Total features chosen based on Pearson correlation: 18
Features selected:
                             Feature  Pearson_Correlation
FER_P2O5_PER            FER_P2O5_PER             0.468916
RIPENESS_PER            RIPENESS_PER             0.454740
TEMPERATURE              TEMPERATURE             0.426633
ALCALINITY_OF_ASH  ALCALINITY_OF_ASH             0.335672
AVG_HUMIDITY            AVG_HUMIDITY             0.133431
HUE                              HUE             0.114155
MALIC_ACID_PER        MALIC_ACID_PER            -0.132241
PROANTHOCYANINS      PROANTHOCYANINS            -0.147289
SOIL_TYPE_archean  SOIL_TYPE_archean            -0.158108
FLAVANOIDS                FLAVANOIDS            -0.282259
TOTAL_PHENOLS          TOTAL_PHENOLS            -0.313190
MAGNESIUM_MEASURE  MAGNESIUM_MEASURE            -0.313197
ASH                              ASH            -0.328715
FERT_NITRO_PER        FERT_NITRO_PER            -0.348544
WATER_O2_PER            WATER_O2_PER            -0.44596