In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Load the dataset from UCI ML Repository
url = "/content/winequality-red[1].csv"
df = pd.read_csv(url, delimiter=';')

# Display first few rows
print(df.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import train_test_split

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(url, delimiter=';')

# Convert quality into binary classification (Good: 7-8, Bad: 3-6) for Chi-Square test
df['quality_category'] = np.where(df['quality'] >= 7, 1, 0)

# Separate features and target
X = df.drop(columns=['quality', 'quality_category'])
y = df['quality_category']  # Binary target for Chi-Square test
y_continuous = df['quality']  # Continuous target for ANOVA F-test

# Split data into train and test sets (not required for feature selection but good practice)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ============= 1️⃣ Chi-Square Test (For Categorical Target) =============
chi2_selector = SelectKBest(score_func=chi2, k=5)  # Select top 5 features
X_new_chi2 = chi2_selector.fit_transform(X_train, y_train)

# Print selected features
chi2_selected_features = X.columns[chi2_selector.get_support()]
print("Selected Features using Chi-Square Test:", chi2_selected_features)

# ============= 2️⃣ ANOVA F-test (Fisher’s Score) =============
anova_selector = SelectKBest(score_func=f_classif, k=5)  # Select top 5 features
X_new_anova = anova_selector.fit_transform(X_train, y_continuous[:len(X_train)])

# Print selected features
anova_selected_features = X.columns[anova_selector.get_support()]
print("Selected Features using ANOVA F-test (Fisher’s Score):", anova_selected_features)

Selected Features using Chi-Square Test: Index(['fixed acidity', 'citric acid', 'free sulfur dioxide',
       'total sulfur dioxide', 'alcohol'],
      dtype='object')
Selected Features using ANOVA F-test (Fisher’s Score): Index(['citric acid', 'residual sugar', 'free sulfur dioxide', 'density',
       'alcohol'],
      dtype='object')
