Embedded technique part 1: feature selection with Lasso (L1-regularized regression)

In [1]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import numpy as np

# Embedded feature selection, step 1: let an L1-penalised linear model decide
# which predictors to keep. Columns whose fitted coefficient is non-zero are
# reported as "selected".

# Download the red-wine quality table; the file is semicolon-separated.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(url, sep=';')

# Target is the 'quality' column; every other column is a predictor.
y = df['quality']
X = df.drop(columns='quality')

# Standardize the predictors before fitting, so the single `alpha` penalty
# is applied to coefficients that live on a comparable scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit the Lasso model (fit() returns the estimator itself).
lasso = Lasso(alpha=0.01).fit(X_scaled, y)

# Map the positions of the non-zero coefficients back to column names.
selected_features_lasso = X.columns[np.flatnonzero(lasso.coef_)]
print("Features selected by Lasso:", selected_features_lasso.tolist(), sep="\n")

Features selected by Lasso:
['volatile acidity', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'pH', 'sulphates', 'alcohol']


Embedded technique part 2: feature selection with Random Forest feature importances

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Embedded feature selection, step 2: rank predictors by the impurity-based
# importances of a fitted random forest and keep those that score above the
# average importance.

# Download the red-wine quality table; the file is semicolon-separated.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(url, sep=';')

# Target is the 'quality' column; every other column is a predictor.
y = df['quality']
X = df.drop(columns='quality')

# Fit the forest on the raw (unscaled) predictors; the fixed seed keeps the
# importances reproducible across runs.
rf = RandomForestClassifier(random_state=42).fit(X, y)

# One importance score per column of X; the cut-off is their mean.
importances = rf.feature_importances_
threshold = importances.mean()

# Keep only the columns whose importance is strictly above the cut-off.
selected_features_rf = X.columns[importances > threshold]
print("Features selected by Random Forest:", selected_features_rf.tolist(), sep="\n")

Features selected by Random Forest:
['volatile acidity', 'total sulfur dioxide', 'sulphates', 'alcohol']
