In [16]:
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [17]:
# ✅ Step 1: Load Data from Database
db_path = "data/penguins.db"
conn = sqlite3.connect(db_path)
penguins = pd.read_sql("SELECT * FROM penguins_cleaned", conn)
conn.close()


In [18]:
# ✅ Step 2: Drop Unnecessary Columns & Handle Encoding
penguins.drop(columns=["penguin_id"], inplace=True)  # Drop ID column
penguins.dropna(inplace=True)  # Ensure no missing values
penguins["sex"] = penguins["sex"].map({"Male": 1, "Female": 0})  # Encode 'sex' as binary

In [19]:
# Step 3 Check the column names in the dataset
print("Columns in dataset:", penguins.columns.tolist())


Columns in dataset: ['species_id', 'island_id', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex']


In [None]:
# ✅ Step 4: Correlation Analysis (Filter Method)

# Define features and target
X = penguins[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
y = penguins['species_id']  # ✅ Use 'species_id' instead of 'species'

# Compute correlation matrix
correlation_matrix = X.corr()

# Plot heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.show()


KeyError: 'species'

In [21]:
print(penguins.columns)


Index(['species_id', 'island_id', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')


In [None]:

# ✅ Step 5: Mutual Information (Filter Method)
mi_scores = mutual_info_classif(X, y)
mi_df = pd.DataFrame({"Feature": X.columns, "MI Score": mi_scores})
mi_df = mi_df.sort_values(by="MI Score", ascending=False)
print("Mutual Information Scores:\n", mi_df)

Mutual Information Scores:
              Feature  MI Score
3  flipper_length_mm  0.613724
2      bill_depth_mm  0.571623
1     bill_length_mm  0.555825
4        body_mass_g  0.508248
0          island_id  0.488175
5                sex  0.018666


In [None]:
# ✅ Step 6: Recursive Feature Elimination (Wrapper Method) with Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model = LogisticRegression(max_iter=1000)
rfe = RFE(model, n_features_to_select=3)
rfe.fit(X_scaled, y)
rfe_features = X.columns[rfe.support_]
print("Selected Features (RFE):", list(rfe_features))

Selected Features (RFE): ['bill_length_mm', 'bill_depth_mm', 'body_mass_g']


In [None]:
# ✅ Step 7: Lasso Regression (Embedded Method)
lasso = LassoCV().fit(X, y)
lasso_features = X.columns[lasso.coef_ != 0]
print("Selected Features (Lasso):", list(lasso_features))

Selected Features (Lasso): ['bill_length_mm', 'flipper_length_mm', 'body_mass_g']


In [None]:
c

Feature Importance (Random Forest):
              Feature  Importance
1     bill_length_mm    0.364737
3  flipper_length_mm    0.206220
2      bill_depth_mm    0.187398
4        body_mass_g    0.139891
0          island_id    0.093934
5                sex    0.007818


In [None]:
# ✅ Step 9: Select Final Features (Based on Analysis)
final_features = list(set(rfe_features) | set(lasso_features) | set(feature_importances.Feature[:4]))
X_selected = X[final_features]
print("Final Selected Features:", final_features)

Final Selected Features: ['bill_length_mm', 'flipper_length_mm', 'bill_depth_mm', 'body_mass_g']
