Importing Libraries

In [None]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE, mutual_info_classif
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, classification_report
import joblib

Loading Data From DataBase

In [3]:
# Connect to the SQLite database
conn = sqlite3.connect("penguins.db")
query = """
SELECT p.species, p.bill_length_mm, p.bill_depth_mm, p.flipper_length_mm, p.body_mass_g, p.sex, i.name AS island
FROM penguins p
JOIN islands i ON p.island_id = i.island_id
"""
df = pd.read_sql(query, conn)
conn.close()

One-Hot Encoding For Categorical Features

In [4]:
# One-Hot Encode categorical features
encoder = OneHotEncoder(sparse_output=False)
categorical_cols = ['sex', 'island']
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_cols))
df = df.drop(columns=categorical_cols).join(encoded_df)

In [5]:
df

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex_Female,sex_Male,island_Biscoe,island_Dream,island_Torgersen
0,Adelie,39.1,18.7,181.0,3750.0,0.0,1.0,0.0,0.0,1.0
1,Adelie,39.5,17.4,186.0,3800.0,1.0,0.0,0.0,0.0,1.0
2,Adelie,40.3,18.0,195.0,3250.0,1.0,0.0,0.0,0.0,1.0
3,Adelie,36.7,19.3,193.0,3450.0,1.0,0.0,0.0,0.0,1.0
4,Adelie,39.3,20.6,190.0,3650.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
328,Gentoo,47.2,13.7,214.0,4925.0,1.0,0.0,1.0,0.0,0.0
329,Gentoo,46.8,14.3,215.0,4850.0,1.0,0.0,1.0,0.0,0.0
330,Gentoo,50.4,15.7,222.0,5750.0,0.0,1.0,1.0,0.0,0.0
331,Gentoo,45.2,14.8,212.0,5200.0,1.0,0.0,1.0,0.0,0.0


Train - Test Split and Data Scaling

In [6]:
# Defining features and target
X = df.drop(columns=['species'])
y = df['species']

In [None]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Standardize features
scaler = StandardScaler()

# Fit on X_train and transform X_train
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Feature Selection Process

In [8]:
# 1. Filter Method (SelectKBest)
selector = SelectKBest(score_func=mutual_info_classif, k='all')  
selector.fit(X_train_scaled, y_train) 

# Mutual Information Scores
mi_scores = pd.Series(selector.scores_, index=X.columns)

mi_scores.sort_values(ascending=False, inplace=True) 

print("Mutual Information Scores (Filter Method):")
print(mi_scores)

Mutual Information Scores (Filter Method):
flipper_length_mm    0.592096
bill_depth_mm        0.589186
bill_length_mm       0.560094
body_mass_g          0.515296
island_Biscoe        0.411411
island_Dream         0.387965
island_Torgersen     0.113053
sex_Male             0.012188
sex_Female           0.000000
dtype: float64


In [9]:
# 2. Wrapper Method (Recursive Feature Elimination)
rfe_model = RandomForestClassifier(random_state=42)
rfe = RFE(estimator=rfe_model, n_features_to_select=None) 
rfe.fit(X_train_scaled, y_train)

# Extracting feature importance scores 
rfe_features = X.columns[rfe.support_] 
feature_importance = pd.Series(rfe.estimator_.feature_importances_, index=rfe_features)
feature_importance.sort_values(ascending=False, inplace=True)

print("\nRecursive Feature Elimination (RFE) Feature Importance Scores:")
print(feature_importance)


Recursive Feature Elimination (RFE) Feature Importance Scores:
bill_length_mm       0.392176
flipper_length_mm    0.295160
bill_depth_mm        0.168422
island_Dream         0.144242
dtype: float64


In [10]:
# 3. Embedded Method (Feature Importance from RandomForest)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

feature_importance = pd.Series(model.feature_importances_, index=X.columns) 
feature_importance.sort_values(ascending=False, inplace=True)

print("Random Forest Feature Importance (Embedded Method):")
print(feature_importance)

Random Forest Feature Importance (Embedded Method):
bill_length_mm       0.339403
flipper_length_mm    0.271055
bill_depth_mm        0.144621
island_Dream         0.074662
body_mass_g          0.072502
island_Biscoe        0.070673
island_Torgersen     0.020646
sex_Male             0.003245
sex_Female           0.003194
dtype: float64


In [11]:
# 4. Permutation Importance 
perm_importance = permutation_importance(model, X_test_scaled, y_test, scoring="accuracy") 

perm_scores = pd.Series(perm_importance.importances_mean, index=X.columns)
perm_scores.sort_values(ascending=False, inplace=True)

print("Permutation Importance Scores:")
print(perm_scores)

Permutation Importance Scores:
flipper_length_mm    0.262687
bill_length_mm       0.176119
island_Dream         0.149254
bill_depth_mm        0.023881
island_Biscoe        0.005970
island_Torgersen     0.002985
body_mass_g          0.002985
sex_Female           0.000000
sex_Male             0.000000
dtype: float64


In [12]:
# Creating Summary Table for Feature Selection
features = X.columns.tolist()

# function fro marking features for selection decision 
def mark_features(feature, methods):
    return "✅" if feature in methods else "❌"

# Table Creation
feature_table = pd.DataFrame({
    "Feature": features,
    "Mutual Info": [mark_features(f, mi_scores.head(3).index.tolist()) for f in features],
    "RFE": [mark_features(f, rfe_features) for f in features],
    "Random Forest": [mark_features(f, feature_importance.head(3).index.tolist()) for f in features],
    "Permutation": [mark_features(f, perm_scores.head(3).index.tolist()) for f in features]
})

feature_table[" Feature Importance"] = feature_table[["Mutual Info", "RFE", "Random Forest", "Permutation"]].apply(
    lambda row: "Important ✅" if list(row).count("✅") >= 3 else "Not Important ❌", axis=1
)

print("Feature Selection Results")
print(feature_table.to_string(index=False))

Feature Selection Results
          Feature Mutual Info RFE Random Forest Permutation  Feature Importance
   bill_length_mm           ✅   ✅             ✅           ✅         Important ✅
    bill_depth_mm           ✅   ✅             ✅           ❌         Important ✅
flipper_length_mm           ✅   ✅             ✅           ✅         Important ✅
      body_mass_g           ❌   ❌             ❌           ❌     Not Important ❌
       sex_Female           ❌   ❌             ❌           ❌     Not Important ❌
         sex_Male           ❌   ❌             ❌           ❌     Not Important ❌
    island_Biscoe           ❌   ❌             ❌           ❌     Not Important ❌
     island_Dream           ❌   ✅             ❌           ✅     Not Important ❌
 island_Torgersen           ❌   ❌             ❌           ❌     Not Important ❌


In [13]:
# Select features that are marked as "Important ✅"
final_features = feature_table.loc[feature_table[" Feature Importance"] == "Important ✅", "Feature"].tolist()

print("Final Selected Features:")
print(final_features)


Final Selected Features:
['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']


In [43]:
# Hyperparameter grid for RandomForest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', None],
    'random_state': [42]
}

# Using Oversampling (SMOTE) class imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Model Initialization
rf = RandomForestClassifier()

# Using GridSearchCV for finding optimal estimators
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy', verbose=2)
grid_search.fit(X_train_resampled, y_train_resampled)
print("Best Parameters:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {accuracy:.2f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Parameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}
Final Model Accuracy: 0.99

Classification Report:
              precision    recall  f1-score   support

      Adelie       1.00      0.97      0.98        29
   Chinstrap       0.93      1.00      0.97        14
      Gentoo       1.00      1.00      1.00        24

    accuracy                           0.99        67
   macro avg       0.98      0.99      0.98        67
weighted avg       0.99      0.99      0.99        67



In [44]:
# Save the model and scaler
joblib.dump(grid_search.best_estimator_, "penguin_classifier.pkl")
joblib.dump(scaler, "scaler.pkl")

print("Model trained and saved successfully!")

Model trained and saved successfully!


Testing Model

In [45]:
import numpy as np
from joblib import load
import warnings

warnings.filterwarnings('ignore')

# Load the saved model and scaler
model = load("penguin_classifier.pkl")
scaler = load("scaler.pkl")

# Example data for multiple penguins (bill_length, flipper_length, bill_depth)
test_data = np.array([
    [50.0, 200.0, 18.0],  # Example 1
    [45.0, 180.0, 17.0],  # Example 2
    [40.0, 190.0, 19.0],  # Example 3
    [60.0, 210.0, 20.0],  # Example 4
])

# Scale the features using the loaded scaler
scaled_features = scaler.transform(test_data)

# Print the scaled features to see if the transformation is working as expected
print("Scaled features:", scaled_features)

# Use the model to make predictions for all examples
predicted_classes = model.predict(scaled_features)

# Print predictions for all examples
for i, predicted_class in enumerate(predicted_classes):
    print(f"Prediction for penguin {i+1}: {predicted_class}")


Scaled features: [[  1.09915228  92.75303472 -13.07912805]
 [  0.18559614  82.60356005 -13.15048143]
 [ -0.72796     87.67829739 -13.00777467]
 [  2.92626457  97.82777206 -12.93642129]]
Prediction for penguin 1: Chinstrap
Prediction for penguin 2: Adelie
Prediction for penguin 3: Adelie
Prediction for penguin 4: Chinstrap
