In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
df = sns.load_dataset('penguins')

# Check for missing values
print("Missing values in the dataset:")
print(df.isnull().sum())

# Impute missing values
numerical_columns = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
categorical_columns = ['sex', 'island']

num_impute = SimpleImputer(strategy='median')
df[numerical_columns] = num_impute.fit_transform(df[numerical_columns])

cat_impute = SimpleImputer(strategy='most_frequent')
df[categorical_columns] = cat_impute.fit_transform(df[categorical_columns])

# Check again for missing values
print("Missing values after imputation:")
print(df.isnull().sum())

# Separate features and target variable
X = df.drop(columns='species')
y = df['species']

# Encode the target variable
y_encode = LabelEncoder()
y = y_encode.fit_transform(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
ohe_encoded = OneHotEncoder(drop='first', sparse=False)

# ColumnTransformer for categorical and numerical columns
column_trans = ColumnTransformer(
    transformers=[
        ('island', ohe_encoded, ['island']),
        ('sex', ohe_encoded, ['sex'])
    ], remainder='passthrough'
)

# Preprocessing pipeline (imputation, one-hot encoding, scaling)
preprocessor = Pipeline(steps=[
    ('preprocessing', column_trans),
    ('scaler', StandardScaler())
])

# Define classifiers
classifiers = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

# Dictionary to store results
results = {}
pipelines = {}  # To store trained pipelines for predictions

# Train and evaluate classifiers
for name, clf in classifiers.items():
    print(f"\nTraining with {name}...")
    
    # Create a pipeline for each classifier
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', clf)
    ])
    
    # Train the pipeline
    pipeline.fit(X_train, y_train)
    
    # Save the trained pipeline for this classifier
    joblib.dump(pipeline, f'{name}_penguin_classifier.pkl')
    pipelines[name] = pipeline  # Store pipeline for predictions
    
    # Make predictions on the test set
    y_pred = pipeline.predict(X_test)
    
    # Calculate and store performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    cfm = confusion_matrix(y_test, y_pred)
    results[name] = {
        "accuracy": accuracy,
        "classification_report": classification_rep,
        "confusion_matrix": cfm
    }
    
    # Print results
    print(f"Accuracy for {name}: {accuracy}")
    print(f"Classification Report for {name}:\n{classification_rep}")
    print(f"Confusion Matrix for {name}:\n{cfm}")
    

# Summary of results for all classifiers
print("\nSummary of results for all classifiers:")
for name, metrics in results.items():
    print(f"\n{name} Classifier:")
    print(f"Accuracy: {metrics['accuracy']}")
    print(f"Classification Report:\n{metrics['classification_report']}")
    print(f"Confusion Matrix:\n{metrics['confusion_matrix']}")
    print('-'*50)

# Separate Section for Predictions
# Raw input data
raw_data = {
    'bill_length_mm': [44.1],
    'bill_depth_mm': [17.4],
    'flipper_length_mm': [192.0],
    'body_mass_g': [3400.0],
    'sex': ['Female'],
    'island': ['Torgersen']
}
raw_df = pd.DataFrame(raw_data)

# Ensure the input columns match the training data columns
expected_columns = X.columns.tolist()
raw_df = raw_df[expected_columns]

# Impute missing values (if necessary)
raw_df[numerical_columns] = num_impute.transform(raw_df[numerical_columns])
raw_df[categorical_columns] = cat_impute.transform(raw_df[categorical_columns])

# Make predictions using each classifier
print("\nPredictions for raw input data:")
for name, pipeline in pipelines.items():
    y_pred_raw = pipeline.predict(raw_df)
    predicted_class = y_encode.inverse_transform(y_pred_raw)
    print(f"Prediction using {name}: {predicted_class[0]}")


Missing values in the dataset:
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64
Missing values after imputation:
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

Training with Random Forest...
Accuracy for Random Forest: 1.0
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        16
           2       1.00      1.00      1.00        21

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69

Confusion Matrix for Random Forest:
[[32  0  0]
 [ 0 16  0]
 [ 0  0 21]]

Training with Decisio