# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

class AnimalClassifier:
    """Train a Random Forest on an animal dataset and compare it with KNN.

    ``data`` is expected to be a pandas DataFrame that contains a
    ``class_type`` target column; every other column that is not listed in
    the exclusion list of ``_prepare_and_split_data`` is used as a feature.

    Metrics and intermediate tables are collected in ``self.results`` so the
    final text analysis can be produced from them.
    """

    # Display names for the target labels, keyed by the stringified label.
    CLASS_NAMES = {
        '1': 'Mammal', '2': 'Bird', '3': 'Reptile',
        '4': 'Fish', '5': 'Amphibian', '6': 'Bug', '7': 'Invertebrate'
    }

    # Features highlighted in the importance plot and ranked in the analysis.
    ENGINEERED_FEATURES = ('predator_efficiency', 'habitat_mobility')

    # Aggregate entries of the classification report (not per-class rows).
    _REPORT_SUMMARY_KEYS = frozenset({'accuracy', 'macro avg', 'weighted avg'})

    def __init__(self, data):
        """Store the raw data and initialise every attribute set later on."""
        self.data = data
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.rf_model, self.knn_model = None, None
        self.rf_train_predictions, self.rf_test_predictions = None, None
        # ``rf_predictions`` mirrors ``knn_predictions``: test-set predictions.
        self.rf_predictions, self.knn_predictions = None, None
        self.results = {}

    def omega_train_and_evaluate(self):
        """Run the full pipeline and return the critical-analysis text.

        The steps run in a fixed order because each one reads attributes or
        ``self.results`` entries written by the previous ones.
        """
        # Part A: Prepare data and split
        self._prepare_and_split_data()

        # Part B: Configure and train Random Forest
        self._train_random_forest()

        # Part C: Print training performance
        self._evaluate_performance()

        # Part D: Classification report
        self._generate_classification_report()

        # Part E: Annotated confusion matrix heatmap (matplotlib only)
        self._plot_confusion_matrix_no_seaborn()

        # Part F: Feature importance plot
        self._plot_feature_importance_no_seaborn()

        # Part G: Comparison model - K-Nearest Neighbors
        self._train_knn()

        # Part H: Critical analysis output
        return self._generate_critical_analysis()

    def _class_display_name(self, label):
        """Return the readable name for ``label`` or a ``Class <label>`` fallback."""
        return self.CLASS_NAMES.get(str(label), f'Class {label}')

    def _prepare_and_split_data(self):
        """Encode object columns and create a stratified 75/25 train/test split."""
        # Work on a copy so the caller's frame is never modified.
        df = self.data.copy()

        # Select features - exclude non-predictive columns and the target.
        exclude_columns = {
            'animal_name', 'animal_name_normalized',
            'class_type_info', 'class_number_info', 'class_type',
        }
        feature_columns = [col for col in df.columns if col not in exclude_columns]

        # Object-typed feature columns are label-encoded into integers.
        categorical_columns = df[feature_columns].select_dtypes(include=['object']).columns
        for col in categorical_columns:
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

        X = df[feature_columns]
        y = df['class_type']

        # Part A: Split data (75% train, 25% test, random_state=123)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.25, random_state=123, stratify=y
        )

        print("Data preparation completed:")
        print(f"Training set: {self.X_train.shape[0]} samples")
        print(f"Testing set: {self.X_test.shape[0]} samples")
        print(f"Number of features: {self.X_train.shape[1]}")

    def _train_random_forest(self):
        """Fit the Random Forest and cache its train and test predictions."""
        # Part B: Configure Random Forest with specified parameters
        self.rf_model = RandomForestClassifier(
            n_estimators=150,
            max_depth=15,
            min_samples_split=2,
            random_state=123
        )

        self.rf_model.fit(self.X_train, self.y_train)

        # Predictions on both splits are needed for the overfitting gap.
        self.rf_train_predictions = self.rf_model.predict(self.X_train)
        self.rf_test_predictions = self.rf_model.predict(self.X_test)
        self.rf_predictions = self.rf_test_predictions

        print("Random Forest training completed")

    def _evaluate_performance(self):
        """Print and store train/test accuracy and their difference."""
        # Part C: Calculate accuracies
        rf_train_accuracy = accuracy_score(self.y_train, self.rf_train_predictions)
        rf_test_accuracy = accuracy_score(self.y_test, self.rf_test_predictions)
        overfitting_gap = rf_train_accuracy - rf_test_accuracy

        print("\n=== RANDOM FOREST PERFORMANCE ===")
        print(f"Training Accuracy: {rf_train_accuracy:.4f}")
        print(f"Testing Accuracy: {rf_test_accuracy:.4f}")
        print(f"Overfitting Gap: {overfitting_gap:.4f}")

        self.results['rf_train_accuracy'] = rf_train_accuracy
        self.results['rf_test_accuracy'] = rf_test_accuracy
        self.results['overfitting_gap'] = overfitting_gap

    def _generate_classification_report(self):
        """Print the test-set classification report and store per-class metrics."""
        # Part D: Classification report
        print("\n=== CLASSIFICATION REPORT (Random Forest) ===")
        class_report = classification_report(self.y_test, self.rf_test_predictions, output_dict=True)
        print(classification_report(self.y_test, self.rf_test_predictions))

        # Keep only the per-class rows; the summary rows are skipped.
        class_performance = {
            class_label: {
                'precision': metrics['precision'],
                'recall': metrics['recall'],
                'f1': metrics['f1-score']
            }
            for class_label, metrics in class_report.items()
            if class_label not in self._REPORT_SUMMARY_KEYS
        }

        self.results['class_report'] = class_report
        self.results['class_performance'] = class_performance

    def _plot_confusion_matrix_no_seaborn(self):
        """Show an annotated confusion-matrix heatmap using matplotlib only."""
        # Part E: Confusion matrix using matplotlib only.
        # The label set is the union of true and predicted labels and is
        # passed explicitly, so the matrix shape always matches the ticks.
        label_values = np.unique(
            np.concatenate([np.asarray(self.y_test), np.asarray(self.rf_test_predictions)])
        )
        cm = confusion_matrix(self.y_test, self.rf_test_predictions, labels=label_values)
        labels = [self._class_display_name(value) for value in label_values]

        fig, ax = plt.subplots(figsize=(10, 8))

        # Create heatmap using imshow
        im = ax.imshow(cm, cmap='Blues')

        # Add colorbar
        cbar = ax.figure.colorbar(im, ax=ax)
        cbar.ax.set_ylabel('Number of Predictions', rotation=-90, va="bottom")

        # Set ticks and labels
        ax.set_xticks(np.arange(len(labels)))
        ax.set_yticks(np.arange(len(labels)))
        ax.set_xticklabels(labels)
        ax.set_yticklabels(labels)

        # Rotate x labels
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

        # Annotate every cell; switch to white text on dark cells for contrast.
        threshold = cm.max() / 2.0 if cm.size else 0
        for row, col in np.ndindex(*cm.shape):
            ax.text(col, row, cm[row, col], ha="center", va="center",
                    color="white" if cm[row, col] > threshold else "black")

        ax.set_title('Confusion Matrix - Random Forest\n(n_estimators=150, max_depth=15, min_samples_split=2)',
                     fontsize=14, fontweight='bold', pad=20)
        ax.set_xlabel('Predicted Label', fontsize=12)
        ax.set_ylabel('True Label', fontsize=12)

        plt.tight_layout()
        plt.show()

    def _plot_feature_importance_no_seaborn(self):
        """Plot the 12 largest feature importances and store the sorted table.

        ``self.results['feature_importance']`` is stored sorted by importance
        in ascending order (most important feature last).
        """
        # Part F: Feature importance plot
        importance_df = pd.DataFrame({
            'feature': self.X_train.columns,
            'importance': self.rf_model.feature_importances_
        }).sort_values('importance', ascending=True)

        # Ascending order means the top 12 are the last 12 rows.
        top_12 = importance_df.tail(12)

        plt.figure(figsize=(12, 8))

        # Engineered features are drawn in orange, all others in light blue.
        colors = ['orange' if feat in self.ENGINEERED_FEATURES else 'lightblue'
                  for feat in top_12['feature']]

        y_pos = np.arange(len(top_12))
        bars = plt.barh(y_pos, top_12['importance'], color=colors, alpha=0.7)

        # Add value labels on bars
        for bar in bars:
            width = bar.get_width()
            plt.text(width + 0.001, bar.get_y() + bar.get_height() / 2,
                     f'{width:.3f}', ha='left', va='center', fontsize=9)

        plt.yticks(y_pos, top_12['feature'])
        plt.xlabel('Feature Importance Score', fontsize=12)
        plt.title('Top 12 Feature Importances - Random Forest\n(Engineered Features in Orange)',
                  fontsize=14, fontweight='bold', pad=20)
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        plt.show()

        self.results['feature_importance'] = importance_df
        self.results['top_features'] = top_12

    def _train_knn(self):
        """Fit a k=5 KNN on standardised features and store its test accuracy."""
        # Part G: Train KNN model with k=5.
        # The scaler is fitted on the training split only and then applied
        # to the test split.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(self.X_train)
        X_test_scaled = scaler.transform(self.X_test)

        self.knn_model = KNeighborsClassifier(n_neighbors=5)
        self.knn_model.fit(X_train_scaled, self.y_train)

        # Make predictions
        self.knn_predictions = self.knn_model.predict(X_test_scaled)
        knn_accuracy = accuracy_score(self.y_test, self.knn_predictions)

        print("\n=== K-NEAREST NEIGHBORS PERFORMANCE ===")
        print(f"KNN Testing Accuracy (k=5): {knn_accuracy:.4f}")

        self.results['knn_accuracy'] = knn_accuracy

    def _generate_critical_analysis(self):
        """Build the multi-line analysis text from ``self.results``.

        Reads the ``feature_importance`` (sorted ascending by importance),
        ``top_features``, ``class_performance``, ``knn_accuracy``,
        ``rf_test_accuracy`` and ``overfitting_gap`` entries.

        Returns:
            The report as a single newline-joined string.
        """
        # Part H: Critical analysis
        importance_df = self.results['feature_importance']
        class_performance = self.results['class_performance']
        rf_accuracy = self.results['rf_test_accuracy']
        knn_accuracy = self.results['knn_accuracy']
        overfitting_gap = self.results['overfitting_gap']

        analysis = []
        analysis.append("\n" + "="*50)
        analysis.append("CRITICAL MODEL ANALYSIS")
        analysis.append("="*50)

        # 1. Most important feature (last row of the ascending table)
        top_feature_row = importance_df.iloc[-1]
        analysis.append(f"1. Most important feature: '{top_feature_row['feature']}' (importance: {top_feature_row['importance']:.3f})")

        # 2./3. Worst and best class by F1. min()/max() always select a class,
        # including when every score is exactly 1.0 or exactly 0.0.
        if class_performance:
            def f1_of(label):
                return class_performance[label]['f1']

            worst_class = min(class_performance, key=f1_of)
            best_class = max(class_performance, key=f1_of)
            analysis.append(f"2. Worst performing class: {self._class_display_name(worst_class)} (F1: {f1_of(worst_class):.3f})")
            analysis.append(f"3. Best performing class: {self._class_display_name(best_class)} (F1: {f1_of(best_class):.3f})")
        else:
            analysis.append("2. Worst performing class: n/a (no per-class metrics available)")
            analysis.append("3. Best performing class: n/a (no per-class metrics available)")

        # 4. Engineered feature ranking. The rank is the position in
        # descending importance order; the DataFrame index labels are not
        # used because sorting leaves them in their original column order.
        ranked_features = importance_df['feature'].tolist()[::-1]
        importance_by_feature = dict(zip(importance_df['feature'], importance_df['importance']))
        for engineered_feat in self.ENGINEERED_FEATURES:
            if engineered_feat in importance_by_feature:
                rank = ranked_features.index(engineered_feat) + 1
                importance = importance_by_feature[engineered_feat]
                analysis.append(f"4. Your engineered feature '{engineered_feat}' ranked #{rank} (importance: {importance:.3f})")

        # 5. Model comparison
        analysis.append(f"5. Model comparison: KNN (k=5) = {knn_accuracy:.3f} vs RF = {rf_accuracy:.3f}")

        # Additional insights
        analysis.append("\nADDITIONAL INSIGHTS:")
        accuracy_delta = rf_accuracy - knn_accuracy
        if accuracy_delta > 0:
            analysis.append(f"• Random Forest outperforms KNN by {accuracy_delta:.3f} accuracy points")
        elif accuracy_delta < 0:
            analysis.append(f"• KNN outperforms Random Forest by {-accuracy_delta:.3f} accuracy points")
        else:
            analysis.append("• Random Forest and KNN achieve the same test accuracy")
        analysis.append(f"• Overfitting gap is {'acceptable' if overfitting_gap < 0.1 else 'concerning'}: {overfitting_gap:.3f}")

        top_feature_names = set(self.results['top_features']['feature'])
        if any(feat in top_feature_names for feat in self.ENGINEERED_FEATURES):
            analysis.append("• Engineered features are among the top contributors to model performance")
        else:
            analysis.append("• Engineered features have moderate impact on model performance")

        analysis.append("="*50)

        return "\n".join(analysis)

# Output: Matplotlib is building the font cache; this may take a moment.
