In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import joblib
import os
from typing import Tuple, Dict, Any, List, Optional
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from abc import ABC, abstractmethod
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore')


In [2]:
# Loading the data
# Read the pre-cleaned tweet dataset; the path is relative to the notebook's
# working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, presumably the
# index written out when the CSV was saved -- confirm whether it should be
# dropped or consumed with index_col=0.
cleaned_tweets_df = pd.read_csv('data/cleaned_apple_tweets.csv')
# Bare last expression so the notebook renders the first five rows.
cleaned_tweets_df.head()

Unnamed: 0,tweet,product,tokens,processed_tweet,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,"['g', 'iphon', 'hr', 'tweet', 'dead', 'need', ...",g iphon hr tweet dead need upgrad plugin station,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,"['know', 'awesom', 'ipadiphon', 'app', 'youll'...",know awesom ipadiphon app youll like appreci d...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,"['wait', 'also', 'sale']",wait also sale,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,"['hope', 'year', 'festiv', 'isnt', 'crashi', '...",hope year festiv isnt crashi year iphon app,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,"['great', 'stuff', 'fri', 'marissa', 'mayer', ...",great stuff fri marissa mayer googl tim oreill...,Positive emotion


# Base Data Processor Class

This module provides a base class for data preprocessing operations with the following functionality:

## Features

- **Data Loading**: Supports loading data from either a file path or an existing DataFrame
- **Basic Preprocessing**:
  - Handles missing values in key columns
  - Cleans string labels by stripping whitespace
- **Label Encoding**:
  - Converts string labels to numeric values
  - Provides mapping between numeric labels and original string labels

## Class Overview

The `BaseDataProcessor` class serves as a foundation for more specialized data processing implementations. It includes:
- A LabelEncoder for converting string labels to numeric values
- Placeholder for a vectorizer (to be implemented in subclasses)
- Tracking of feature names

In [3]:
class BaseDataProcessor:
    """Shared loading, cleaning and label-encoding helpers for the tweet data.

    The class expects frames with a ``processed_tweet`` text column and a
    ``sentiment`` label column (both are referenced by ``preprocess_data``).
    """

    def __init__(self):
        # Fitted by encode_labels(); afterwards holds the string <-> int mapping.
        self.label_encoder = LabelEncoder()
        # Not set by this class; subclasses are expected to assign a vectorizer.
        self.vectorizer = None
        # Not set by this class; subclasses fill it in once a vectorizer is fitted.
        self.feature_names = None

    def load_data(self, data_path: Optional[str] = None,
                  data_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """Return the dataset from an in-memory frame or a CSV file.

        Args:
            data_path: Path to a CSV file; used only when ``data_df`` is None.
            data_df: Already-loaded frame; takes precedence over ``data_path``
                and is returned as-is (not copied).

        Returns:
            The supplied frame, or the frame read from ``data_path``.

        Raises:
            ValueError: If neither argument is provided.
        """
        if data_df is not None:
            return data_df
        if data_path is not None:
            return pd.read_csv(data_path)
        raise ValueError("Either data_path or data_df must be provided")

    def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drop incomplete rows and normalise the sentiment labels.

        Args:
            df: Frame containing ``processed_tweet`` and ``sentiment`` columns.

        Returns:
            A new frame without rows missing either column and with
            surrounding whitespace stripped from ``sentiment``. The input
            frame is left untouched.
        """
        # Handle missing values. The explicit copy makes the column assignment
        # below operate on an independent frame, which avoids pandas'
        # SettingWithCopyWarning on the result of dropna().
        cleaned = df.dropna(subset=['processed_tweet', 'sentiment']).copy()

        # Clean sentiment labels
        cleaned['sentiment'] = cleaned['sentiment'].str.strip()

        return cleaned

    def encode_labels(self, y: pd.Series) -> np.ndarray:
        """Fit the label encoder on ``y`` and return the encoded labels.

        Note that every call re-fits the encoder, so the mapping reflects the
        most recent ``y`` only.
        """
        return self.label_encoder.fit_transform(y)

    def get_label_mapping(self) -> Dict[int, str]:
        """Return ``{encoded_int: original_label}`` for the fitted encoder.

        Requires a prior call to ``encode_labels`` (reads ``classes_``).
        """
        return dict(enumerate(self.label_encoder.classes_))

# Feature Extractor Class

Extends `BaseDataProcessor` to handle text feature extraction using different vectorization techniques.

## Key Functionality

- **Vectorizer Configuration**: Supports both TF-IDF and Count Vectorizer methods
- **Feature Transformation**: Converts text data into numerical features
- **Feature Analysis**: Provides feature importance extraction for model interpretation

## Configuration Options

### Vectorizer Types
1. **TF-IDF** (`vectorizer_type='tfidf'`)
   - Default option with unigram features
   - Removes stopwords and converts to lowercase
2. **Count Vectorizer** (`vectorizer_type='count'`)
   - Includes both unigrams and bigrams
   - Same text normalization as TF-IDF

### Parameters
- `max_features`: Controls vocabulary size (default: 1000)
- `ngram_range`:
  - (1,1) for TF-IDF (unigrams only)
  - (1,2) for Count Vectorizer (unigrams + bigrams)

## Core Methods

### `fit_transform_features`
- Transforms text data into feature matrices
- Handles both training and test data with proper feature alignment
- Stores feature names for reference

### `get_feature_importance`
- Works with:
  - Tree-based models (using `feature_importances_`)
  - Linear models (using absolute coefficients)
- Returns ranked features DataFrame

In [4]:
class FeatureExtractor(BaseDataProcessor):
    """Turn tweet text into a dense numeric feature matrix.

    Two vectorizers are supported, selected by ``vectorizer_type``:
    ``'tfidf'`` (unigrams) and ``'count'`` (unigrams + bigrams). Both lowercase
    the text, drop English stop words and cap the vocabulary at
    ``max_features`` terms.
    """

    def __init__(self, vectorizer_type: str = 'tfidf', max_features: int = 1000):
        """Store the configuration and build the matching vectorizer.

        Raises:
            ValueError: If ``vectorizer_type`` is neither 'tfidf' nor 'count'.
        """
        super().__init__()
        self.vectorizer_type = vectorizer_type
        self.max_features = max_features
        self._initialize_vectorizer()

    def _initialize_vectorizer(self):
        """Create the vectorizer selected by ``self.vectorizer_type``."""
        if self.vectorizer_type == 'tfidf':
            self.vectorizer = TfidfVectorizer(
                max_features=self.max_features,
                stop_words='english',
                lowercase=True,
                ngram_range=(1, 1)
            )
        elif self.vectorizer_type == 'count':
            self.vectorizer = CountVectorizer(
                max_features=self.max_features,
                stop_words='english',
                lowercase=True,
                ngram_range=(1, 2)
            )
        else:
            raise ValueError("vectorizer_type must be 'tfidf' or 'count'")

    def _fitted_feature_names(self) -> List[str]:
        """Return the fitted vocabulary terms as a list.

        ``get_feature_names`` was removed from scikit-learn's vectorizers in
        favour of ``get_feature_names_out``; prefer the new method and fall
        back to the old one so the notebook runs on either side of the change.
        """
        getter = getattr(self.vectorizer, 'get_feature_names_out', None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        # Normalise to a list: the new API returns an ndarray, the old a list.
        return list(getter())

    def fit_transform_features(self, X_train: pd.Series,
                               X_test: pd.Series = None) -> Tuple[np.ndarray, Optional[np.ndarray]]:
        """Fit the vectorizer on the training text and vectorize both splits.

        Args:
            X_train: Training documents; the vocabulary is learned from these only.
            X_test: Optional held-out documents, transformed with the fitted
                vocabulary.

        Returns:
            ``(train_matrix, test_matrix)`` as dense arrays; ``test_matrix`` is
            None when ``X_test`` is not given.
        """
        X_train_vec = self.vectorizer.fit_transform(X_train)
        self.feature_names = self._fitted_feature_names()

        if X_test is not None:
            X_test_vec = self.vectorizer.transform(X_test)
            return X_train_vec.toarray(), X_test_vec.toarray()

        return X_train_vec.toarray(), None

    def get_feature_importance(self, model, top_n: int = 20) -> pd.DataFrame:
        """Rank vocabulary terms by how much a fitted model relies on them.

        Args:
            model: Fitted estimator exposing ``feature_importances_`` or ``coef_``.
            top_n: Number of highest-ranked rows to return.

        Returns:
            Frame with ``feature`` and ``importance`` columns, sorted by
            descending importance and truncated to ``top_n`` rows.

        Raises:
            ValueError: If the model exposes neither attribute.
        """
        if hasattr(model, 'feature_importances_'):
            importance = model.feature_importances_
        elif hasattr(model, 'coef_'):
            # Average the absolute coefficients over axis 0 (presumably one row
            # per class) to get a single score per feature.
            importance = np.abs(model.coef_).mean(axis=0)
        else:
            raise ValueError("Model doesn't have feature importance or coefficients")

        feature_importance = pd.DataFrame({
            'feature': self.feature_names,
            'importance': importance
        }).sort_values('importance', ascending=False)

        return feature_importance.head(top_n)

# Data Splitter Class

Handles stratified splitting of datasets into train, validation, and test sets with reproducible randomization.

## Key Features

- **Stratified Splitting**: Maintains original class distribution in all splits
- **Flexible Sizing**: Configurable test and validation set sizes
- **Reproducibility**: Fixed random state for consistent results across runs

## Splitting Strategy

1. **First Split**:
   - Separates test set from the full dataset (`test_size`)
   - Remaining data kept for training+validation

2. **Second Split**:
   - Divides the remaining data into training and validation sets
   - Automatically adjusts validation size relative to remaining data

```text
# Visual representation of the splitting logic (defaults: test_size=0.2, val_size=0.2)
Full Dataset
├── Test Set (20%)
└── Training+Validation (80%)
    ├── Training Set (60%)
    └── Validation Set (20%)
```

In [5]:
class DataSplitter:
    """Stratified train / validation / test splitter with a fixed seed.

    ``test_size`` and ``val_size`` are both fractions of the full dataset:
    the validation fraction is rescaled internally before the second split.
    """

    def __init__(self, test_size: float = 0.2, val_size: float = 0.2, random_state: int = 42):
        self.test_size, self.val_size, self.random_state = test_size, val_size, random_state

    def split_data(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Carve the data into train, validation and test partitions.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test)``; both splits
            are stratified on the labels and use ``self.random_state``.
        """
        seed = self.random_state
        holdout_fraction = self.test_size

        # Stage 1: peel off the test partition.
        X_rest, X_test, y_rest, y_test = train_test_split(
            X, y,
            test_size=holdout_fraction,
            random_state=seed,
            stratify=y,
        )

        # Stage 2: val_size is expressed against the full dataset, so rescale
        # it to a fraction of what is left after removing the test partition.
        relative_val_fraction = self.val_size / (1 - holdout_fraction)
        X_train, X_val, y_train, y_val = train_test_split(
            X_rest, y_rest,
            test_size=relative_val_fraction,
            random_state=seed,
            stratify=y_rest,
        )

        return X_train, X_val, X_test, y_train, y_val, y_test

# Model Evaluator Class

Provides comprehensive model evaluation capabilities including metrics calculation, visualization, and model comparison.

## Key Features

- **Multi-metric Evaluation**: Computes accuracy, classification report, and confusion matrix
- **Results Tracking**: Stores evaluation results for multiple models
- **Visualization**: Includes built-in plotting functions
- **Model Comparison**: Quantitative and visual comparison of model performance

## Evaluation Metrics

| Metric | Description | Output Format |
|--------|-------------|---------------|
| Accuracy | Overall classification accuracy | float (0-1) |
| Classification Report | Precision, recall, f1-score per class | sklearn string report |
| Confusion Matrix | Class-wise prediction counts | 2D numpy array |
| Predictions | Raw model predictions | numpy array |

## Core Methods

### `evaluate_model`
```python
evaluate_model(model, X_test, y_test, model_name) -> dict
```

In [6]:
class ModelEvaluator:
    """Compute, store and visualise evaluation results for fitted models.

    Results are kept in ``self.results``, keyed by the model name passed to
    ``evaluate_model``; evaluating the same name twice overwrites the entry.
    """

    def __init__(self):
        # model_name -> dict with accuracy, report, confusion matrix, predictions
        self.results = {}

    def evaluate_model(self, model, X_test: np.ndarray, y_test: np.ndarray, model_name: str) -> Dict[str, Any]:
        """Score ``model`` on the given data and remember the outcome.

        Args:
            model: Fitted object exposing ``predict``.
            X_test: Feature matrix to predict on.
            y_test: True labels aligned with ``X_test``.
            model_name: Key under which the results are stored.

        Returns:
            Dict with keys ``accuracy``, ``classification_report``,
            ``confusion_matrix`` and ``predictions``.
        """
        y_pred = model.predict(X_test)

        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'classification_report': classification_report(y_test, y_pred),
            'confusion_matrix': confusion_matrix(y_test, y_pred),
            'predictions': y_pred
        }

        self.results[model_name] = results
        return results

    def plot_confusion_matrix(self, model_name: str, labels: Optional[List[str]] = None):
        """Draw the stored confusion matrix of ``model_name`` as a heatmap.

        Args:
            model_name: Name previously passed to ``evaluate_model``.
            labels: Tick labels for both axes; when omitted, seaborn's default
                automatic labelling is used.

        Raises:
            ValueError: If no results are stored under ``model_name``.
        """
        if model_name not in self.results:
            raise ValueError(f"Model {model_name} not found in results")

        cm = self.results[model_name]['confusion_matrix']

        # seaborn expects "auto", a bool, an int or a list for tick labels;
        # None is not one of them, so map "no labels given" to the default.
        tick_labels = 'auto' if labels is None else labels

        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title(f'Confusion Matrix - {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.tight_layout()
        plt.show()

    def compare_models(self) -> pd.DataFrame:
        """Return one row per evaluated model, best accuracy first.

        Returns:
            Frame with ``Model`` and ``Accuracy`` columns. When nothing has
            been evaluated yet the frame is empty but still has both columns.
        """
        rows = [
            {'Model': name, 'Accuracy': results['accuracy']}
            for name, results in self.results.items()
        ]
        # Naming the columns keeps sort_values working on an empty result set
        # instead of failing with a KeyError on the missing 'Accuracy' column.
        comparison = pd.DataFrame(rows, columns=['Model', 'Accuracy'])

        return comparison.sort_values('Accuracy', ascending=False)

    def plot_model_comparison(self):
        """Plot a bar chart of the accuracies returned by ``compare_models``."""
        comparison = self.compare_models()

        plt.figure(figsize=(10, 6))
        bars = plt.bar(comparison['Model'], comparison['Accuracy'])
        plt.title('Model Accuracy Comparison')
        plt.xlabel('Model')
        plt.ylabel('Accuracy')
        plt.xticks(rotation=45)

        # Add value labels on bars, centred and nudged slightly above each one
        for bar, acc in zip(bars, comparison['Accuracy']):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
                     f'{acc:.3f}', ha='center', va='bottom')

        plt.tight_layout()
        plt.show()

# Implementation Testing

This section demonstrates the usage of the base classes through a practical test script.

## Test Script Overview

The script verifies:
1. Data preprocessing pipeline
2. Label encoding functionality
3. Feature extraction capabilities


In [7]:
# Usage and testing
# Smoke test for the helper classes defined above. The guard is kept so the
# block also behaves when this code is exported and run as a script; the
# captured output below shows that it executed in this notebook.
if __name__ == "__main__": 
    # Test the base classes
    # Alias only -- no copy is made of the loaded frame here.
    df = cleaned_tweets_df
    
    # Test data processor
    processor = BaseDataProcessor()
    # Drops rows missing 'processed_tweet'/'sentiment' and strips label whitespace.
    df_clean = processor.preprocess_data(df)
    # Fits the label encoder on the cleaned labels and returns the encoded labels.
    y_encoded = processor.encode_labels(df_clean['sentiment'])
    label_mapping = processor.get_label_mapping()
    
    print("Data processing test:")
    print(f"Original shape: {df.shape}")
    print(f"Clean shape: {df_clean.shape}")
    print(f"Label mapping: {label_mapping}")
    print(f"Encoded labels: {y_encoded}")
    
    # Test feature extractor
    # max_features=100 keeps this check small.
    feature_extractor = FeatureExtractor(vectorizer_type='tfidf', max_features=100)
    # No X_test is passed, so the second return value is None and is discarded.
    # NOTE(review): the vectorizer is fitted on the whole cleaned dataset here;
    # fine for a smoke test, but confirm later cells fit on the training split only.
    X_train, _ = feature_extractor.fit_transform_features(df_clean['processed_tweet'])
    
    print(f"\nFeature extraction test:")
    print(f"Feature matrix shape: {X_train.shape}")
    print(f"Number of features: {len(feature_extractor.feature_names)}")
    
    print("\nBase classes setup complete!")

Data processing test:
Original shape: (9070, 5)
Clean shape: (9068, 5)
Label mapping: {0: "I can't tell", 1: 'Negative emotion', 2: 'No emotion toward brand or product', 3: 'Positive emotion'}
Encoded labels: [1 3 3 ... 2 2 2]

Feature extraction test:
Feature matrix shape: (9068, 100)
Number of features: 100

Base classes setup complete!


# Base Sentiment Classifier (ABC)

Abstract base class defining the interface and common functionality for all sentiment classification models.

## Class Overview

### Core Responsibilities
- Standardizes model initialization and training
- Provides common evaluation methods
- Implements hyperparameter tuning workflow
- Handles model serialization

### Key Features
- **Abstract Methods**: Requires concrete implementations for model-specific logic
- **State Tracking**: Maintains model fitting status
- **Hyperparameter Management**: Supports grid search tuning
- **Cross-Validation**: Built-in performance evaluation
- **Model Persistence**: Save/load functionality

## Abstract Methods

| Method | Description | Returns |
|--------|-------------|---------|
| `_initialize_model()` | Creates model instance | Model object |
| `get_hyperparameters()` | Defines parameter grid for tuning | Dictionary of parameter lists |

In [8]:
class BaseSentimentClassifier(ABC):
    """Common scaffolding for every sentiment classifier wrapper.

    Subclasses supply the estimator (``_initialize_model``) and its search
    space (``get_hyperparameters``); this class provides fitting, prediction,
    grid search, cross-validation and persistence on top of them.
    """

    def __init__(self, model_name: str):
        self.model_name = model_name
        # The estimator is created lazily, the first time it is needed.
        self.model = None
        self.is_fitted = False
        # Filled in by hyperparameter_tuning() / load_model().
        self.best_params = None
        # Filled in by cross_validate().
        self.cv_scores = None

    @abstractmethod
    def _initialize_model(self) -> Any:
        """Build and return a fresh, unfitted estimator."""

    @abstractmethod
    def get_hyperparameters(self) -> Dict[str, Any]:
        """Return the parameter grid handed to the grid search.

        The body below is a fallback grid that subclasses can reach through
        ``super()``; being abstract, the method must still be overridden.
        """
        fallback_grid = dict(
            C=[0.1, 1, 10],
            solver=['liblinear'],
            penalty=['l2'],
        )
        return fallback_grid

    def _ensure_model(self) -> Any:
        """Create the estimator on first use and return it."""
        if self.model is None:
            self.model = self._initialize_model()
        return self.model

    def _require_fitted(self, message: str) -> None:
        """Raise ``ValueError(message)`` unless the model has been fitted."""
        if not self.is_fitted:
            raise ValueError(message)

    def fit(self, X_train: np.ndarray, y_train: np.ndarray):
        """Train the estimator on the given data and return ``self``."""
        estimator = self._ensure_model()
        estimator.fit(X_train, y_train)
        self.is_fitted = True
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the estimator's predictions for ``X``.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        self._require_fitted("Model must be fitted before making predictions")
        return self.model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the estimator's ``predict_proba`` output for ``X``.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        self._require_fitted("Model must be fitted before making predictions")
        return self.model.predict_proba(X)

    def hyperparameter_tuning(self, X_train: np.ndarray, y_train: np.ndarray,
                            cv: int = 3, scoring: str = 'accuracy') -> Dict[str, Any]:
        """Grid-search the subclass's parameter grid and keep the best estimator.

        After the search, ``self.model`` is replaced by the best estimator and
        the wrapper is marked as fitted.

        Returns:
            Dict with ``best_params``, ``best_score`` and ``cv_results``.
        """
        search = GridSearchCV(
            self._ensure_model(),
            self.get_hyperparameters(),
            cv=cv,
            scoring=scoring,
            n_jobs=-1,
            verbose=1,
        )
        search.fit(X_train, y_train)

        self.model = search.best_estimator_
        self.best_params = search.best_params_
        self.is_fitted = True

        summary = {
            'best_params': self.best_params,
            'best_score': search.best_score_,
            'cv_results': search.cv_results_,
        }
        return summary

    def cross_validate(self, X: np.ndarray, y: np.ndarray, cv: int = 5) -> Dict[str, float]:
        """Score the estimator with ``cv``-fold cross-validated accuracy.

        The per-fold scores are also stored on ``self.cv_scores``.

        Returns:
            Dict with ``mean_score``, ``std_score`` and the raw ``scores``.
        """
        fold_scores = cross_val_score(self._ensure_model(), X, y, cv=cv, scoring='accuracy')
        self.cv_scores = fold_scores

        return {
            'mean_score': fold_scores.mean(),
            'std_score': fold_scores.std(),
            'scores': fold_scores,
        }

    def save_model(self, filepath: str):
        """Persist the estimator, its name and best parameters with joblib.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        self._require_fitted("Model must be fitted before saving")

        payload = {
            'model': self.model,
            'model_name': self.model_name,
            'best_params': self.best_params,
        }
        joblib.dump(payload, filepath)

    def load_model(self, filepath: str):
        """Restore state written by ``save_model`` and mark the model as fitted.

        NOTE(review): joblib files are pickle-based, so only load files from a
        trusted source.
        """
        payload = joblib.load(filepath)
        self.model = payload['model']
        self.model_name = payload['model_name']
        self.best_params = payload.get('best_params', None)
        self.is_fitted = True

# Logistic Regression Classifier

Concrete implementation of `BaseSentimentClassifier` for logistic regression-based sentiment analysis.

## Class Overview

### Inheritance
- Extends `BaseSentimentClassifier`
- Implements all required abstract methods
- Adds logistic regression-specific configuration

### Key Features
- Configurable random state for reproducibility
- Optimized for sentiment analysis (multi-class)
- Comprehensive hyperparameter grid for tuning
- Standard scikit-learn estimator interface

In [9]:
class LogisticRegressionClassifier(BaseSentimentClassifier):
    """Sentiment classifier wrapper around scikit-learn's LogisticRegression."""

    def __init__(self, random_state: int = 42):
        super().__init__("Logistic Regression")
        # Seed forwarded to the estimator when it is built.
        self.random_state = random_state

    def _initialize_model(self) -> LogisticRegression:
        """Build an unfitted one-vs-rest logistic regression (max 1000 iterations)."""
        settings = dict(
            random_state=self.random_state,
            max_iter=1000,
            multi_class='ovr',
        )
        return LogisticRegression(**settings)

    def get_hyperparameters(self) -> Dict[str, Any]:
        """Return the grid-search space: regularisation strength, solver and penalty.

        NOTE(review): the grid crosses every solver with every penalty;
        scikit-learn documents 'lbfgs' as supporting only l2/no penalty, so the
        lbfgs + l1 candidates presumably fail during the search -- confirm.
        """
        strengths = [0.1, 1, 10, 100]
        return dict(
            C=strengths,
            solver=['liblinear', 'lbfgs'],
            penalty=['l1', 'l2'],
        )


# Random Forest Sentiment Classifier

Implementation of `BaseSentimentClassifier` using Random Forest for sentiment analysis tasks.

## Class Overview

### Inheritance
- Extends `BaseSentimentClassifier` abstract class
- Implements all required abstract methods
- Configures Random Forest-specific parameters

### Key Features
- Parallelized training (`n_jobs=-1`)
- Comprehensive hyperparameter tuning grid
- Built-in feature importance analysis
- Handles class imbalance through weighting

In [10]:
class RandomForestSentimentClassifier(BaseSentimentClassifier):
    """Sentiment classifier wrapper around scikit-learn's RandomForestClassifier."""

    def __init__(self, random_state: int = 42):
        super().__init__("Random Forest")
        # Seed forwarded to the forest when it is built.
        self.random_state = random_state

    def _initialize_model(self) -> RandomForestClassifier:
        """Build an unfitted forest with balanced class weights and n_jobs=-1."""
        forest = RandomForestClassifier(
            n_jobs=-1,
            class_weight='balanced',
            random_state=self.random_state,
        )
        return forest

    def get_hyperparameters(self) -> Dict[str, Any]:
        """Return the grid-search space: forest size, depth and split/leaf minimums."""
        return dict(
            n_estimators=[50, 100, 200],
            max_depth=[10, 20, None],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 4],
        )

# Decision Tree Sentiment Classifier

Implementation of `BaseSentimentClassifier` using a Decision Tree algorithm for sentiment analysis.

## Class Overview

### Inheritance
- Extends `BaseSentimentClassifier` abstract class
- Implements all required abstract methods
- Configures Decision Tree-specific parameters

### Key Features
- Interpretable model structure
- Configurable splitting criteria
- Pruning controls to prevent overfitting
- Feature importance scores

In [11]:
class DecisionTreeSentimentClassifier(BaseSentimentClassifier):
    """Sentiment classifier wrapper around scikit-learn's DecisionTreeClassifier."""

    def __init__(self, random_state: int = 42):
        super().__init__("Decision Tree")
        # Seed forwarded to the tree when it is built.
        self.random_state = random_state

    def _initialize_model(self) -> DecisionTreeClassifier:
        """Build an unfitted tree; only the seed is set, other options stay default."""
        tree = DecisionTreeClassifier(random_state=self.random_state)
        return tree

    def get_hyperparameters(self) -> Dict[str, Any]:
        """Return the grid-search space: depth, split/leaf minimums and split criterion."""
        search_space = dict(
            max_depth=[5, 10, 15, 20, None],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 4],
            criterion=['gini', 'entropy'],
        )
        return search_space

# Gradient Boosting Sentiment Classifier

Implementation of `BaseSentimentClassifier` using Gradient Boosting Machines (GBM) for sentiment analysis.

## Class Overview

### Inheritance
- Extends `BaseSentimentClassifier` abstract class
- Implements all required abstract methods
- Configures GBM-specific parameters

### Key Features
- Sequential ensemble of decision trees
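- Gradient-based boosting: each new tree is fit to the errors (loss gradients) of the current ensemble, rather than re-weighting samples as AdaBoost does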
- Configurable learning rate
- Built-in feature importance

In [12]:
class GradientBoostingSentimentClassifier(BaseSentimentClassifier):
    """Sentiment classifier wrapper around scikit-learn's GradientBoostingClassifier."""

    def __init__(self, random_state: int = 42):
        super().__init__("Gradient Boosting")
        # Seed forwarded to the booster when it is built.
        self.random_state = random_state

    def _initialize_model(self) -> GradientBoostingClassifier:
        """Build an unfitted booster; only the seed is set, other options stay default."""
        booster = GradientBoostingClassifier(random_state=self.random_state)
        return booster

    def get_hyperparameters(self) -> Dict[str, Any]:
        """Return the grid-search space: rounds, learning rate, depth and split minimum."""
        search_space = dict(
            n_estimators=[50, 100, 200],
            learning_rate=[0.05, 0.1, 0.2],
            max_depth=[3, 5, 7],
            min_samples_split=[2, 5, 10],
        )
        return search_space

# XGBoost Sentiment Classifier

Implementation of `BaseSentimentClassifier` using XGBoost (Extreme Gradient Boosting) for sentiment analysis.

## Class Overview

### Inheritance
- Extends `BaseSentimentClassifier` abstract class
- Implements all required abstract methods
- Configures XGBoost-specific parameters

### Key Features
- Optimized gradient boosting implementation
- Built-in regularization to prevent overfitting
- Parallel processing support
- Advanced features for handling imbalanced data


In [13]:
class XGBoostSentimentClassifier(BaseSentimentClassifier):
    """Sentiment classifier wrapper around xgboost's XGBClassifier."""

    def __init__(self, random_state: int = 42):
        super().__init__("XGBoost")
        # Seed forwarded to the booster when it is built.
        self.random_state = random_state

    def _initialize_model(self) -> Any:
        """Build an unfitted XGBClassifier scored with multi-class log loss.

        xgboost is imported here rather than at the top of the notebook, so the
        package is only required once this model is actually instantiated.
        """
        from xgboost import XGBClassifier

        settings = dict(
            random_state=self.random_state,
            use_label_encoder=False,
            eval_metric='mlogloss',
        )
        return XGBClassifier(**settings)

    def get_hyperparameters(self) -> Dict[str, Any]:
        """Return the grid-search space: rounds, depth, learning rate and row subsampling."""
        return dict(
            n_estimators=[50, 100, 200],
            max_depth=[3, 5, 7],
            learning_rate=[0.01, 0.1, 0.2],
            subsample=[0.8, 1.0],
        )

# K-Nearest Neighbors Sentiment Classifier

Implementation of `BaseSentimentClassifier` using K-Nearest Neighbors algorithm for sentiment analysis.

## Class Overview

### Inheritance
- Extends `BaseSentimentClassifier` abstract class
- Implements all required abstract methods
- Configures KNN-specific parameters

### Key Features
- Instance-based learning (no explicit training)
- Flexible distance metrics
- Weighted voting options
- Naturally handles multi-class classification

In [14]:
class KNNSentimentClassifier(BaseSentimentClassifier):
    """Sentiment classifier wrapper around scikit-learn's KNeighborsClassifier."""

    def __init__(self):
        # No seed parameter: this wrapper takes no configuration of its own.
        super().__init__("K-Nearest Neighbors")

    def _initialize_model(self) -> KNeighborsClassifier:
        """Build an unfitted k-NN classifier with library defaults."""
        neighbours_model = KNeighborsClassifier()
        return neighbours_model

    def get_hyperparameters(self) -> Dict[str, Any]:
        """Return the grid-search space: neighbour count, vote weighting and distance metric."""
        odd_neighbour_counts = [3, 5, 7, 9, 11]
        return dict(
            n_neighbors=odd_neighbour_counts,
            weights=['uniform', 'distance'],
            metric=['euclidean', 'manhattan', 'cosine'],
        )

# Naive Bayes Sentiment Classifier

Implementation of `BaseSentimentClassifier` using Multinomial Naive Bayes for sentiment analysis, particularly effective for text classification tasks.

## Class Overview

### Inheritance
- Extends `BaseSentimentClassifier` abstract class
- Implements all required abstract methods
- Configures Naive Bayes-specific parameters

### Key Features
- Extremely fast training and prediction
- Naturally handles multi-class classification
- Works well with text frequency features
- Low memory footprint

In [15]:
class NaiveBayesClassifier(BaseSentimentClassifier):
    """Sentiment classifier wrapper around scikit-learn's MultinomialNB."""

    def __init__(self):
        # No seed parameter: this wrapper takes no configuration of its own.
        super().__init__("Naive Bayes")

    def _initialize_model(self) -> MultinomialNB:
        """Build an unfitted multinomial naive Bayes model with library defaults."""
        bayes_model = MultinomialNB()
        return bayes_model

    def get_hyperparameters(self) -> Dict[str, Any]:
        """Return the grid-search space: smoothing strength and whether to learn class priors."""
        return dict(
            alpha=[0.1, 0.5, 1.0, 2.0],
            fit_prior=[True, False],
        )

# Multi-layer Perceptron Sentiment Classifier

Implementation of `BaseSentimentClassifier` using a neural network (MLP) for sentiment analysis, capable of learning complex patterns in text data.

## Class Overview

### Inheritance
- Extends `BaseSentimentClassifier` abstract class
- Implements all required abstract methods
- Configures neural network-specific parameters

### Key Features
- Feedforward artificial neural network
- Automatic validation set for early stopping
- Multiple activation function options
- Adaptive learning rate capability

In [16]:
class MLPSentimentClassifier(BaseSentimentClassifier):
    """Sentiment classifier wrapper around scikit-learn's MLPClassifier."""

    def __init__(self, random_state: int = 42):
        super().__init__("Neural Network (MLP)")
        # Seed forwarded to the network when it is built.
        self.random_state = random_state

    def _initialize_model(self) -> MLPClassifier:
        """Build an unfitted MLP with early stopping on a 10% validation fraction."""
        settings = dict(
            random_state=self.random_state,
            max_iter=1000,
            early_stopping=True,
            validation_fraction=0.1,
        )
        return MLPClassifier(**settings)

    def get_hyperparameters(self) -> Dict[str, Any]:
        """Return the grid-search space: layer layout, activation, L2 term and LR schedule.

        NOTE(review): scikit-learn documents ``learning_rate`` as used only with
        solver='sgd'; the solver is not set here, so this axis may only double
        the search cost without changing results -- confirm.
        """
        layer_layouts = [(50,), (100,), (50, 50), (100, 50)]
        return dict(
            hidden_layer_sizes=layer_layouts,
            activation=['relu', 'tanh'],
            alpha=[0.0001, 0.001, 0.01],
            learning_rate=['constant', 'adaptive'],
        )

# AdaBoost Sentiment Classifier

Implementation of `BaseSentimentClassifier` using Adaptive Boosting (AdaBoost) for sentiment analysis, particularly effective for improving weak classifiers.

## Class Overview

### Inheritance
- Extends `BaseSentimentClassifier` abstract class
- Implements all required abstract methods
- Configures AdaBoost-specific parameters

### Key Features
- Sequential ensemble that corrects previous mistakes
- Can use various base estimators (default: Decision Stump)
- Handles both binary and multi-class classification
- Re-weights misclassified samples each round so later learners focus on hard examples (this does not by itself correct class imbalance)

In [17]:
class AdaBoostSentimentClassifier(BaseSentimentClassifier):
    """Sentiment classifier wrapper around scikit-learn's AdaBoostClassifier."""

    def __init__(self, random_state: int = 42):
        super().__init__("AdaBoost")
        # Seed forwarded to the ensemble when it is built.
        self.random_state = random_state

    def _initialize_model(self) -> AdaBoostClassifier:
        """Build an unfitted ensemble; only the seed is set, other options stay default."""
        ensemble = AdaBoostClassifier(random_state=self.random_state)
        return ensemble

    def get_hyperparameters(self) -> Dict[str, Any]:
        """Return the grid-search space: rounds, learning rate and boosting algorithm.

        NOTE(review): 'SAMME.R' is, as far as I know, deprecated in recent
        scikit-learn releases -- confirm against the installed version.
        """
        search_space = dict(
            n_estimators=[50, 100, 200],
            learning_rate=[0.5, 1.0, 1.5],
            algorithm=['SAMME', 'SAMME.R'],
        )
        return search_space

# Model Manager

Centralized management system for handling multiple sentiment classification models, enabling streamlined training, evaluation, and comparison.

## Class Overview

### Core Responsibilities
- Unified interface for multiple classifier types
- Batch training and evaluation
- Model persistence management
- Performance comparison

### Key Features
- Supports all implemented classifier types
- Optional hyperparameter tuning
- Cross-validation during training
- Model serialization

In [18]:
class ModelManager:
    """Registry that trains, evaluates, ranks and persists sentiment classifiers.

    Models are kept in ``self.models`` keyed by their ``model_name``; adding a
    model whose name is already registered replaces the earlier entry.
    """

    def __init__(self):
        # model_name -> classifier wrapper
        self.models = {}
        # Not written by any method of this class.
        self.results = {}

    def add_model(self, model: BaseSentimentClassifier):
        """Register ``model`` under its ``model_name``."""
        self.models[model.model_name] = model

    def add_all_models(self, random_state: int = 42):
        """Register one fresh instance of every available classifier wrapper.

        Args:
            random_state: Seed forwarded to the wrappers that accept one
                (KNN and Naive Bayes are constructed without it).
        """
        models = [
            LogisticRegressionClassifier(random_state),
            DecisionTreeSentimentClassifier(random_state),
            RandomForestSentimentClassifier(random_state),
            XGBoostSentimentClassifier(random_state),
            GradientBoostingSentimentClassifier(random_state),
            KNNSentimentClassifier(),
            MLPSentimentClassifier(random_state),
            NaiveBayesClassifier(),
            AdaBoostSentimentClassifier(random_state)
        ]

        for model in models:
            self.add_model(model)

    def train_all_models(self, X_train: np.ndarray, y_train: np.ndarray,
                        use_hyperparameter_tuning: bool = False):
        """Train every registered model and print a cross-validation score.

        Args:
            X_train: Training feature matrix.
            y_train: Training labels.
            use_hyperparameter_tuning: When True, call each model's
                ``hyperparameter_tuning`` and print its best parameters and
                score; otherwise fit the model and print the mean/std returned
                by its ``cross_validate``.
        """
        print("Training all models...")

        for name, model in self.models.items():
            print(f"\nTraining {name}...")

            if use_hyperparameter_tuning:
                # presumably leaves the wrapper fitted with the best estimator -
                # verify against BaseSentimentClassifier.hyperparameter_tuning
                result = model.hyperparameter_tuning(X_train, y_train)
                print(f"Best parameters for {name}: {result['best_params']}")
                print(f"Best CV score: {result['best_score']:.4f}")
            else:
                model.fit(X_train, y_train)
                cv_result = model.cross_validate(X_train, y_train)
                print(f"CV score for {name}: {cv_result['mean_score']:.4f} (+/- {cv_result['std_score']:.4f})")

    def evaluate_all_models(self, evaluator: ModelEvaluator, X_test: np.ndarray, y_test: np.ndarray):
        """Evaluate every fitted model with ``evaluator`` and print its accuracy.

        Models whose ``is_fitted`` flag is false are skipped silently.
        """
        print("\nEvaluating all models...")

        for name, model in self.models.items():
            if model.is_fitted:
                result = evaluator.evaluate_model(model, X_test, y_test, name)
                print(f"\n{name} - Test Accuracy: {result['accuracy']:.4f}")

    def get_best_model(self, evaluator: ModelEvaluator) -> Tuple[str, BaseSentimentClassifier]:
        """Return ``(name, model)`` for the highest-accuracy entry in ``evaluator.results``.

        Raises:
            ValueError: If the evaluator holds no results yet.
        """
        if not evaluator.results:
            raise ValueError("No models have been evaluated yet")

        best_model_name = max(evaluator.results.keys(),
                            key=lambda x: evaluator.results[x]['accuracy'])

        return best_model_name, self.models[best_model_name]

    def save_all_models(self, directory: str = "saved_models"):
        """Save every fitted model to ``directory`` via its ``save_model`` method.

        Files are named ``<model name lower-cased, spaces as underscores>_model.pkl``.
        The directory is created when missing.
        """
        # exist_ok avoids the check-then-create race of testing os.path.exists first.
        os.makedirs(directory, exist_ok=True)

        for name, model in self.models.items():
            if not model.is_fitted:
                continue
            filename = f"{name.replace(' ', '_').lower()}_model.pkl"
            filepath = os.path.join(directory, filename)
            model.save_model(filepath)
            print(f"Saved {name} to {filepath}")


if __name__ == "__main__":
    # Smoke check: register every classifier wrapper and list what is available.
    print("Classification models setup complete!")
    print("\nAvailable models:")

    manager = ModelManager()
    manager.add_all_models()

    # Iterating the dict yields the registered model names in insertion order.
    for model_name in manager.models:
        print(f"- {model_name}")

    print(f"\nTotal models: {len(manager.models)}")

Classification models setup complete!

Available models:
- Logistic Regression
- Decision Tree
- Random Forest
- XGBoost
- Gradient Boosting
- K-Nearest Neighbors
- Neural Network (MLP)
- Naive Bayes
- AdaBoost

Total models: 9


# Advanced Feature Engineering

## Class Overview

The `AdvancedFeatureEngineer` class provides sophisticated feature engineering techniques to enhance model performance by creating additional informative features from existing data.

### Key Capabilities
- Feature interactions (multiplicative combinations)
- Polynomial feature expansion
- Statistical feature generation
- Memory-efficient implementations


In [19]:
# Advanced feature engineering class
class AdvancedFeatureEngineer:
    """Derives additional numeric features from a dense 2-D feature matrix.

    Every method expects ``X`` with samples as rows and features as columns and
    returns a new matrix; the input array is never modified in place.
    """

    def __init__(self):
        # Not populated by any method of this class.
        self.feature_combinations = []

    def create_feature_interactions(self, X: np.ndarray, top_features: int = 100) -> np.ndarray:
        """Append pairwise products of the highest-variance columns to ``X``.

        Args:
            X: Feature matrix (rows are samples, columns are features).
            top_features: Number of highest-variance columns to combine. Values
                below 2 cannot form a pair, so ``X`` is returned unchanged.

        Returns:
            ``X`` followed by one column per unordered pair ``(i, j)``, ``i < j``,
            of the selected columns, ordered (0,1), (0,2), ..., (1,2), ...
            When no pair can be formed, ``X`` itself is returned.
        """
        # Guard first: slicing with [-0:] would select EVERY column rather than
        # none, which would silently generate all pairwise interactions.
        if top_features < 2 or X.shape[1] < 2:
            return X

        # Rank columns by variance and keep the `top_features` largest.
        feature_vars = np.var(X, axis=0)
        top_indices = np.argsort(feature_vars)[-top_features:]
        X_top = X[:, top_indices]
        n_top = X_top.shape[1]

        # For each column i, multiply it against all later columns in one
        # broadcasted operation instead of building one tiny array per pair.
        interaction_blocks = [
            X_top[:, i:i + 1] * X_top[:, i + 1:]
            for i in range(n_top - 1)
        ]

        return np.hstack([X, *interaction_blocks])

    def create_polynomial_features(self, X: np.ndarray, degree: int = 2,
                                 max_features: int = 1000) -> np.ndarray:
        """Expand ``X`` into polynomial features (without a bias column).

        Args:
            X: Feature matrix.
            degree: Polynomial degree passed to ``PolynomialFeatures``.
            max_features: Only the first ``max_features`` columns of ``X`` are
                expanded; later columns are dropped, whatever their content.

        Returns:
            The polynomial expansion of the (possibly truncated) matrix.

        NOTE(review): the output width grows combinatorially with the number of
        input columns, so the default cap of 1000 with degree 2 still yields on
        the order of half a million columns - lower ``max_features`` for large
        inputs.
        """
        from sklearn.preprocessing import PolynomialFeatures

        # Limit features to avoid memory issues
        if X.shape[1] > max_features:
            X = X[:, :max_features]

        poly = PolynomialFeatures(degree=degree, include_bias=False)
        return poly.fit_transform(X)

    def create_statistical_features(self, X: np.ndarray) -> np.ndarray:
        """Append five row-wise summary statistics to ``X``.

        Returns:
            ``X`` followed by five columns, in this order: mean, standard
            deviation, maximum, minimum and median of each row.
        """
        row_stats = np.column_stack([
            np.mean(X, axis=1),
            np.std(X, axis=1),
            np.max(X, axis=1),
            np.min(X, axis=1),
            np.median(X, axis=1),
        ])

        return np.hstack([X, row_stats])

In [20]:
# Usage
if __name__ == "__main__":
    # List the feature-engineering techniques offered by AdvancedFeatureEngineer.
    for banner_line in (
        "Advanced feature engineering available:",
        "- Feature interactions",
        "- Polynomial features",
        "- Statistical features",
    ):
        print(banner_line)

Advanced feature engineering available:
- Feature interactions
- Polynomial features
- Statistical features


# Tweet Sentiment Analysis Pipeline

## Overview

A complete, configurable pipeline for training and evaluating machine learning models on tweet sentiment data. The pipeline handles:

1. Data loading and preprocessing
2. Feature extraction and engineering
3. Model training and evaluation
4. Results analysis and visualization
5. Model persistence and prediction

## Key Components

### Core Modules
| Component | Description |
|-----------|-------------|
| `BaseDataProcessor` | Handles data cleaning and label encoding |
| `FeatureExtractor` | Transforms text into numerical features |
| `DataSplitter` | Splits data into train/validation/test sets |
| `ModelManager` | Manages multiple classifier models |
| `ModelEvaluator` | Evaluates and compares model performance |

### Supported Models
1. Logistic Regression
2. Decision Tree
3. Random Forest
4. XGBoost
5. Gradient Boosting
6. K-Nearest Neighbors
7. Multilayer Perceptron (Neural Network)
8. Naive Bayes
9. AdaBoost

## Pipeline Workflow

```mermaid
graph TD
    A[Raw Data] --> B[Preprocessing]
    B --> C[Feature Extraction]
    C --> D[Train/Test Split]
    D --> E[Model Training]
    E --> F[Evaluation]
    F --> G[Results Analysis]
    G --> H[Prediction]
```

In [21]:
class TweetSentimentPipeline:
    """End-to-end pipeline for tweet sentiment analysis.

    Typical call order: ``load_and_preprocess_data`` -> ``train_individual_models``
    -> ``analyze_results`` -> (``save_models`` / ``generate_report`` /
    ``predict_new_samples``). The first three return ``self`` so they can be chained.
    """

    def __init__(self, data_path: Optional[str] = None, data_df: Optional[pd.DataFrame] = None):
        """Create the pipeline and its helper components.

        Args:
            data_path: CSV file to read when ``data_df`` is not given.
            data_df: Already-loaded DataFrame; takes precedence over ``data_path``.
        """
        self.data_path = data_path
        self.data_df = data_df

        # Initialize components
        self.data_processor = BaseDataProcessor()
        self.feature_extractor = FeatureExtractor()
        self.data_splitter = DataSplitter()
        self.evaluator = ModelEvaluator()
        self.model_manager = ModelManager()
        self.feature_engineer = AdvancedFeatureEngineer()

        # Data storage
        self.df_clean = None
        self.X_train = None
        self.X_val = None
        self.X_test = None
        self.y_train = None
        self.y_val = None
        self.y_test = None
        self.label_mapping = None

        # Results
        self.results_summary = {}

    def load_and_preprocess_data(self, use_advanced_features: bool = False):
        """Load the data, build the feature matrix and split it.

        Args:
            use_advanced_features: When True, append pairwise interaction and
                row-statistic columns to the extracted features.

        Returns:
            self, for method chaining.

        Raises:
            ValueError: If neither ``data_df`` nor ``data_path`` was provided.

        NOTE(review): features are extracted (and, optionally, engineered) on
        the full dataset before splitting, so anything fitted in that step has
        seen the validation/test rows - confirm this is intended.
        """
        print("Loading and preprocessing data...")

        # Load data
        if self.data_df is not None:
            df = self.data_df.copy()
        elif self.data_path is not None:
            df = pd.read_csv(self.data_path)
        else:
            raise ValueError("No data source provided: pass either data_path or data_df")

        print(f"Original data shape: {df.shape}")
        print(f"Columns: {df.columns.tolist()}")

        # Basic preprocessing
        self.df_clean = self.data_processor.preprocess_data(df)
        print(f"Clean data shape: {self.df_clean.shape}")

        # Display sentiment distribution
        print("\nSentiment distribution:")
        print(self.df_clean['sentiment'].value_counts())

        # Encode labels
        y = self.data_processor.encode_labels(self.df_clean['sentiment'])
        self.label_mapping = self.data_processor.get_label_mapping()
        print(f"\nLabel mapping: {self.label_mapping}")

        # Extract features
        print("\nExtracting features...")
        X_train_base, X_test_base = self.feature_extractor.fit_transform_features(
            self.df_clean['processed_tweet']
        )

        # The extractor returns a pair; the second element is None when it was
        # given a single corpus, otherwise both parts are stacked back together.
        if X_test_base is None:
            X_features = X_train_base
        else:
            X_features = np.vstack([X_train_base, X_test_base])

        print(f"Feature matrix shape: {X_features.shape}")

        # Advanced feature engineering
        if use_advanced_features:
            print("Applying advanced feature engineering...")

            # Create feature interactions
            X_features = self.feature_engineer.create_feature_interactions(X_features)
            print(f"After interactions: {X_features.shape}")

            # Create statistical features
            X_features = self.feature_engineer.create_statistical_features(X_features)
            print(f"After statistical features: {X_features.shape}")

        # Split data
        print("\nSplitting data...")
        self.X_train, self.X_val, self.X_test, self.y_train, self.y_val, self.y_test = \
            self.data_splitter.split_data(X_features, y)

        print(f"Train set: {self.X_train.shape}")
        print(f"Validation set: {self.X_val.shape}")
        print(f"Test set: {self.X_test.shape}")

        return self

    def train_individual_models(self, use_hyperparameter_tuning: bool = False):
        """Register, train and evaluate every available model.

        Args:
            use_hyperparameter_tuning: Forwarded to ``ModelManager.train_all_models``.

        Returns:
            self, for method chaining.

        NOTE(review): models are scored on the test split and ``analyze_results``
        later picks the best model from those scores, so the reported best
        accuracy is optimistic; the validation split is not used here.
        """
        print("\n" + "="*50)
        print("TRAINING INDIVIDUAL MODELS")
        print("="*50)

        # Add all models to manager
        self.model_manager.add_all_models()

        # Train all models
        self.model_manager.train_all_models(
            self.X_train, self.y_train,
            use_hyperparameter_tuning=use_hyperparameter_tuning
        )

        # Evaluate all models
        self.model_manager.evaluate_all_models(
            self.evaluator, self.X_test, self.y_test
        )

        return self

    def analyze_results(self):
        """Print and plot the evaluation results and record the best model.

        Fills ``self.results_summary`` with the best model name, its accuracy,
        the comparison table and the label mapping.

        Returns:
            self, for method chaining.
        """
        print("\n" + "="*50)
        print("RESULTS ANALYSIS")
        print("="*50)

        # Get comparison of all models
        comparison = self.evaluator.compare_models()
        print("\nModel Performance Comparison:")
        print(comparison)

        # Plot comparison
        self.evaluator.plot_model_comparison()

        # Get best model
        best_model_name, best_model = self.model_manager.get_best_model(self.evaluator)
        best_accuracy = self.evaluator.results[best_model_name]['accuracy']
        print(f"\nBest Model: {best_model_name}")
        print(f"Best Accuracy: {best_accuracy:.4f}")

        # Detailed analysis of best model
        print(f"\nDetailed Classification Report for {best_model_name}:")
        print(self.evaluator.results[best_model_name]['classification_report'])

        # Plot confusion matrix for best model, labels ordered by encoded class id
        label_names = [self.label_mapping[i] for i in sorted(self.label_mapping.keys())]
        self.evaluator.plot_confusion_matrix(best_model_name, label_names)

        # Feature importance is best-effort: not every model exposes it, so a
        # failure is reported and the analysis continues.
        try:
            feature_importance = self.feature_extractor.get_feature_importance(best_model, top_n=15)
            print(f"\nTop 15 Features for {best_model_name}:")
            print(feature_importance)

            # Plot feature importance
            plt.figure(figsize=(10, 8))
            plt.barh(range(len(feature_importance)), feature_importance['importance'])
            plt.yticks(range(len(feature_importance)), feature_importance['feature'])
            plt.xlabel('Importance')
            plt.title(f'Top Features - {best_model_name}')
            plt.tight_layout()
            plt.show()

        except Exception as e:
            print(f"Could not analyze feature importance: {str(e)}")

        # Store results summary
        self.results_summary = {
            'best_model': best_model_name,
            'best_accuracy': best_accuracy,
            'all_results': comparison,
            'label_mapping': self.label_mapping
        }

        return self

    def predict_new_samples(self, texts: List[str], model_name: str = None):
        """Predict the sentiment of new raw texts.

        Args:
            texts: Texts to classify.
            model_name: Registered model to use; defaults to the best model
                recorded by ``analyze_results``.

        Returns:
            DataFrame with columns ``text``, ``predicted_sentiment`` and
            ``confidence`` (the highest class probability per row). An empty
            ``texts`` yields an empty DataFrame with those columns.

        Raises:
            ValueError: If ``model_name`` is not registered, or if the vectorized
                texts do not have the same number of features the models were
                trained on.

        NOTE(review): texts are only lower-cased and stripped here, while
        training used the ``processed_tweet`` column - confirm both go through
        the same text preprocessing.
        """
        if model_name is None:
            model_name = self.results_summary['best_model']

        # Get the model
        if model_name in self.model_manager.models:
            model = self.model_manager.models[model_name]
        else:
            raise ValueError(f"Model {model_name} not found")

        result_columns = ['text', 'predicted_sentiment', 'confidence']
        if len(texts) == 0:
            return pd.DataFrame(columns=result_columns)

        # Preprocess texts (basic preprocessing)
        processed_texts = [text.lower().strip() for text in texts]

        # Transform features
        X_new = self.feature_extractor.vectorizer.transform(processed_texts).toarray()

        # The engineered columns added by load_and_preprocess_data are not
        # recreated here, so fail early with an actionable message instead of
        # passing a matrix of the wrong width to the model.
        if self.X_train is not None and X_new.shape[1] != self.X_train.shape[1]:
            raise ValueError(
                f"Feature mismatch: new samples have {X_new.shape[1]} features but the "
                f"models were trained on {self.X_train.shape[1]}. This happens when the "
                "pipeline was trained with use_advanced_features=True, because the "
                "engineered features are not recreated at prediction time."
            )

        # Make predictions
        predictions = model.predict(X_new)
        probabilities = model.predict_proba(X_new)

        # Convert predictions to labels
        predicted_labels = [self.label_mapping[pred] for pred in predictions]

        # Create results dataframe
        results_df = pd.DataFrame({
            'text': texts,
            'predicted_sentiment': predicted_labels,
            'confidence': np.max(probabilities, axis=1)
        })

        return results_df

    def save_models(self, directory: str = "saved_models"):
        """Save all fitted models plus the feature extractor and data processor.

        Args:
            directory: Target directory, created by the model manager if needed.
        """
        print(f"\nSaving models to {directory}...")
        self.model_manager.save_all_models(directory)

        # Persist the fitted preprocessing objects next to the models
        joblib.dump(self.feature_extractor, os.path.join(directory, "feature_extractor.pkl"))
        joblib.dump(self.data_processor, os.path.join(directory, "data_processor.pkl"))

        print("All models saved successfully!")

    def generate_report(self):
        """Print a text report: dataset info, class balance, scores, recommendation.

        Requires ``load_and_preprocess_data`` and ``analyze_results`` to have run,
        since it reads ``df_clean``, ``X_train`` and ``results_summary``.
        """
        print("\n" + "="*60)
        print("COMPREHENSIVE SENTIMENT ANALYSIS REPORT")
        print("="*60)

        total_samples = len(self.df_clean)
        print("\nDataset Information:")
        print(f"- Total samples: {total_samples}")
        print(f"- Features: {self.X_train.shape[1]}")
        print(f"- Classes: {len(self.label_mapping)}")

        print("\nClass Distribution:")
        for label, count in self.df_clean['sentiment'].value_counts().items():
            print(f"- {label}: {count} ({count/total_samples*100:.1f}%)")

        print("\nModel Performance Summary:")
        comparison = self.evaluator.compare_models()
        for _, row in comparison.iterrows():
            print(f"- {row['Model']}: {row['Accuracy']:.4f}")

        best_accuracy = self.results_summary['best_accuracy']
        print(f"\nBest Model: {self.results_summary['best_model']}")
        print(f"Best Accuracy: {best_accuracy:.4f}")

        print("\nRecommendations:")
        if best_accuracy > 0.85:
            print("- Excellent performance! Model is ready for production.")
        elif best_accuracy > 0.75:
            print("- Good performance. Consider more data or feature engineering.")
        else:
            print("- Performance needs improvement. Consider:")
            print("  * More training data")
            print("  * Better feature engineering")
            print("  * Different preprocessing approaches")
            print("  * Advanced deep learning models")

def run_complete_pipeline(data_df: pd.DataFrame, use_advanced_features: bool = False,
                         use_hyperparameter_tuning: bool = False):
    """Build a ``TweetSentimentPipeline`` and run every stage in order.

    Args:
        data_df: Input DataFrame handed to the pipeline.
        use_advanced_features: Forwarded to ``load_and_preprocess_data``.
        use_hyperparameter_tuning: Forwarded to ``train_individual_models``.

    Returns:
        The pipeline instance after all stages have run.
    """
    pipeline = TweetSentimentPipeline(data_df=data_df)

    # Stages 1-3 (preprocess, train, analyze) each return the pipeline itself,
    # so they are chained in execution order.
    (
        pipeline
        .load_and_preprocess_data(use_advanced_features=use_advanced_features)
        .train_individual_models(use_hyperparameter_tuning=use_hyperparameter_tuning)
        .analyze_results()
    )

    # Stages 4-5: persist the trained models, then print the summary report.
    pipeline.save_models()
    pipeline.generate_report()

    return pipeline

In [22]:
# Run the minimal pipeline
# sample_df = cleaned_tweets_df.sample(frac=0.3, random_state=42)
# pipeline = run_minimal_pipeline(data_df=sample_df)

In [None]:
# Complete Sentiment Analysis Pipeline Execution

# Run every stage on the full cleaned dataset: preprocessing, advanced feature
# engineering, grid-search training of all registered models, result analysis,
# model saving and report generation.
# NOTE: with both flags enabled this is a long-running cell - the captured
# output below stops during the first grid search (48 fits on 5955 features).
pipeline = run_complete_pipeline(data_df=cleaned_tweets_df,
                                 use_advanced_features=True,
                                 use_hyperparameter_tuning=True)

Loading and preprocessing data...
Original data shape: (9070, 5)
Columns: ['tweet', 'product', 'tokens', 'processed_tweet', 'sentiment']
Clean data shape: (9068, 5)

Sentiment distribution:
No emotion toward brand or product    5373
Positive emotion                      2970
Negative emotion                       569
I can't tell                           156
Name: sentiment, dtype: int64

Label mapping: {0: "I can't tell", 1: 'Negative emotion', 2: 'No emotion toward brand or product', 3: 'Positive emotion'}

Extracting features...
Feature matrix shape: (9068, 1000)
Applying advanced feature engineering...
After interactions: (9068, 5950)
After statistical features: (9068, 5955)

Splitting data...
Train set: (5440, 5955)
Validation set: (1814, 5955)
Test set: (1814, 5955)

TRAINING INDIVIDUAL MODELS
Training all models...

Training Logistic Regression...
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


In [None]:
import os
import joblib
from sklearn.base import BaseEstimator
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score

In [None]:
class ModelEvaluatorF1:
    """
    Model evaluator centred on F1 scores for sentiment analysis.

    Features:
    - Macro-averaged F1 as the default ranking metric
    - Per-class metrics via the classification report
    - Confusion-matrix and metric-comparison plots
    - Model comparison and best-model lookup

    Results are stored in ``self.results`` as ``{model_name: metrics_dict}``.
    """

    # Metric key stored in ``results`` -> column title used in comparison tables.
    # compare_models builds columns with the titles, so lookups by metric key
    # must be translated through this mapping.
    _METRIC_COLUMNS = {
        'f1_macro': 'F1 Macro',
        'f1_weighted': 'F1 Weighted',
        'accuracy': 'Accuracy',
    }

    def __init__(self):
        """Initialize with empty results storage"""
        self.results = {}

    def evaluate_model(self, model, X_test: np.ndarray, y_test: np.ndarray,
                      model_name: str) -> Dict[str, Any]:
        """
        Evaluate a trained model and store its metrics under ``model_name``.

        Args:
            model: Trained classifier exposing ``predict``
            X_test: Test features
            y_test: True labels
            model_name: Identifier for the model; an existing entry with the
                same name is overwritten

        Returns:
            Dictionary containing:
            - f1_macro: Macro-averaged F1 score
            - f1_weighted: Weighted F1 score
            - accuracy: Overall accuracy
            - class_report: Full classification report (as a dict)
            - cm: Confusion matrix
            - predictions: Model predictions
        """
        y_pred = model.predict(X_test)

        results = {
            'f1_macro': f1_score(y_test, y_pred, average='macro'),
            'f1_weighted': f1_score(y_test, y_pred, average='weighted'),
            'accuracy': accuracy_score(y_test, y_pred),
            'class_report': classification_report(y_test, y_pred, output_dict=True),
            'cm': confusion_matrix(y_test, y_pred),
            'predictions': y_pred
        }

        self.results[model_name] = results
        return results

    def plot_confusion_matrix(self, model_name: str, class_names: List[str] = None,
                            figsize: Tuple[int, int] = (8, 6)):
        """
        Plot the stored confusion matrix of ``model_name`` as a heatmap.

        Args:
            model_name: Name of model to visualize
            class_names: Class names for the tick labels; when None, seaborn's
                automatic tick labels are used
            figsize: Figure dimensions

        Raises:
            ValueError: If ``model_name`` has not been evaluated
        """
        if model_name not in self.results:
            raise ValueError(f"Model {model_name} not found in results")

        cm = self.results[model_name]['cm']

        # The default class_names=None must not reach seaborn or len():
        # both fail on None, so fall back to automatic labels.
        tick_labels = class_names if class_names is not None else "auto"
        many_classes = class_names is not None and len(class_names) > 3

        plt.figure(figsize=figsize)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=tick_labels,
                   yticklabels=tick_labels,
                   cbar=False)
        plt.title(f'Confusion Matrix - {model_name}', pad=20)
        plt.xlabel('Predicted Label', labelpad=15)
        plt.ylabel('True Label', labelpad=15)
        # Tilt the x labels only when there are enough classes to collide
        plt.xticks(rotation=45 if many_classes else 0)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.show()

    def compare_models(self, metric: str = 'f1_macro') -> pd.DataFrame:
        """
        Build a comparison table of all evaluated models.

        Args:
            metric: Metric to sort by ('f1_macro', 'f1_weighted', 'accuracy')

        Returns:
            DataFrame with columns 'Model', 'F1 Macro', 'F1 Weighted' and
            'Accuracy', sorted by the chosen metric in descending order.
            Empty (but with those columns) when nothing has been evaluated.

        Raises:
            ValueError: If ``metric`` is not one of the supported metrics
        """
        if metric not in self._METRIC_COLUMNS:
            raise ValueError(f"Metric must be one of {list(self._METRIC_COLUMNS)}")

        rows = [
            {
                'Model': name,
                'F1 Macro': results['f1_macro'],
                'F1 Weighted': results['f1_weighted'],
                'Accuracy': results['accuracy']
            }
            for name, results in self.results.items()
        ]
        # Explicit columns keep the frame sortable even when `rows` is empty.
        comparison = pd.DataFrame(
            rows, columns=['Model', 'F1 Macro', 'F1 Weighted', 'Accuracy']
        )

        # Sort by the column title that corresponds to the metric key.
        return comparison.sort_values(self._METRIC_COLUMNS[metric], ascending=False)

    def plot_metric_comparison(self, metric: str = 'f1_macro',
                             figsize: Tuple[int, int] = (10, 6)):
        """
        Bar chart comparing all evaluated models on one metric.

        Args:
            metric: Metric to visualize ('f1_macro', 'f1_weighted', 'accuracy')
            figsize: Figure dimensions
        """
        comparison = self.compare_models(metric)
        metric_column = self._METRIC_COLUMNS[metric]
        metric_title = metric.replace("_", " ").title()

        plt.figure(figsize=figsize)
        bars = plt.bar(comparison['Model'], comparison[metric_column],
                      color=plt.cm.Blues(np.linspace(0.4, 1, len(comparison))))

        plt.title(f'Model Comparison - {metric_title}', pad=20)
        plt.xlabel('Model', labelpad=15)
        plt.ylabel(metric_title, labelpad=15)
        plt.xticks(rotation=45)
        plt.grid(axis='y', alpha=0.3)

        # Add value labels on top of each bar
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.3f}',
                    ha='center', va='bottom')

        plt.tight_layout()
        plt.show()

    def get_best_model(self, metric: str = 'f1_macro') -> Tuple[str, Dict]:
        """
        Identify the best performing model for the given metric.

        Args:
            metric: Key of the stored metric to maximize

        Returns:
            Tuple of (model_name, results_dict)

        Raises:
            ValueError: If no model has been evaluated yet
        """
        if not self.results:
            raise ValueError("No models have been evaluated yet")

        return max(self.results.items(), key=lambda item: item[1][metric])

    def generate_report(self, model_name: str = None) -> Dict:
        """
        Generate an evaluation report.

        Args:
            model_name: Specific model to report on (None for all)

        Returns:
            For a specific model: ``{model_name: results_dict}``.
            Otherwise a dictionary containing:
            - best_model: name, f1_macro and accuracy of the best model
            - comparison: All models comparison as a list of records
            - all_results: The full stored results

        Raises:
            ValueError: If ``model_name`` is given but unknown, or if no model
                has been evaluated yet
        """
        if model_name:
            if model_name not in self.results:
                raise ValueError(f"Model {model_name} not found")
            return {model_name: self.results[model_name]}

        best_name, best_results = self.get_best_model()
        return {
            'best_model': {
                'name': best_name,
                'f1_macro': best_results['f1_macro'],
                'accuracy': best_results['accuracy']
            },
            'comparison': self.compare_models().to_dict('records'),
            'all_results': self.results
        }

In [None]:
class ModelManagerF1:
    """
    Model manager focused on F1 score optimization for sentiment analysis.

    Features:
    - F1-centric model training and evaluation
    - Per-model training results kept in ``training_results``
    - Hyperparameter tuning with a configurable scoring metric
    - Model persistence
    """

    def __init__(self):
        """Initialize model and results storage"""
        # model_name -> model object
        self.models: Dict[str, BaseEstimator] = {}
        # model_name -> tuning result or cross-validation summary
        self.training_results: Dict[str, Any] = {}

    def add_model(self, model: BaseEstimator) -> None:
        """
        Add a model to the manager, keyed by its ``model_name``.

        Args:
            model: Initialized model object carrying a ``model_name`` attribute

        Raises:
            ValueError: If the model has no ``model_name`` attribute
        """
        if not hasattr(model, 'model_name'):
            raise ValueError("Model must have 'model_name' attribute")
        self.models[model.model_name] = model

    def add_all_models(self, random_state: int = 42) -> None:
        """
        Register Naive Bayes, Logistic Regression and Random Forest wrappers,
        the latter two configured with balanced class weights.

        Args:
            random_state: Random seed for reproducibility

        NOTE(review): assumes the Logistic Regression and Random Forest wrappers
        accept a ``class_weight`` keyword - confirm against their constructors.
        """
        models = [
            NaiveBayesClassifier(),
            LogisticRegressionClassifier(random_state, class_weight='balanced'),
            RandomForestSentimentClassifier(random_state, class_weight='balanced_subsample')
        ]

        for model in models:
            self.add_model(model)

    def train_all_models(self,
                        X_train: np.ndarray,
                        y_train: np.ndarray,
                        use_hyperparameter_tuning: bool = False,
                        scoring: str = 'f1_macro',
                        cv: int = 5) -> None:
        """
        Train all registered models and record their training results.

        Args:
            X_train: Training features
            y_train: Training labels
            use_hyperparameter_tuning: Whether to perform grid search
            scoring: Metric to optimize (default: 'f1_macro')
            cv: Cross-validation folds

        NOTE(review): assumes each model's ``hyperparameter_tuning`` accepts
        ``scoring``/``cv`` keywords and that the model objects are compatible
        with ``cross_val_score`` - confirm against the wrapper base class.
        """
        print(f"\nTraining models with {scoring} optimization...")

        for name, model in self.models.items():
            print(f"\n=== Training {name} ===")

            if use_hyperparameter_tuning:
                result = model.hyperparameter_tuning(
                    X_train, y_train,
                    scoring=scoring,
                    cv=cv
                )
                self.training_results[name] = result
                print(f"Best params: {result['best_params']}")
                print(f"Best {scoring}: {result['best_score']:.4f}")
            else:
                model.fit(X_train, y_train)
                cv_scores = cross_val_score(
                    model, X_train, y_train,
                    scoring=scoring,
                    cv=cv
                )
                mean_score = np.mean(cv_scores)
                std_score = np.std(cv_scores)
                self.training_results[name] = {
                    'mean_score': mean_score,
                    'std_score': std_score
                }
                print(f"CV {scoring}: {mean_score:.4f} (±{std_score:.4f})")

    def evaluate_all_models(self,
                          evaluator: ModelEvaluatorF1,
                          X_test: np.ndarray,
                          y_test: np.ndarray,
                          metric: str = 'f1_macro') -> None:
        """
        Evaluate all fitted models using the F1-focused evaluator.

        Models without a truthy ``is_fitted`` attribute are skipped.

        Args:
            evaluator: Initialized ModelEvaluatorF1 instance
            X_test: Test features
            y_test: Test labels
            metric: Primary metric to print alongside accuracy
        """
        print(f"\nEvaluating models on {metric}...")

        for name, model in self.models.items():
            if hasattr(model, 'is_fitted') and model.is_fitted:
                result = evaluator.evaluate_model(model, X_test, y_test, name)
                print(f"{name}:")
                print(f"- {metric}: {result[metric]:.4f}")
                print(f"- Accuracy: {result['accuracy']:.4f}")

    def get_best_model(self,
                      evaluator: ModelEvaluatorF1,
                      metric: str = 'f1_macro'):
        """
        Get the best performing model based on specified metric.

        Args:
            evaluator: ModelEvaluatorF1 with evaluation results
            metric: Metric to optimize for

        Returns:
            Tuple of (model_name, model_instance)

        Raises:
            ValueError: If the evaluator holds no results yet
        """
        # Fail with a clear message instead of max()'s "empty sequence" error.
        if not evaluator.results:
            raise ValueError("No models have been evaluated yet")

        best_name = max(evaluator.results.items(),
                       key=lambda x: x[1][metric])[0]
        return best_name, self.models[best_name]

    def save_all_models(self,
                       directory: str = "saved_models",
                       save_format: str = 'joblib') -> None:
        """
        Save all fitted models to disk.

        Args:
            directory: Output directory path (created if missing)
            save_format: 'joblib' uses joblib; any other value is written with
                pickle. The value is also used verbatim as the file extension.
        """
        os.makedirs(directory, exist_ok=True)

        for name, model in self.models.items():
            if not (hasattr(model, 'is_fitted') and model.is_fitted):
                continue

            filename = f"{name.lower().replace(' ', '_')}.{save_format}"
            filepath = os.path.join(directory, filename)

            if save_format == 'joblib':
                joblib.dump(model, filepath)
            else:
                # pickle is not imported at file level, so import it on demand
                import pickle
                with open(filepath, 'wb') as f:
                    pickle.dump(model, f)

            print(f"Saved {name} to {filepath}")

In [None]:
class TweetSentimentPipelineF1:
    """
    Complete F1-optimized pipeline for tweet sentiment analysis.

    Features:
    - F1 score as primary optimization metric
    - Comprehensive model training and evaluation
    - Advanced feature engineering
    - Model persistence
    - Detailed reporting

    Intended call order:
        load_and_preprocess_data() -> train_individual_models()
        -> analyze_results() -> generate_report() / predict_new_samples()
    The first three methods return ``self`` so they can be chained.
    """

    def __init__(self, data_path: Optional[str] = None, data_df: Optional[pd.DataFrame] = None):
        """
        Initialize pipeline components.

        Args:
            data_path: Path to raw data file (read with ``pd.read_csv``)
            data_df: Preloaded DataFrame (alternative to data_path; takes
                precedence when both are given)
        """
        self.data_path = data_path
        self.data_df = data_df

        # Initialize components
        self.data_processor = BaseDataProcessor()
        self.feature_extractor = FeatureExtractor()
        self.data_splitter = DataSplitter()
        self.evaluator = ModelEvaluatorF1()
        self.model_manager = ModelManagerF1()
        self.feature_engineer = AdvancedFeatureEngineer()

        # Configuration
        self.scoring_metric = 'f1_macro'  # Primary optimization metric

        # Data storage (populated by load_and_preprocess_data)
        self.df_clean = None
        self.X_train = None
        self.X_val = None
        self.X_test = None
        self.y_train = None
        self.y_val = None
        self.y_test = None
        self.label_mapping = None

        # Results (populated by analyze_results); an empty dict means
        # "no analysis has been run yet".
        self.results_summary = {}

    def load_and_preprocess_data(self, use_advanced_features: bool = False) -> 'TweetSentimentPipelineF1':
        """
        Load and preprocess the data with optional advanced features.

        Args:
            use_advanced_features: Whether to generate additional features

        Returns:
            self for method chaining

        Raises:
            ValueError: If neither ``data_df`` nor ``data_path`` was supplied.
        """
        print("Loading and preprocessing data...")

        # Load data: a preloaded frame wins over a path and is copied so the
        # preprocessing below works on its own object.
        if self.data_df is not None:
            df = self.data_df.copy()
        elif self.data_path is not None:
            df = pd.read_csv(self.data_path)
        else:
            raise ValueError("No data source - provide data_path or data_df")
        print(f"Original data shape: {df.shape}")

        # Basic preprocessing
        self.df_clean = self.data_processor.preprocess_data(df)
        print(f"Clean data shape: {self.df_clean.shape}")
        print("\nSentiment distribution:")
        print(self.df_clean['sentiment'].value_counts())

        # Encode labels
        y = self.data_processor.encode_labels(self.df_clean['sentiment'])
        self.label_mapping = self.data_processor.get_label_mapping()
        print(f"\nLabel mapping: {self.label_mapping}")

        # Extract features
        # NOTE(review): the vectorizer is fitted here on the full corpus,
        # before the train/val/test split below - confirm this is intended,
        # since held-out text then influences the fitted vocabulary/weights.
        print("\nExtracting features...")
        X_train_base, X_test_base = self.feature_extractor.fit_transform_features(
            self.df_clean['processed_tweet']
        )
        # NOTE(review): np.vstack assumes dense arrays; if the extractor can
        # return scipy sparse matrices with a non-None second part, this
        # needs scipy.sparse.vstack - verify against FeatureExtractor.
        X_features = X_train_base if X_test_base is None else np.vstack([X_train_base, X_test_base])
        print(f"Feature matrix shape: {X_features.shape}")

        # Advanced feature engineering
        if use_advanced_features:
            print("Applying advanced feature engineering...")
            X_features = self.feature_engineer.create_feature_interactions(X_features)
            X_features = self.feature_engineer.create_statistical_features(X_features)
            print(f"Final feature shape: {X_features.shape}")

        # Split data
        print("\nSplitting data...")
        (self.X_train, self.X_val, self.X_test,
         self.y_train, self.y_val, self.y_test) = self.data_splitter.split_data(X_features, y)

        print(f"Train: {self.X_train.shape}, Val: {self.X_val.shape}, Test: {self.X_test.shape}")
        return self

    def train_individual_models(self, use_hyperparameter_tuning: bool = False) -> 'TweetSentimentPipelineF1':
        """
        Train all models with F1 optimization and evaluate them.

        Args:
            use_hyperparameter_tuning: Whether to perform grid search

        Returns:
            self for method chaining

        Raises:
            ValueError: If the data has not been loaded and split yet.
        """
        if self.X_train is None or self.y_train is None:
            raise ValueError("Data not loaded - run load_and_preprocess_data first")

        print("\n" + "="*50)
        print("TRAINING MODELS (F1-OPTIMIZED)")
        print("="*50)

        self.model_manager.add_all_models()
        self.model_manager.train_all_models(
            self.X_train, self.y_train,
            use_hyperparameter_tuning=use_hyperparameter_tuning,
            scoring=self.scoring_metric
        )

        # NOTE(review): models are scored on the test split and the best one
        # is later picked from those scores, while X_val / y_val are never
        # used in this class - consider selecting on the validation split.
        self.model_manager.evaluate_all_models(
            self.evaluator, self.X_test, self.y_test
        )
        return self

    def analyze_results(self) -> 'TweetSentimentPipelineF1':
        """
        Analyze and visualize results with F1 focus.

        Prints the model comparison, plots the metric comparison and the
        best model's confusion matrix, attempts a feature-importance plot,
        and stores a summary in ``self.results_summary``.

        Returns:
            self for method chaining
        """
        print("\n" + "="*50)
        print("F1-CENTRIC RESULTS ANALYSIS")
        print("="*50)

        # F1-based comparison
        comparison = self.evaluator.compare_models(metric='f1_macro')
        print("\nModel Performance Comparison (F1 Scores):")
        print(comparison)

        # Visualization
        self.evaluator.plot_metric_comparison(metric='f1_macro')

        # Best model analysis
        best_name, best_model = self.model_manager.get_best_model(
            self.evaluator, metric='f1_macro'
        )
        best_result = self.evaluator.results[best_name]
        print(f"\nBest Model (F1): {best_name}")
        print(f"Best F1 Score: {best_result['f1_macro']:.4f}")

        # Detailed metrics
        print(f"\nClassification Report for {best_name}:")
        print(best_result['classification_report'])

        # Confusion matrix (labels ordered by their encoded key)
        label_names = [self.label_mapping[i] for i in sorted(self.label_mapping.keys())]
        self.evaluator.plot_confusion_matrix(best_name, label_names)

        # Feature importance is best-effort: any failure is reported and the
        # rest of the analysis still completes.
        try:
            feat_imp = self.feature_extractor.get_feature_importance(best_model, top_n=15)
            print(f"\nTop 15 Features for {best_name}:")
            print(feat_imp)

            plt.figure(figsize=(10, 6))
            sns.barplot(x='importance', y='feature', data=feat_imp)
            plt.title(f'Feature Importance - {best_name}')
            plt.tight_layout()
            plt.show()
        except Exception as e:
            print(f"\nFeature importance unavailable: {str(e)}")

        # Store summary
        self.results_summary = {
            'best_model': best_name,
            'best_f1': best_result['f1_macro'],
            'comparison': comparison,
            'label_mapping': self.label_mapping
        }
        return self

    def predict_new_samples(self, texts: List[str], model_name: str = None) -> pd.DataFrame:
        """
        Predict sentiment for new text samples.

        Args:
            texts: List of raw text strings (a single string is treated as a
                one-element list)
            model_name: Specific model to use (None for best model)

        Returns:
            DataFrame with columns ``text``, ``predicted_sentiment`` and
            ``confidence``. ``confidence`` is the highest class probability,
            or NaN when the model exposes no ``predict_proba``.

        Raises:
            ValueError: If the pipeline has not been trained/analyzed yet, or
                if ``model_name`` is not a registered model.
        """
        if isinstance(texts, str):
            # Iterating a bare string would score it character by character.
            texts = [texts]

        if not model_name:
            # results_summary always exists (set to {} in __init__), so test
            # for content rather than for the attribute's presence.
            if not self.results_summary:
                raise ValueError("Pipeline not trained - run training first")
            model_name = self.results_summary['best_model']

        if self.label_mapping is None:
            raise ValueError("Pipeline not trained - run training first")

        model = self.model_manager.models.get(model_name)
        if model is None:
            raise ValueError(f"Model {model_name} not found")

        # Preprocess and predict
        # NOTE(review): training used the 'processed_tweet' column and,
        # optionally, the feature_engineer transforms; only lower/strip plus
        # the vectorizer are applied here - confirm inputs and feature shapes
        # match what the models were fitted on.
        processed_texts = [text.lower().strip() for text in texts]
        X_new = self.feature_extractor.vectorizer.transform(processed_texts)
        predictions = model.predict(X_new)

        predict_proba = getattr(model, 'predict_proba', None)
        if predict_proba is not None:
            confidence = np.max(predict_proba(X_new), axis=1)
        else:
            # Models without probability estimates still yield predictions.
            confidence = np.full(len(processed_texts), np.nan)

        return pd.DataFrame({
            'text': texts,
            'predicted_sentiment': [self.label_mapping[p] for p in predictions],
            'confidence': confidence
        })

    def save_models(self, directory: str = "saved_models") -> None:
        """
        Save all trained models and components to disk.

        Args:
            directory: Output directory; created if it does not exist. Models
                are written by the model manager; the feature extractor and
                data processor are dumped alongside them with joblib.
        """
        os.makedirs(directory, exist_ok=True)
        print(f"\nSaving models to {directory}...")

        self.model_manager.save_all_models(directory)
        joblib.dump(self.feature_extractor, os.path.join(directory, "feature_extractor.pkl"))
        joblib.dump(self.data_processor, os.path.join(directory, "data_processor.pkl"))

        print("All components saved successfully!")

    def generate_report(self) -> Dict[str, Any]:
        """
        Generate comprehensive performance report.

        Returns:
            Dict with dataset info, best model name and F1, the model
            comparison table (as a dict) and a list of recommendations.

        Raises:
            ValueError: If analyze_results has not populated the summary yet.
        """
        if not self.results_summary:
            raise ValueError("No results available - run analyze_results first")

        report = {
            'dataset_info': {
                'samples': len(self.df_clean),
                'features': self.X_train.shape[1],
                'classes': len(self.label_mapping),
                'class_distribution': self.df_clean['sentiment'].value_counts().to_dict()
            },
            'best_model': self.results_summary['best_model'],
            'best_f1': self.results_summary['best_f1'],
            'model_comparison': self.evaluator.compare_models(metric='f1_macro').to_dict(),
            'recommendations': self._generate_recommendations()
        }

        print("\n" + "="*60)
        print("COMPREHENSIVE PERFORMANCE REPORT")
        print("="*60)
        print(f"\nBest Model: {report['best_model']}")
        print(f"Best F1 Score: {report['best_f1']:.4f}")
        print("\nRecommendations:")
        for rec in report['recommendations']:
            print(f"- {rec}")

        return report

    def _generate_recommendations(self) -> List[str]:
        """
        Generate actionable recommendations based on performance.

        Returns:
            A list of recommendation strings chosen by the best F1 score:
            above 0.85, above 0.75, or anything lower.
        """
        # Named best_f1 (not f1_score) to avoid shadowing sklearn's f1_score.
        best_f1 = self.results_summary['best_f1']

        if best_f1 > 0.85:
            return [
                "Excellent performance - ready for production",
                "Consider monitoring for concept drift"
            ]
        elif best_f1 > 0.75:
            return [
                "Good performance - suitable for deployment",
                "Try additional feature engineering",
                "Consider collecting more diverse training data"
            ]
        else:
            return [
                "Performance needs improvement",
                "Investigate class imbalance issues",
                "Try different model architectures",
                "Review data quality and preprocessing",
                "Consider ensemble methods"
            ]


def run_complete_pipeline_f1(
    data_df: pd.DataFrame,
    use_advanced_features: bool = False,
    use_hyperparameter_tuning: bool = False,
    save_models: bool = True
) -> TweetSentimentPipelineF1:
    """
    Run the F1-optimized sentiment pipeline end to end.

    Stages, in order: data preparation, model training, results analysis,
    optional model persistence, and the final report. Any exception is
    announced on stdout and then re-raised to the caller.

    Args:
        data_df: Input DataFrame containing tweets
        use_advanced_features: Whether to generate additional features
        use_hyperparameter_tuning: Whether to perform grid search
        save_models: Whether to persist trained models

    Returns:
        The pipeline instance, holding the data splits and results
    """
    pipeline = TweetSentimentPipelineF1(data_df=data_df)

    try:
        # Prepare -> train -> analyze; each stage hands back the pipeline
        # itself, so the three calls are chained.
        (
            pipeline
            .load_and_preprocess_data(use_advanced_features=use_advanced_features)
            .train_individual_models(use_hyperparameter_tuning=use_hyperparameter_tuning)
            .analyze_results()
        )

        # Persistence is opt-out and happens before the final report.
        if save_models:
            pipeline.save_models()

        pipeline.generate_report()
    except Exception as failure:
        print(f"Pipeline failed: {str(failure)}")
        raise

    return pipeline

In [None]:
# Run the complete F1-focused sentiment analysis pipeline on the cleaned
# tweets, with advanced feature engineering and hyperparameter tuning enabled.
# Model saving is left at the function's default.
pipeline_f1 = run_complete_pipeline_f1(
    data_df=cleaned_tweets_df,
    use_advanced_features=True,
    use_hyperparameter_tuning=True,
)