# NLP Email Summarizer using Neural Networks

## Implementation

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score, 
    KFold, 
)
from sklearn.preprocessing import (
    StandardScaler, 
    LabelEncoder
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import (
    mean_absolute_error, 
    mean_squared_error, 
    r2_score
)
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

class EmailSummarizationModel:
    """
    Benchmark harness that turns emails into TF-IDF/SVD features and
    compares several scikit-learn regressors on them.

    A BART tokenizer and seq2seq model are also loaded at construction time
    and exposed as attributes; no method of this class calls them.
    """

    # Baseline settings; any key missing from a user-supplied config falls
    # back to the value stored here.
    DEFAULT_CONFIG = {
        'model_type': 'hybrid',
        'feature_dim': 100,
        'regularization_strength': 1.0,
        'learning_rate': 0.001
    }

    # TF-IDF vocabulary cap used when the config has no 'tfidf_max_features'
    DEFAULT_TFIDF_MAX_FEATURES = 5000

    # One-factor-at-a-time variations that ablation_study evaluates by default
    DEFAULT_ABLATION_CONFIGURATIONS = (
        {'feature_dim': 50},
        {'feature_dim': 100},
        {'feature_dim': 150},
        {'feature_dim': 200},
        {'regularization_strength': 0.01},
        {'regularization_strength': 0.1},
        {'regularization_strength': 1.0},
        {'regularization_strength': 10.0},
        {'tfidf_max_features': 3000},
        {'tfidf_max_features': 7000}
    )

    def __init__(self, config=None):
        """
        Initialize the preprocessing components, the regressors and the
        pre-trained transformer.

        Args:
            config (dict): Configuration overrides. Keys that are absent fall
                back to DEFAULT_CONFIG, so a partial dict is accepted. The
                dict is copied, never mutated.
        """
        self.config = self._resolve_config(config)
        self._build_components()

        # Advanced pre-trained transformer. Loaded once here rather than in
        # _build_components so that re-configuring the scikit-learn pieces
        # (e.g. during an ablation study) does not reload the checkpoint.
        self.tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn')
        self.transformer_model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn')

    @classmethod
    def _resolve_config(cls, config=None):
        """
        Merge user overrides on top of the default configuration.

        Args:
            config (dict, optional): Overrides; None or an empty dict yields
                the defaults unchanged.

        Returns:
            dict: A new dict, independent of both DEFAULT_CONFIG and `config`
        """
        resolved = dict(cls.DEFAULT_CONFIG)
        if config:
            resolved.update(config)
        return resolved

    def _build_components(self):
        """
        (Re)create every scikit-learn component from the current self.config.

        All previously fitted state (vocabulary, SVD basis, scaler statistics,
        label mapping, trained regressors) is discarded.
        """
        # Pre-processing components
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=self.config.get(
                'tfidf_max_features', self.DEFAULT_TFIDF_MAX_FEATURES
            ),
            stop_words='english'
        )
        self.dimensionality_reducer = TruncatedSVD(
            n_components=self.config['feature_dim'],
            # Seeded like the regressors below so repeated runs are comparable
            random_state=42
        )

        # Model components
        self.models = {
            'random_forest': RandomForestRegressor(
                n_estimators=100,
                random_state=42
            ),
            'gradient_boosting': GradientBoostingRegressor(
                n_estimators=100,
                random_state=42
            ),
            'ridge_regression': Ridge(
                alpha=self.config['regularization_strength']
            )
        }

    def preprocess_data(self, emails, labels=None, fit=True):
        """
        Run the TF-IDF -> TruncatedSVD -> StandardScaler pipeline on raw emails
        and, when labels are given, encode them with the LabelEncoder.

        Args:
            emails (list): Raw email texts
            labels (list, optional): Corresponding summary labels
            fit (bool): When True (default) every component is re-fitted on
                the given data. When False the already-fitted components are
                only applied, e.g. to held-out emails.

        Returns:
            tuple: (features, encoded_labels) when labels are given,
            otherwise just the feature array

        Raises:
            ValueError: If the number of labels differs from the number of
                emails.
        """
        if fit:
            tfidf_features = self.tfidf_vectorizer.fit_transform(emails)
            reduced_features = self.dimensionality_reducer.fit_transform(tfidf_features)
            processed_features = self.scaler.fit_transform(reduced_features)
        else:
            tfidf_features = self.tfidf_vectorizer.transform(emails)
            reduced_features = self.dimensionality_reducer.transform(tfidf_features)
            processed_features = self.scaler.transform(reduced_features)

        if labels is None:
            return processed_features

        if len(labels) != processed_features.shape[0]:
            raise ValueError(
                f"Got {processed_features.shape[0]} emails but {len(labels)} labels"
            )

        # NOTE(review): LabelEncoder maps each distinct summary string to an
        # integer class index, and the regressors below are trained on those
        # indices - confirm this is the intended target.
        if fit:
            processed_labels = self.label_encoder.fit_transform(labels)
        else:
            processed_labels = self.label_encoder.transform(labels)
        return processed_features, processed_labels

    def train_with_cross_validation(self, X, y, n_splits=5):
        """
        Score every regressor in self.models with shuffled K-fold
        cross-validation.

        Args:
            X (array): Input features
            y (array): Target labels
            n_splits (int): Number of cross-validation splits

        Returns:
            dict: model name -> {'mean_cv_score', 'std_cv_score'}, where the
            mean is the mean squared error (sign flipped back to positive)
            and the std is the spread of the per-fold scores
        """
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        performance_metrics = {}

        for model_name, model in self.models.items():
            cv_scores = cross_val_score(
                model,
                X,
                y,
                cv=kf,
                scoring='neg_mean_squared_error'
            )
            performance_metrics[model_name] = {
                # Scores come back negated; flip so that lower is better
                'mean_cv_score': -cv_scores.mean(),
                'std_cv_score': cv_scores.std()
            }

        return performance_metrics

    def ablation_study(self, X, y, configurations=None):
        """
        Re-run preprocessing and cross-validation under a series of
        configuration variations.

        Every variation is applied on top of the configuration the model had
        when this method was called, so only the listed keys differ from the
        baseline. The baseline configuration is restored afterwards (also when
        a variation fails) and all scikit-learn components are left freshly
        built, i.e. unfitted.

        Args:
            X (list): Raw email texts (they are passed to preprocess_data)
            y (list): Corresponding summary labels
            configurations (list of dict, optional): Variations to evaluate;
                defaults to DEFAULT_ABLATION_CONFIGURATIONS

        Returns:
            dict: str(variation) -> cross-validation metrics per model
        """
        if configurations is None:
            configurations = self.DEFAULT_ABLATION_CONFIGURATIONS

        # Original config backup
        original_config = dict(self.config)
        ablation_results = {}

        try:
            for variation in configurations:
                # Start from the baseline each time so that earlier
                # variations do not leak into later ones.
                self.config = {**original_config, **variation}

                # Rebuild only the scikit-learn components; the transformer
                # loaded in __init__ is independent of these settings.
                self._build_components()
                processed_features, processed_labels = self.preprocess_data(X, y)

                # Train and evaluate
                ablation_results[str(variation)] = self.train_with_cross_validation(
                    processed_features,
                    processed_labels
                )
        finally:
            # Restore the original configuration
            self.config = original_config
            self._build_components()

        return ablation_results

    def extreme_error_analysis(self, X, y_true, y_pred):
        """
        Summarize absolute prediction errors and locate the worst cases
        (errors at or above the 95th percentile).

        Args:
            X (array): Input features (not used by the computation)
            y_true (array): True labels; any array-like, including lists
            y_pred (array): Predicted labels; any array-like, including lists

        Returns:
            dict: 'extreme_error_indices', 'max_error', 'min_error',
            'mean_error' and 'median_error'

        Raises:
            ValueError: If there are no predictions to analyze.
        """
        # Coerce so that plain Python lists can be subtracted element-wise
        errors = np.abs(np.asarray(y_true) - np.asarray(y_pred))
        if errors.size == 0:
            raise ValueError("Cannot analyze errors of an empty prediction set")

        error_threshold = np.percentile(errors, 95)
        extreme_error_indices = np.where(errors >= error_threshold)[0]

        return {
            'extreme_error_indices': extreme_error_indices,
            'max_error': errors.max(),
            'min_error': errors.min(),
            'mean_error': errors.mean(),
            'median_error': np.median(errors)
        }

    def bias_variance_decomposition(self, X, y):
        """
        Compare training and validation error of every model on a single
        80/20 split.

        This reports the train/validation MSE gap as an overfitting
        indicator; it does not compute separate bias and variance terms.
        The models in self.models are fitted in place.

        Args:
            X (array): Input features
            y (array): Target labels

        Returns:
            dict: model name -> {'train_error', 'validation_error',
            'difference'}
        """
        # Split data into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        bias_variance_results = {}

        for model_name, model in self.models.items():
            model.fit(X_train, y_train)

            # Training performance
            train_error = mean_squared_error(y_train, model.predict(X_train))

            # Validation performance
            val_error = mean_squared_error(y_val, model.predict(X_val))

            bias_variance_results[model_name] = {
                'train_error': train_error,
                'validation_error': val_error,
                'difference': abs(train_error - val_error)
            }

        return bias_variance_results

    def visualize_results(self, results, output_path='results_visualization.png'):
        """
        Save a grouped bar chart of per-model metrics.

        Args:
            results (dict): model name -> {metric name: number}, as returned
                by train_with_cross_validation or bias_variance_decomposition.
                Entries that do not have this shape are skipped; if nothing
                is plottable only the titled, empty figure is saved.
            output_path (str): Path to save visualization
        """
        numeric_types = (int, float, np.integer, np.floating)

        per_model = {}
        if isinstance(results, dict):
            per_model = {
                model_name: metrics
                for model_name, metrics in results.items()
                if isinstance(metrics, dict)
            }

        # Metric names in first-seen order, numeric values only
        metric_names = []
        for metrics in per_model.values():
            for metric_name, value in metrics.items():
                if isinstance(value, numeric_types) and metric_name not in metric_names:
                    metric_names.append(metric_name)

        fig, ax = plt.subplots(figsize=(15, 10))
        try:
            ax.set_title('Model Performance Visualization')

            if metric_names:
                positions = np.arange(len(per_model))
                bar_width = 0.8 / len(metric_names)
                for offset, metric_name in enumerate(metric_names):
                    heights = []
                    for metrics in per_model.values():
                        value = metrics.get(metric_name)
                        # A model lacking this metric gets a zero-height bar
                        heights.append(
                            float(value) if isinstance(value, numeric_types) else 0.0
                        )
                    ax.bar(
                        positions + offset * bar_width,
                        heights,
                        width=bar_width,
                        label=metric_name
                    )
                # Centre each tick under its group of bars
                ax.set_xticks(positions + bar_width * (len(metric_names) - 1) / 2)
                ax.set_xticklabels(list(per_model))
                ax.set_ylabel('Score')
                ax.legend()

            fig.tight_layout()
            fig.savefig(output_path)
        finally:
            # Always release the figure, even if saving fails
            plt.close(fig)
        
def main():
    """
    Demo driver: build the model, preprocess a small sample corpus, then run
    cross-validation, the ablation study and the train/validation comparison,
    printing each result and finally saving the visualization.
    """
    # Sample corpus of email texts
    emails = [
        """Subject: Project Kickoff Meeting

Hi Team,

I hope this message finds you well. I’d like to schedule a project kickoff meeting for the new client initiative next Monday at 9 AM in Conference Room A. Please come prepared with your initial thoughts and any questions you might have.

Looking forward to collaborating with all of you.

Best regards,
Jessica""",

        """Subject: Request for Budget Approval

Dear Finance Team,

Attached is the budget proposal for the upcoming marketing campaign. Could you please review and approve it by the end of this week? Let me know if you need any additional information.

Thank you,
Mark""",

        """Subject: Monthly Sales Report

Hello All,

Please find attached the sales report for June. We've seen a 15% increase in revenue compared to last month, largely driven by the new product launch. Let’s discuss the strategies that worked well in our next meeting.

Best,
Emily""",

        """Subject: IT Maintenance Downtime

Dear Employees,

Please be advised that our IT department will perform scheduled maintenance on the servers this Saturday from 10 PM to 4 AM. During this time, access to email and internal systems will be unavailable. We apologize for any inconvenience this may cause.

Thank you for your understanding,
IT Support Team""",

        """Subject: Invitation to Annual Company Retreat

Hi Everyone,

We are excited to announce our Annual Company Retreat will take place from August 5th to August 7th at Lakeview Resort. It’s a great opportunity to relax, network, and engage in team-building activities. Please RSVP by July 15th.

Looking forward to seeing you there!

Cheers,
Amanda""",

        """Subject: Feedback on Presentation

Hi Daniel,

Great job on the presentation yesterday! The data insights were particularly compelling. I have a few suggestions for the next session to make it even more impactful. Let’s schedule a brief meeting to discuss them.

Best,
Sophie""",

        """Subject: New Hire Orientation Schedule

Dear Team,

We are pleased to welcome our new team members starting next Monday. Attached is the orientation schedule for their first week. Please ensure that all necessary preparations are made to facilitate a smooth onboarding process.

Thank you,
HR Department""",

        """Subject: Client Meeting Follow-Up

Hello Michael,

Thank you for meeting with us yesterday regarding the partnership opportunities. As discussed, I’ve attached the proposal outlining our collaboration plan. Please review and let me know your thoughts by Friday.

Best regards,
Linda""",

        """Subject: Office Renovation Updates

Hi All,

I wanted to update you on the ongoing office renovations. The main conference room will be closed for the next two weeks, but alternative spaces have been arranged. Please refer to the attached map for the new setup.

Apologies for any inconvenience and thank you for your patience.

Regards,
Facilities Management""",

        """Subject: Training Workshop on Data Analytics

Dear Colleagues,

We are organizing a training workshop on advanced data analytics on September 10th from 10 AM to 4 PM. This workshop will cover the latest tools and techniques in data analysis. Seats are limited, so please register by August 25th.

Best,
Training Coordinator""",

        """Subject: Performance Bonus Announcement

Hello Team,

I’m thrilled to announce that due to our outstanding performance this quarter, the company has decided to award performance bonuses to all eligible employees. Details will be shared during the town hall meeting next Wednesday.

Congratulations to everyone for your hard work!

Best wishes,
CEO""",

        """Subject: Supply Chain Meeting Rescheduled

Hi Team,

Please note that the supply chain strategy meeting originally scheduled for Thursday has been moved to Friday at 2 PM in Conference Room B. Let me know if this new time poses any conflicts.

Thanks,
Operations Manager""",

        """Subject: Policy Change: Remote Work Guidelines

Dear Staff,

Effective July 1st, we are updating our remote work policy to provide more flexibility. Attached is the revised document outlining the new guidelines and procedures. Please review it and reach out with any questions.

Best regards,
HR Team""",

        """Subject: Security Alert: Phishing Attempt

Hi Everyone,

Be aware of a recent phishing attempt targeting our company email accounts. Do not click on any suspicious links or provide personal information. If you receive a dubious email, please report it to the IT department immediately.

Stay safe,
Cybersecurity Team""",

        """Subject: Invitation to Speak at Tech Conference

Dear Dr. Thompson,

We are honored to invite you as a keynote speaker at the upcoming International Tech Conference on October 12th. Your expertise in artificial intelligence would be a valuable addition to our event. Please let us know your availability.

Sincerely,
Event Coordinator"""
    ]

    # One reference summary per email, in the same order
    labels = [
        "Schedule project kickoff meeting next Monday at 9 AM.",
        "Requesting budget approval for the marketing campaign by weeks end.",
        "June sales report shows a 15% revenue increase.",
        "Scheduled IT maintenance this Saturday from 10 PM to 4 AM.",
        "Invitation to Annual Company Retreat from August 5th to 7th.",
        "Positive feedback on presentation with suggestions for improvement.",
        "Orientation schedule for new hires starting next Monday.",
        "Follow-up on client meeting with attached collaboration proposal.",
        "Update on office renovations and alternative conference room arrangements.",
        "Invitation to data analytics training workshop on September 10th.",
        "Announcement of performance bonuses due to outstanding quarterly results.",
        "Rescheduled supply chain strategy meeting to Friday at 2 PM.",
        "Updated remote work policy effective July 1st.",
        "Security alert about a phishing attempt targeting company emails.",
        "Invitation to Dr. Thompson to speak at International Tech Conference."
    ]

    def report(heading, payload):
        # Heading, payload, then the blank-line separator used between sections
        print(heading)
        print(payload)
        print("\n")

    summarizer = EmailSummarizationModel()

    # Feature matrix and encoded targets shared by the analyses below
    features, targets = summarizer.preprocess_data(emails, labels)

    # Cross-validation
    cv_metrics = summarizer.train_with_cross_validation(features, targets)
    report("Cross-Validation Results:", cv_metrics)

    # Ablation study works from the raw texts; it preprocesses them itself
    ablation_metrics = summarizer.ablation_study(emails, labels)
    print("Ablation Study Results:")
    for variation in ablation_metrics:
        print(f"Config: {variation}, Results: {ablation_metrics[variation]}")
    print("\n")

    # Bias-variance analysis
    gap_metrics = summarizer.bias_variance_decomposition(features, targets)
    report("Bias-Variance Decomposition Results:", gap_metrics)

    # Visualize results
    summarizer.visualize_results(cv_metrics)
    print("Results visualization saved to 'results_visualization.png'.")

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Cross-Validation Results:
{'random_forest': {'mean_cv_score': 36.1281, 'std_cv_score': 22.375644866029372}, 'gradient_boosting': {'mean_cv_score': 60.524527966360154, 'std_cv_score': 18.53555830456647}, 'ridge_regression': {'mean_cv_score': 26.703309075302855, 'std_cv_score': 17.689221212787427}}


Ablation Study Results:
Config: {'feature_dim': 50}, Results: {'random_forest': {'mean_cv_score': 36.1281, 'std_cv_score': 22.375644866029372}, 'gradient_boosting': {'mean_cv_score': 60.524527966360154, 'std_cv_score': 18.53555830456647}, 'ridge_regression': {'mean_cv_score': 26.703309075302844, 'std_cv_score': 17.689221212787423}}
Config: {'feature_dim': 100}, Results: {'random_forest': {'mean_cv_score': 36.1281, 'std_cv_score': 22.375644866029372}, 'gradient_boosting': {'mean_cv_score': 60.524527966360154, 'std_cv_score': 18.53555830456647}, 'ridge_regression': {'mean_cv_score': 26.703309075302865, 'std_cv_score': 17.689221212787444}}
Config: {'feature_dim': 150}, Results: {'random_forest'

## Results 

**Summary of the Results:**

1. **Cross-Validation Performance:**
   - **Random Forest:** Achieved a mean cross-validation MSE of approximately 36.13 with a standard deviation of about 22.38. The relatively high variance suggests that its performance is somewhat inconsistent across folds.
   - **Gradient Boosting:** Had a mean MSE of roughly 60.52 with a standard deviation of about 18.54. Its error is notably higher than the other models, indicating weaker generalization.
   - **Ridge Regression:** Showed the best performance among the three, with a mean MSE around 26.70 and a standard deviation of about 17.69. This model consistently outperformed the other two on the validation sets.

2. **Ablation Study Results:**
   - The ablation study examined the impact of altering feature dimensionality, regularization strength, and the number of TF-IDF features.
   - **Effect on Random Forest and Gradient Boosting:** Their performance remained relatively stable across all tested configurations. Neither model showed significant improvement or degradation in mean MSE, suggesting they are not highly sensitive to the tested hyperparameter changes within the given range.
   - **Effect on Ridge Regression:** Changes to regularization strength and TF-IDF features slightly influenced ridge regression results. For instance, increasing regularization strength to 10.0 or altering TF-IDF max features yielded a marginal improvement (mean MSE around 26.49), though differences were subtle. Overall, Ridge Regression remained the most robust and best-performing model across configurations.

3. **Bias-Variance Decomposition:**
   - **Random Forest:** Exhibited a low training error (~4.51) but a much higher validation error (~19.97), indicating some degree of overfitting. While it generalizes better than Gradient Boosting, it still struggles to match its training performance.
   - **Gradient Boosting:** Nearly perfect on training (effectively zero error), but its validation error (~39.90) is very high. This extreme discrepancy highlights severe overfitting, as the model fails to generalize beyond its training data.
   - **Ridge Regression:** Demonstrated a balanced behavior with a low training error (~0.08) and a moderate validation error (~9.73). The smaller gap between training and validation errors suggests a better bias-variance trade-off, making it the most stable and generalizable model of the three.

**In essence, Ridge Regression consistently outperforms Random Forest and Gradient Boosting, is more robust to configuration changes, and maintains a good balance between bias and variance. Random Forest and Gradient Boosting, while competitive on training sets, suffer more from overfitting issues and do not adapt as well to modifications in feature dimensions or regularization settings.**