In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    precision_recall_fscore_support, 
    multilabel_confusion_matrix, 
    classification_report
)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import random
import joblib  # For saving the model

# Updated Data Augmenter Class
class DataAugmenter:
    @staticmethod
    def synonym_replacement(text, n=1):
        """Replace n random words with modified version"""
        words = text.split()
        if len(words) < n:
            return text
        
        for _ in range(n):
            idx = random.randint(0, len(words) - 1)
            words[idx] = words[idx].upper()  # Simple modification
        
        return ' '.join(words)
    
    @staticmethod
    def back_translation(text):
        """Simulate back translation by slightly modifying the text"""
        words = text.split()
        if len(words) < 3:
            return text
        
        mid = len(words) // 2
        words[mid-1:mid+2] = words[mid+1:mid-1:-1]
        return ' '.join(words)

# Updated Classifier for EV Charging Data
class EVChargingClassifier:
    def __init__(self):
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.augmenter = DataAugmenter()
    
    def preprocess_text(self, text):
        """Clean and preprocess text data"""
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        words = text.split()
        cleaned_words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        return ' '.join(cleaned_words)
    
    def augment_data(self, X, y):
        """
        Perform data augmentation with class distribution analysis
        """
        # Analyze class distribution before augmentation
        class_distribution_before = np.sum(y, axis=0)
        
        augmented_X = list(X)
        augmented_y = list(y)
        
        # Identify minority classes
        minority_indices = np.where(class_distribution_before < np.median(class_distribution_before))[0]
        
        # Augmentation for minority classes
        for idx in minority_indices:
            # Find instances of this minority class
            minority_sample_indices = np.where(y[:, idx] == 1)[0]
            
            for sample_idx in minority_sample_indices:
                # Apply augmentation techniques
                aug_text = random.choice([self.augmenter.synonym_replacement(X[sample_idx]), 
                                          self.augmenter.back_translation(X[sample_idx])])
                
                augmented_X.append(aug_text)
                augmented_y.append(y[sample_idx])
        
        # Convert back to numpy arrays
        X_augmented = np.array(augmented_X)
        y_augmented = np.array(augmented_y)
        
        # Analyze class distribution after augmentation
        class_distribution_after = np.sum(y_augmented, axis=0)
        
        # Visualization of class distribution
        self.plot_class_distribution(
            class_distribution_before, 
            class_distribution_after, 
            minority_indices
        )
        
        return X_augmented, y_augmented
    
    def plot_class_distribution(self, before, after, minority_indices):
        """
        Visualize class distribution before and after augmentation
        """
        plt.figure(figsize=(12, 6))
        
        # Create bar plot
        plt.bar(
            range(len(before)), 
            before, 
            alpha=0.5, 
            label='Before Augmentation',
            color='blue'
        )
        plt.bar(
            range(len(after)), 
            after, 
            alpha=0.5, 
            label='After Augmentation',
            color='red'
        )
        
        # Highlight minority classes
        for idx in minority_indices:
            plt.text(
                idx, 
                after[idx], 
                f'↑{after[idx] - before[idx]}', 
                horizontalalignment='center',
                color='green'
            )
        
        plt.title('Class Distribution Before and After Augmentation')
        plt.xlabel('Classes')
        plt.ylabel('Number of Instances')
        plt.xticks(range(len(before)), range(1, len(before) + 1), rotation=45)
        plt.legend()
        plt.tight_layout()
        plt.savefig('class_distribution_augmentation.png')
        plt.close()

# Model Evaluation Class
class ModelEvaluator:
    def __init__(self, model, X_test, y_test, class_names):
        """
        Initialize the ModelEvaluator.

        Parameters:
        - model: The trained model pipeline.
        - X_test: The test features.
        - y_test: The test labels (multi-label).
        - class_names: A list of names corresponding to each class.
        """
        self.model = model
        self.X_test = X_test
        self.y_test = y_test
        self.class_names = class_names

    def detailed_evaluation(self):
        """Comprehensive model evaluation"""
        y_pred = self.model.predict(self.X_test)
        
        # Number of classes
        num_classes = self.y_test.shape[1]
        
        # Detailed classification report
        print("Detailed Classification Report:")
        print(classification_report(self.y_test, y_pred, target_names=self.class_names))

        # Confusion matrices
        cm = multilabel_confusion_matrix(self.y_test, y_pred)

        # Dynamically determine grid dimensions for subplots
        rows = (num_classes + 2) // 3  # Number of rows (3 columns max)
        plt.figure(figsize=(15, rows * 5))

        for i in range(num_classes):
            plt.subplot(rows, 3, i + 1)  # Adjust grid size dynamically
            sns.heatmap(cm[i], annot=True, fmt='d', cmap='Blues')
            plt.title(self.class_names[i])  # Use class names for subplot titles
            plt.xlabel("Predicted")
            plt.ylabel("True")

        plt.tight_layout()
        plt.savefig('confusion_matrices_with_class_names.png')
        plt.close()
# Main Workflow
def main():
    # Load dataset
    df = pd.read_csv('input.csv')
    
    # Initialize classifier
    classifier = EVChargingClassifier()
    
    # Preprocess text
    df['Preprocessed_Description'] = df['Description'].apply(classifier.preprocess_text)
    
    # Prepare labels (one-hot encoded)
    mlb_region = MultiLabelBinarizer()
    mlb_sector = MultiLabelBinarizer()
    mlb_feature = MultiLabelBinarizer()
    
    region_labels = mlb_region.fit_transform(df['Region'].apply(lambda x: [x]))
    sector_labels = mlb_sector.fit_transform(df['Sector'].apply(lambda x: [x]))
    feature_labels = mlb_feature.fit_transform(df['Feature_Type'].apply(lambda x: [x]))
    
    # Combine labels
    y = np.column_stack((region_labels, sector_labels, feature_labels))
    
    # Define class names
    class_names = (
        list(mlb_region.classes_) + 
        list(mlb_sector.classes_) + 
        list(mlb_feature.classes_)
    )
    
    # Augment data
    X_augmented, y_augmented = classifier.augment_data(
        df['Preprocessed_Description'].values, y
    )
    
    # Split augmented data
    X_train, X_test, y_train, y_test = train_test_split(
        X_augmented, y_augmented, test_size=0.2, random_state=42
    )
    
    # Create and train pipeline
    pipeline = Pipeline([ 
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('clf', MultiOutputClassifier(SVC(kernel='linear', random_state=42)))
    ])
    
    # Cross-validation scores
    cross_val_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    print(f"Cross-Validation Scores: {cross_val_scores}")
    print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean()}")
    
    # Fit model
    pipeline.fit(X_train, y_train)
    
    # Save the model and encoders
    joblib.dump(pipeline, 'ev_charging_classifier_model.pkl')  # Save the model
    joblib.dump(mlb_region, 'mlb_region_encoder.pkl')  # Save region encoder
    joblib.dump(mlb_sector, 'mlb_sector_encoder.pkl')  # Save sector encoder
    joblib.dump(mlb_feature, 'mlb_feature_encoder.pkl')  # Save feature encoder
    
    print("Model and encoders saved successfully.")
    
    # Evaluate model
    evaluator = ModelEvaluator(pipeline, X_test, y_test, class_names)  # Pass class_names
    evaluator.detailed_evaluation()

if __name__ == '__main__':
    main()


Cross-Validation Scores: [0.64285714 0.67857143 0.66071429 0.57142857 0.60714286]
Mean Cross-Validation Accuracy: 0.6321428571428572
Detailed Classification Report:
                             precision    recall  f1-score   support

                     Africa       1.00      1.00      1.00        19
                       Asia       1.00      1.00      1.00         8
                     Europe       1.00      1.00      1.00        10
              North America       1.00      0.88      0.93        16
              South America       1.00      1.00      1.00        18
                 Automotive       1.00      1.00      1.00        20
                     Energy       1.00      1.00      1.00        10
                  Logistics       0.93      1.00      0.96        13
                 Technology       1.00      0.94      0.97        18
             Transportation       1.00      1.00      1.00        10
       Competition/Features       0.00      0.00      0.00         0
      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
