# In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

class FeatureExtractor:
    """Bag-of-words and TF-IDF feature extraction with top-term inspection.

    Holds one ``CountVectorizer`` and one ``TfidfVectorizer`` that are
    configured with the same settings, plus helpers to rank and plot the
    highest-scoring features of a document-term matrix.
    """

    def __init__(self, max_features=5000, ngram_range=(1, 2), min_df=2,
                 max_df=0.8, stop_words='english'):
        """Create both vectorizers from a single shared configuration.

        Args:
            max_features: Upper bound on the vocabulary size.
            ngram_range: ``(min_n, max_n)`` range of n-gram sizes to extract.
            min_df: Ignore terms appearing in fewer documents than this
                (default 2, i.e. terms seen in fewer than 2 documents).
            max_df: Ignore terms appearing in more than this share of
                documents (default 0.8, i.e. more than 80% of documents).
            stop_words: Stop-word setting forwarded to both vectorizers.
        """
        self.max_features = max_features
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_df = max_df
        self.stop_words = stop_words

        # One kwargs dict keeps the two vectorizers' settings from drifting.
        vectorizer_kwargs = {
            'max_features': max_features,
            'ngram_range': ngram_range,
            'min_df': min_df,
            'max_df': max_df,
            'stop_words': stop_words,
        }
        self.bow_vectorizer = CountVectorizer(**vectorizer_kwargs)
        self.tfidf_vectorizer = TfidfVectorizer(**vectorizer_kwargs)

    def fit_transform_bow(self, texts):
        """Fit the bag-of-words vectorizer on ``texts`` and transform them.

        Args:
            texts: Iterable of raw text documents.

        Returns:
            Tuple ``(X_bow, feature_names)``: the document-term matrix and
            the feature names reported by the fitted vectorizer.
        """
        X_bow = self.bow_vectorizer.fit_transform(texts)
        feature_names_bow = self.bow_vectorizer.get_feature_names_out()
        return X_bow, feature_names_bow

    def fit_transform_tfidf(self, texts):
        """Fit the TF-IDF vectorizer on ``texts`` and transform them.

        Args:
            texts: Iterable of raw text documents.

        Returns:
            Tuple ``(X_tfidf, feature_names)``: the document-term matrix and
            the feature names reported by the fitted vectorizer.
        """
        X_tfidf = self.tfidf_vectorizer.fit_transform(texts)
        feature_names_tfidf = self.tfidf_vectorizer.get_feature_names_out()
        return X_tfidf, feature_names_tfidf

    def get_top_features(self, vectorizer, X, n=20):
        """Rank features by their column totals in ``X``.

        Args:
            vectorizer: Object exposing ``get_feature_names_out()`` whose
                names line up with the columns of ``X``.
            X: Document-term matrix (dense or sparse) supporting
                ``sum(axis=0)``.
            n: Maximum number of features to return.

        Returns:
            List of ``(feature, score)`` tuples sorted by descending score,
            or an empty list when ``X`` is empty or every column total is
            zero.
        """
        # Sparse matrices may return a 2-D matrix here; flatten to 1-D.
        feature_scores = np.asarray(X.sum(axis=0)).ravel()

        # Test the per-column totals rather than the grand total: a grand
        # total of zero can also occur when signed entries cancel out, which
        # is not the same as "nothing to rank".
        if not feature_scores.any():
            return []

        feature_names = vectorizer.get_feature_names_out()

        ranking = pd.DataFrame({'feature': feature_names, 'score': feature_scores})
        top_features = ranking.nlargest(n, 'score')

        return list(zip(top_features['feature'], top_features['score']))

    def visualize_top_features(self, vectorizer, X, n=20):
        """Draw a horizontal bar chart of the top-scoring features.

        Args:
            vectorizer: Object exposing ``get_feature_names_out()``.
            X: Document-term matrix whose columns match the vectorizer.
            n: Maximum number of features to plot.

        Returns:
            The created matplotlib figure, or ``None`` (after printing a
            notice) when there are no features to show.
        """
        top_features = self.get_top_features(vectorizer, X, n)
        if not top_features:
            print("No features to visualize.")
            return None

        features, scores = zip(*top_features)

        fig, ax = plt.subplots(figsize=(10, 6))
        # Reverse for plotting so highest score is at the top
        ax.barh(features[::-1], scores[::-1], color='skyblue')
        ax.set_xlabel('Score')
        ax.set_ylabel('Feature')
        # Report how many bars are actually drawn; this can be fewer than n.
        ax.set_title(f'Top {len(features)} Features')
        # Lay out this specific figure instead of pyplot's "current" one.
        fig.tight_layout()
        return fig