# 📖 Topic Modeling with LDA
 
![](https://storage.googleapis.com/kaggle-competitions/kaggle/31779/logos/header.png)

## Simple topic modeling over the [Feedback Prize - Evaluating Student Writing](https://www.kaggle.com/c/feedback-prize-2021) data using `CountVectorizer` and `LDA`.


Adapted from scikit-learn's topic-extraction example (NMF and LDA): https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html


It mirrors the notebook [RAPIDS UMAP Tfidf KMeans - Discovers 15 Topics!](https://www.kaggle.com/cdeotte/rapids-umap-tfidf-kmeans-discovers-15-topics), but discovers the topics with LDA instead.

## Imports

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load texts

In [None]:
def load_df(train_dir='../input/feedback-prize-2021/train'):
    """Read every essay file found in ``train_dir`` into a DataFrame.

    Parameters
    ----------
    train_dir : str
        Directory holding one text file per essay. Defaults to the
        competition's training folder used by this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per file, ordered by file name, with two columns:
        ``id`` (the file name with ``'.txt'`` removed) and ``text``
        (the full contents of the file).
    """
    # Loading approach adapted from:
    # https://www.kaggle.com/raghavendrakotala/fine-tunned-on-roberta-base-as-ner-problem-0-533
    train_names, train_texts = [], []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and everything fitted on it afterwards) reproducible.
    for f in tqdm(sorted(os.listdir(train_dir))):
        train_names.append(f.replace('.txt', ''))
        # Context manager closes each handle promptly instead of leaking one
        # per file; the explicit encoding avoids locale-dependent decoding.
        with open(os.path.join(train_dir, f), 'r', encoding='utf-8') as handle:
            train_texts.append(handle.read())
    train_text_df = pd.DataFrame({'id': train_names, 'text': train_texts})
    return train_text_df

# Load all training essays into memory (one row per essay file).
df = load_df()
# Bare trailing expression so the notebook renders a preview of the first rows.
df.head()

# Configuration

In [None]:
# Cap on the vocabulary size kept by the CountVectorizer (max_features).
n_features = 1_000
# Number of LDA components, i.e. topics to fit.
n_topics = 10
# How many of the highest-weighted words to show for each topic.
n_top_words = 20

# Count Vectorizer

In [None]:
%%time
# Bag-of-words counts: English stop words are removed, terms present in more
# than 95% of the documents or in fewer than 2 documents are dropped, and the
# vocabulary is capped at n_features terms.
vect = CountVectorizer(
    stop_words="english",
    max_features=n_features,
    max_df=0.95,
    min_df=2,
)
# Document-term count matrix, one row per essay in df.
X = vect.fit_transform(df["text"].tolist())

# LDA

In [None]:
%%time
# LDA fitted with the "online" learning method; the fixed seed keeps the
# result repeatable and n_jobs=-1 uses all available cores.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method="online",
    learning_offset=50.0,
    max_iter=5,
    n_jobs=-1,
    random_state=42,
)
# fit_transform yields one row of topic weights per document; keep only the
# index of the heaviest topic as that document's label.
labels = lda.fit_transform(X).argmax(axis=-1)

# Plot

In [None]:
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart of the top words for every topic.

    Parameters
    ----------
    model : fitted decomposition model
        Must expose ``components_``, one row of per-word weights per topic.
    feature_names : sequence of str
        Vocabulary, indexable by the column positions of ``components_``.
    n_top_words : int
        Number of highest-weighted words shown per topic.
    title : str
        Figure-level title.

    The grid has five columns and as many rows as the number of topics
    requires, so the function is not tied to exactly ten topics. With ten
    topics the layout and figure size match the previous fixed 2x5 grid.
    """
    n_cols = 5
    n_plots = len(model.components_)
    # Ceiling division without importing math; always keep at least one row.
    n_rows = max(1, -(-n_plots // n_cols))
    # squeeze=False guarantees a 2-D array of axes even for a single row.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False
    )
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, in descending order.
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        # Put the heaviest word at the top of each chart.
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # Blank out grid cells that have no topic to display.
    for unused_ax in axes[n_plots:]:
        unused_ax.axis("off")

    # The title does not depend on the topic, so set it once.
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()


# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps feature_names a plain list of str, as before.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
plot_top_words(lda, feature_names, n_top_words, "Topics in LDA model")

# Examples

In [None]:
# Store each document's dominant-topic index (the argmax computed into
# `labels`) as a new column.
df['topic'] = labels
# Bare trailing expression so the notebook renders a preview of the first rows.
df.head()

In [None]:
# Count how many documents were assigned to each topic.
df['topic'].value_counts()

In [None]:
# For every topic, print its top words followed by up to five randomly
# chosen essays whose dominant topic it is.
for topic_idx in range(n_topics):
    topic = lda.components_[topic_idx]
    # Indices of the n_top_words largest weights, in descending order.
    top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
    top_features = [feature_names[i] for i in top_features_ind]
    topic_docs = df[df['topic'] == topic_idx]
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # and argmax labelling can leave a topic with few or no documents.
    samples = topic_docs.sample(min(5, len(topic_docs)))
    print("=========================================================")
    print(f"TOPIC {topic_idx + 1}")
    print(f"  Top words: {top_features}")
    print("=========================================================")
    for sample_idx, sample in enumerate(samples['text'].tolist(), 1):
        print(f"Example {sample_idx}:")
        print(sample)
        print()
        print('---------------------')
        print()
    print()
    print()
    print()
    