In [1]:
# Optional one-time setup: uncomment the line below to install third-party
# packages for this notebook. In Jupyter, `%pip` is preferable to `!pip`
# because it installs into the environment of the running kernel.
# !pip install pandas plotly bertopic spacy

# EDA

In [2]:
import os

# Step up to the parent directory so the relative "data/..." paths used by the
# loading cell resolve. The move is guarded so that re-running this cell does
# not climb one directory further each time: it only happens while the data
# folder is not yet reachable from the current working directory.
if not os.path.isdir("data/competition-data"):
    os.chdir("../")

import pandas as pd
import plotly.express as px
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from bertopic import BERTopic

Loading datasets

In [3]:
# Read both competition splits; the paths are relative to the current working directory.
train_path = "data/competition-data/train.csv"
test_path = "data/competition-data/test.csv"
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

# Cell output: (rows, columns) of each split.
(train_df.shape, test_df.shape)

((17307, 3), (3, 2))

In [4]:
# Show the column names of the training split.
train_df.columns

Index(['essay_id', 'full_text', 'score'], dtype='object')

### Topics

The topic-modelling pipeline below follows the [BERTopic best practices](https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html#additional-representations) guide.

In [9]:
# Encode every training essay once; the resulting embeddings are handed to
# BERTopic when the topic model is fitted.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# `encode` is documented for a string or a list of strings. Passing a plain
# list (instead of the pandas Series) avoids depending on the Series' index
# labels happening to match positions.
embeddings = embedding_model.encode(
    train_df["full_text"].tolist(),
    show_progress_bar=True,
)

Batches:   0%|          | 0/541 [00:00<?, ?it/s]

In [10]:
# Dimensionality-reduction settings for the topic model; the seed is fixed.
umap_params = {
    "n_neighbors": 15,
    "n_components": 5,
    "min_dist": 0.0,
    "metric": "cosine",
    "random_state": 42,
}
umap_model = UMAP(**umap_params)

In [11]:
# Clustering settings for the topic model.
hdbscan_params = {
    "min_cluster_size": 150,
    "metric": "euclidean",
    "cluster_selection_method": "eom",
    "prediction_data": True,
}
hdbscan_model = HDBSCAN(**hdbscan_params)

In [12]:
# Bag-of-words settings for the topic keywords: English stop words removed,
# terms must occur in at least two documents, unigrams and bigrams are kept.
vectorizer_params = {
    "stop_words": "english",
    "min_df": 2,
    "ngram_range": (1, 2),
}
vectorizer_model = CountVectorizer(**vectorizer_params)

In [13]:
# Sub-models that make up the BERTopic pipeline, configured in the cells above.
pipeline_components = {
    "embedding_model": embedding_model,
    "umap_model": umap_model,
    "hdbscan_model": hdbscan_model,
    "vectorizer_model": vectorizer_model,
}

# Keep ten keywords per topic and log progress while fitting.
topic_model = BERTopic(**pipeline_components, top_n_words=10, verbose=True)

# Fit on the training essays, reusing the pre-computed embeddings.
essays = train_df["full_text"]
topics, probs = topic_model.fit_transform(essays, embeddings)

2024-04-24 20:36:46,282 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-04-24 20:37:11,195 - BERTopic - Dimensionality - Completed ✓
2024-04-24 20:37:11,196 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

In [14]:
# Print the top keywords of every topic. `get_topics()` already maps each
# topic id to its (word, score) pairs, so the topic-info table does not have
# to be rebuilt on every loop iteration.
for topic_id, word_scores in topic_model.get_topics().items():
    print(f"Topic {topic_id}: " + ",".join(word for word, _ in word_scores))

Topic 0: cars,driverless,car,driverless cars,driver,driving,drive,people,road,technology
Topic 1: students,emotions,technology,facial,computer,student,help,classroom,coding,action
Topic 2: venus,planet,earth,author,surface,dangers,like,study,humans,studying
Topic 3: face,mars,landform,aliens,natural,nasa,just,alien,picture,like
Topic 4: vote,college,president,votes,states,popular,election,state,people,voting
Topic 5: car,cars,people,usage,pollution,car usage,smog,air,driving,limiting
Topic 6: luke,animals,program,join,fun,people,help,time,world,places


In [15]:
# Human-readable topic names, listed in topic-id order (0 through 6).
custom_labels = [
    "Driverless",
    "Students",
    "Venus",
    "Mars",
    "Vote",
    "Pollution",
    "World",
]
topic_model.set_topic_labels(dict(enumerate(custom_labels)))

In [16]:
# Render the topic visualization using the custom topic labels.
topic_model.visualize_topics(custom_labels=True)

In [17]:
# Render the topic hierarchy visualization using the custom topic labels.
topic_model.visualize_hierarchy(custom_labels=True)

In [20]:
# Assign every training essay to a topic with the fitted model. The embeddings
# computed earlier are passed in so the essays are not encoded a second time.
# `transform` returns a pair whose first element holds the topic ids; the pair
# is kept whole in `topics` and indexed by the cell that follows.
topics = topic_model.transform(train_df["full_text"], embeddings=embeddings)

Batches:   0%|          | 0/541 [00:00<?, ?it/s]

2024-04-24 20:39:30,240 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-04-24 20:39:30,269 - BERTopic - Dimensionality - Completed ✓
2024-04-24 20:39:30,270 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-04-24 20:39:31,328 - BERTopic - Cluster - Completed ✓


In [28]:
# Build the topic-id -> custom-label lookup once, rather than regenerating the
# whole topic-info table for every essay.
topic_names = topic_model.get_topic_info().set_index("Topic")["CustomName"]
train_df["topic"] = pd.Series(topics[0], index=train_df.index).map(topic_names)

# Fail loudly if any essay was assigned a topic id that has no label.
assert train_df["topic"].notna().all(), "some essays were assigned an unlabeled topic id"

In [29]:
# Essays per topic label, most frequent first (descending sort is the pandas default).
train_df["topic"].value_counts()

topic
Driverless    3498
Students      3043
Venus         3017
Mars          2094
Vote          2046
Pollution     1961
World         1648
Name: count, dtype: int64

In [30]:
# Predict a topic id for each test essay, then translate the ids to custom
# labels through a lookup that is built once (not once per row).
test_topic_ids = topic_model.transform(test_df["full_text"])[0]
topic_names = topic_model.get_topic_info().set_index("Topic")["CustomName"]
test_df["topic"] = pd.Series(test_topic_ids, index=test_df.index).map(topic_names)

# Fail loudly if any essay was assigned a topic id that has no label.
assert test_df["topic"].notna().all(), "some essays were assigned an unlabeled topic id"

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-04-24 20:43:31,580 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-04-24 20:43:44,261 - BERTopic - Dimensionality - Completed ✓
2024-04-24 20:43:44,261 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-04-24 20:43:44,262 - BERTopic - Cluster - Completed ✓


### Average Length of Essays

In [31]:
# Essay length in characters, computed with the vectorized string accessor
# instead of a per-row Python lambda.
train_df["char_count"] = train_df["full_text"].str.len()

# Summary statistics of the essay lengths.
train_df["char_count"].describe()

count    17307.000000
mean      2071.617265
std        925.910701
min        712.000000
25%       1397.000000
50%       1924.000000
75%       2541.000000
max      20459.000000
Name: char_count, dtype: float64

The longest essay is about 20K characters (max = 20,459), more than ten times the median length of 1,924 characters.

In [32]:
# Distribution of essay length (characters), one box per score.
px.box(data_frame=train_df, x="char_count", color="score")

In [36]:
# Distribution of essay length (characters), one box per topic label.
px.box(data_frame=train_df, x="char_count", color="topic")

In [33]:
# Essay length in characters for the test split, computed with the vectorized
# string accessor instead of a per-row Python lambda.
test_df["char_count"] = test_df["full_text"].str.len()

# Show the per-essay lengths.
test_df["char_count"]

0    2677
1    1669
2    3077
Name: char_count, dtype: int64

### Score Distribution

In [35]:
# Score counts with the topics drawn as side-by-side bars.
px.histogram(
    data_frame=train_df,
    x="score",
    color="topic",
    barmode="group",
)