In [1]:
# !pip install pandas plotly bertopic spacy

# EDA

In [1]:
import os

os.chdir("../")

import pandas as pd
import plotly.express as px
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
from lib.config import config
from lib.utils.utils import seed_everything
from lib.paths import Paths

In [3]:
seed_everything()

Loading datasets

In [4]:
train_df = pd.read_csv("data/competition-data/train.csv")
test_df = pd.read_csv("data/competition-data/test.csv")

train_df.shape, test_df.shape

((17307, 3), (3, 2))

In [5]:
train_df.columns

Index(['essay_id', 'full_text', 'score'], dtype='object')

In [6]:
train_df["full_text"] = train_df["full_text"].map(lambda x: x.strip())
test_df["full_text"] = test_df["full_text"].map(lambda x: x.strip())

### Topics

Followed [BERTopic best practices](https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html#additional-representations)

In [7]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(train_df["full_text"], show_progress_bar=True)

Batches: 100%|██████████| 541/541 [00:16<00:00, 32.14it/s]


In [8]:
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=config.random_seed,
)

In [9]:
hdbscan_model = HDBSCAN(
    min_cluster_size=150,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

In [10]:
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)

In [11]:
topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    # Hyperparameters
    top_n_words=10,
    verbose=True,
)

# Train model
topics, probs = topic_model.fit_transform(train_df["full_text"], embeddings)

2024-05-02 19:00:56,119 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-02 19:01:20,807 - BERTopic - Dimensionality - Completed ✓
2024-05-02 19:01:20,808 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

In [12]:
for i in topic_model.get_topics().keys():
    print(
        f"Topic {i}: "
        + ",".join(topic_model.get_topic_info(i)["Representation"].tolist()[0])
    )

Topic 0: cars,driverless,car,driverless cars,driver,driving,drive,people,road,technology
Topic 1: students,emotions,technology,facial,computer,student,help,classroom,coding,action
Topic 2: venus,planet,earth,author,surface,dangers,like,study,humans,studying
Topic 3: face,mars,landform,aliens,natural,nasa,just,alien,picture,like
Topic 4: vote,college,president,votes,states,popular,election,state,people,voting
Topic 5: car,cars,people,usage,pollution,car usage,smog,air,driving,limiting
Topic 6: luke,animals,program,join,fun,people,help,time,world,places


In [13]:
topic_model.set_topic_labels({
    0: "Driverless",
    1: "Students",
    2: "Venus",
    3: "Mars",
    4: "Vote",
    5: "Pollution",
    6: "World",
})

In [14]:
topic_model.visualize_topics(custom_labels=True)

In [15]:
topic_model.visualize_hierarchy(custom_labels=True)

In [16]:
topics = topic_model.transform(train_df["full_text"])

Batches: 100%|██████████| 541/541 [00:16<00:00, 33.75it/s]
2024-05-02 19:02:12,419 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-05-02 19:02:12,448 - BERTopic - Dimensionality - Completed ✓
2024-05-02 19:02:12,449 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-05-02 19:02:13,509 - BERTopic - Cluster - Completed ✓


In [17]:
train_df["topic"] = topics[0]
train_df["topic"] = train_df["topic"].map(lambda x: topic_model.get_topic_info(x)["CustomName"][0])

In [18]:
train_df["topic"].value_counts(sort=True)

topic
Driverless    3499
Students      3043
Venus         3017
Mars          2094
Vote          2046
Pollution     1960
World         1648
Name: count, dtype: int64

In [19]:
test_df["topic"] = topic_model.transform(test_df["full_text"])[0]
test_df["topic"] = test_df["topic"].map(lambda x: topic_model.get_topic_info(x)["CustomName"][0])

Batches: 100%|██████████| 1/1 [00:00<00:00, 18.16it/s]
2024-05-02 19:02:43,845 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-05-02 19:02:54,851 - BERTopic - Dimensionality - Completed ✓
2024-05-02 19:02:54,852 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-05-02 19:02:54,852 - BERTopic - Cluster - Completed ✓


### Average Length of Essays

In [20]:
train_df["char_count"] = train_df["full_text"].map(lambda x: len(x))
train_df.loc[:, "char_count"].describe()

count    17307.000000
mean      2050.255504
std        866.264388
min        711.000000
25%       1387.000000
50%       1910.000000
75%       2526.000000
max       8072.000000
Name: char_count, dtype: float64

Max 8K length essays

In [21]:
px.box(
    train_df,
    x="char_count",
    color="score",
)

In [22]:
px.box(
    train_df,
    x="char_count",
    color="topic",
)

In [23]:
test_df["char_count"] = test_df["full_text"].map(lambda x: len(x))
test_df.loc[:, "char_count"]

0    2673
1    1669
2    3068
Name: char_count, dtype: int64

### Score Distribution

In [24]:
px.histogram(train_df, x="score", color="topic", barmode="group")

In [26]:
train_df.to_csv(Paths.TRAIN_CSV_PATH, index=False)
test_df.to_csv(Paths.TEST_CSV_PATH, index=False)