In [None]:
!pip install bertopic datasets accelerate bitsandbytes xformers adjustText torchvision -q

# Extraction

In [None]:
import numpy as np
import os
import pandas as pd

In [None]:
# Read the book-genre dataset from the Kaggle input mount into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/book-genre-prediction/data.csv',
)

# Preview the first rows (5 is the pandas default for head()).
df.head(n=5)

In [None]:
# Print a header line followed by the number of missing values in each column.
print("\nNombre de valeurs nulles par colonne :", df.isna().sum(), sep="\n")

In [None]:
# Per-book text used downstream: a "Genre: <genre>" line followed by the summary.
abstracts = "Genre: " + df["genre"].values + "\n" + df["summary"].values

# Raw titles, kept as-is; a later cell derives the display names from them.
titles_base = df["title"].values

# Show both lengths side by side.
(len(titles_base), len(abstracts))

# Pré-traitement

In [None]:
# Append each title's position to it ("<title>_<index>"), which makes every
# entry distinct even when the same title appears more than once.
titles = [f'{book_title}_{position}' for position, book_title in enumerate(titles_base)]

# Model

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token exported as
# HUGGINGFACE_TOKEN; the lookup yields None when the variable is not set.
huggingface_token = os.environ.get('HUGGINGFACE_TOKEN')
login(token=huggingface_token)

In [None]:
from torch import cuda

# Hub identifier of the checkpoint loaded by the following cells.
model_id = 'mistralai/Mistral-7B-Instruct-v0.3'

# Use the currently selected CUDA device when one is available, else the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

print(device)

In [None]:
import transformers
from torch import bfloat16

# bitsandbytes quantization settings handed to the model loader.
bnb_config = transformers.BitsAndBytesConfig(
    bnb_4bit_compute_dtype=bfloat16,  # dtype used for the computations
    bnb_4bit_use_double_quant=True,   # quantize a second time after the first pass
    bnb_4bit_quant_type='nf4',        # "normalized float 4" quantization type
    load_in_4bit=True,                # 4-bit quantization
)

In [None]:
# Tokenizer: reuse the end-of-sequence token as the padding token.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Model: loaded with the quantization settings from bnb_config, with
# device_map='auto' deciding where the weights are placed.
# trust_remote_code is deliberately NOT enabled: it allows code hosted in the
# Hub repository to execute locally, and the Mistral architecture ships with
# transformers itself, so nothing requires it.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map='auto',
)
# Switch to inference mode; as the cell's last expression this also displays the model.
model.eval()

In [None]:
# Text-generation pipeline around the loaded model and tokenizer; it is handed
# to BERTopic's TextGeneration representation further down.
# NOTE(review): `temperature` only has an effect when sampling is enabled
# (do_sample=True) — confirm whether greedy decoding is the intent here.
generator = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    repetition_penalty=1.1,
    temperature=0.1,
)

## Prompt

In [None]:
# System part of the labeling prompt: opens the first [INST] block and states
# the assistant's role between the <<SYS>> tags.
system_prompt = (
    "\n"
    "<s>[INST] <<SYS>>\n"
    "You are a helpful, respectful and honest assistant for labeling topics.\n"
    "<</SYS>>\n"
)

In [None]:
# One-shot example appended after the system prompt: a sample topic (two
# documents plus its keywords), the labeling instruction, and — after the
# closing [/INST] tag — the answer the model is expected to imitate.
example_prompt = """
I have a topic that contains the following documents:

Drowned Wednesday is the first Trustee among the Morrow Days who is on Arthur's side and wishes the Will to be fulfilled. She appears as a leviathan/whale and suffers from Gluttony. The book begins when Leaf is visiting Arthur and they are discussing the invitation that Drowned Wednesday sent him.
Arthur journeys through the Border Sea, escapes pirates, meets anthropomorphic rats, and ultimately faces Feverfew in a deadly duel to rescue his friend Leaf and free the third part of the Will, which is in the form of a Carp trapped inside a worldlet in Wednesday's stomach.
The topic is described by the following keywords: 'fantasy, house, keys, arthur, trustees, border sea, drowned wednesday, feverfew, will, pirates, magic, transformation, quests, gluttony, worldlet'.
Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST] Fantasy Adventure: Arthur's Quest in the Border Sea to Free the Third Part of the Will
"""

In [None]:
# Template for the actual labeling request. The [DOCUMENTS] and [KEYWORDS]
# placeholders are left verbatim here for the consumer of the prompt to fill in.
main_prompt = (
    "\n"
    "[INST]\n"
    "I have a topic that contains the following documents:\n"
    "[DOCUMENTS]\n"
    "\n"
    "The topic is described by the following keywords: '[KEYWORDS]'.\n"
    "\n"
    "Based on the information about the topic above, please create a short label of this topic. "
    "Make sure you to only return the label and nothing more.\n"
    "[/INST]\n"
)

In [None]:
# Full prompt: system part, then the worked example, then the request template.
prompt = "".join((system_prompt, example_prompt, main_prompt))

# BERTopic

In [None]:
from sentence_transformers import SentenceTransformer

# Encode every abstract once up front so the vectors can be reused by the
# later cells instead of being recomputed.
embedding_model = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
)
embeddings = embedding_model.encode(abstracts, show_progress_bar=True)

In [None]:
from hdbscan import HDBSCAN
from umap import UMAP

# Dimensionality reduction passed to BERTopic: 5 components, cosine metric,
# fixed random_state.
umap_model = UMAP(
    n_components=5,
    n_neighbors=15,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Clustering model passed to BERTopic: clusters of at least 50 points,
# euclidean metric, 'eom' cluster selection, prediction data enabled.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    cluster_selection_method='eom',
    metric='euclidean',
    prediction_data=True,
)

In [None]:
# Separate 2-component UMAP projection of the embeddings (same settings as
# umap_model apart from n_components); it is passed to the document plot.
reduced_embeddings = UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings)

In [None]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# Three topic representations, each stored under its own name.
keybert = KeyBERTInspired()                         # KeyBERT-inspired keywords
mmr = MaximalMarginalRelevance(diversity=0.3)       # MMR with diversity 0.3
mistral = TextGeneration(generator, prompt=prompt)  # labels generated by the Mistral pipeline

# Mapping handed to BERTopic; the keys name each representation in the output.
representation_model = dict(
    KeyBERT=keybert,
    Mistral=mistral,
    MMR=mmr,
)

## Training

In [None]:
from bertopic import BERTopic

# Assemble BERTopic from the sub-models configured in the earlier cells.
topic_model = BERTopic(
    # Hyperparameters
    verbose=True,
    top_n_words=10,
    # Sub-models
    representation_model=representation_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    embedding_model=embedding_model,
)

# Fit on the abstracts, passing in the pre-computed embeddings.
topics, probs = topic_model.fit_transform(documents=abstracts, embeddings=embeddings)

In [None]:
# Show topics: display the topic overview returned by the fitted model.
topic_model.get_topic_info()

In [None]:
# Inspect topic 1: request its full set of representations and show the one
# stored under the "KeyBERT" key.
topic_model.get_topic(1, full=True)["KeyBERT"]

In [None]:
# Use the first line of each topic's Mistral output as its custom label.
# The text is stripped BEFORE splitting: the generation can begin with a
# newline or space, in which case split("\n")[0] on the raw text would give
# an empty or space-padded label.
mistral_labels = [
    representation[0][0].strip().split("\n")[0].strip()
    for representation in topic_model.get_topics(full=True)["Mistral"].values()
]
topic_model.set_topic_labels(mistral_labels)

# Visualize

In [None]:
# Document plot: one entry per indexed title, positioned with the pre-computed
# 2-D projection, using the custom topic labels.
visualization = topic_model.visualize_documents(
    titles,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    hide_document_hover=False,
    hide_annotations=True,
)

In [None]:
# Export the figure to an HTML file; the path is relative, so it is created in
# the current working directory.
visualization.write_html("topic_visualization.html")

In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Read the export back from the same relative path it was written to, rather
# than an absolute /kaggle/working path that only matches when that directory
# is the current working directory.
with open("topic_visualization.html", "r", encoding="utf-8") as f:
    html_content = f.read()

# Embed the exported figure inline in the notebook output.
display(HTML(html_content))