# Poincaré Project: IMDB Embeddings

In [18]:
! pip install -q datasets nltk bertopic

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.8/90.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m39.0 MB/s[0m eta [3

In [19]:
import pandas as pd
import datasets as ds
from bs4 import BeautifulSoup
import re
from collections import Counter
from bertopic import BERTopic
from umap import UMAP

\# Todo Write Notebook Goal

## Prepare Data

### Get IMDB dataset

In [10]:
train_split_name = ds.get_dataset_split_names('imdb')[0]
train_dataset = ds.load_dataset('imdb', split=train_split_name)
train_data_frame = train_dataset.to_pandas()

In [12]:
"""
Functions from https://github.com/Bast-94/NLP-2023/blob/main/Non-Deep/scripts/data.py
"""
def clean_html(text: str) -> str:
    """
    Removes HTML tags from the given text.
    Args:
        text (str): Text with html tags.
    Returns:
        str: Text from all html tags.
    """
    no_html = BeautifulSoup(text, "html.parser").get_text()
    return no_html

def text_processing(text: str) -> str:
  """
  Pre-processes the given text.
  Args:
      text (str): Text to process
  Returns:
      str: Processed text
  """
  result_text = text
  result_text = clean_html(result_text)
  result_text = result_text.lower()
  pattern = r"(?<![a-zA-Z])[^\w\s]|[^\w\s](?![a-zA-Z])"
  result_text = re.sub(pattern, "", result_text)
  result_text = result_text.strip()
  return re.sub("(\s+)", " ", result_text)

In [14]:
train_data_frame['text'] = train_data_frame['text'].apply(text_processing)

  no_html = BeautifulSoup(text, "html.parser").get_text()


In [15]:
train_data_frame.head()

Unnamed: 0,text,label
0,i rented i am curious-yellow from my video sto...,0
1,i am curious yellow is a risible and pretentio...,0
2,if only to avoid making this type of film in t...,0
3,this film was probably inspired by godard's ma...,0
4,oh brotherafter hearing about this ridiculous ...,0


In [17]:
"""
Function from https://github.com/Bast-94/NLP-2023/blob/main/Non-Deep/scripts/naive_bayes/from_scratch.py
"""
def tokenize(text: str)-> list:
    """
    Splits the given text into tokens.
    Args:
        text (str): Text to tokenize (pre-processed)
    Returns:
        list: List of tokens
    """
    return [w for w in re.split("\W+", text)]
def build_vocabulary(texts_serie: pd.Series) -> Counter:
    """
    Builds the vocabulary of the given texts serie.
    Args:
        text_serie (pd.Series): Text serie
    Returns:
        Counter: Vocabulary
    """
    vocabulary: Counter = None # Use Counter as a dictionary with word occurrences
    for text in texts_serie:
        word_list: list[str] = tokenize(text=text)
        if vocabulary is None:
            vocabulary = Counter(word_list)
        else:
            vocabulary.update(word_list)
    return vocabulary

In [25]:
get_n_words = lambda text: len(text.split(' '))

train_data_frame['text'].apply(get_n_words).describe()

count    25000.000000
mean       228.776840
std        169.926767
min         10.000000
25%        125.000000
50%        171.000000
75%        278.000000
max       2450.000000
Name: text, dtype: float64

In [28]:
train_docs = train_data_frame['text'][:2500]
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True, embedding_model="all-MiniLM-L6-v2")
topics, probs = topic_model.fit_transform(train_docs)

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

2023-11-15 09:23:32,854 - BERTopic - Transformed documents to Embeddings
2023-11-15 09:23:55,015 - BERTopic - Reduced dimensionality
2023-11-15 09:23:55,609 - BERTopic - Clustered reduced embeddings


In [33]:
embeddings = []
for topic in topic_model.get_topics().values():
    topic_embeddings = []
    for i in range(len(topic) - 1):
        for j in range(i + 1, len(topic)):
            topic_embeddings.append([topic[j][0], topic[i][0]])
    embeddings.append(topic_embeddings)
print(embeddings[5])

[['of', 'the'], ['western', 'the'], ['and', 'the'], ['in', 'the'], ['to', 'the'], ['was', 'the'], ['as', 'the'], ['is', 'the'], ['that', 'the'], ['western', 'of'], ['and', 'of'], ['in', 'of'], ['to', 'of'], ['was', 'of'], ['as', 'of'], ['is', 'of'], ['that', 'of'], ['and', 'western'], ['in', 'western'], ['to', 'western'], ['was', 'western'], ['as', 'western'], ['is', 'western'], ['that', 'western'], ['in', 'and'], ['to', 'and'], ['was', 'and'], ['as', 'and'], ['is', 'and'], ['that', 'and'], ['to', 'in'], ['was', 'in'], ['as', 'in'], ['is', 'in'], ['that', 'in'], ['was', 'to'], ['as', 'to'], ['is', 'to'], ['that', 'to'], ['as', 'was'], ['is', 'was'], ['that', 'was'], ['is', 'as'], ['that', 'as'], ['that', 'is']]
