In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the gzip-compressed transcripts CSV, print its (rows, columns) shape,
# and preview the first two rows.
transcripts = pd.read_csv("transcripts_dataset_final.csv.gz", compression="gzip")
print(transcripts.shape)
transcripts.head(2)

In [None]:
# Distribution of the avg_confidence column

x = transcripts.avg_confidence

# Draw the 20-bin histogram on an explicit Axes instead of the implicit pyplot state
fig, ax = plt.subplots()
ax.hist(x, bins=20)
ax.set_title('Histogram of Avg Confidence')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Distribution of the word_count column

x = transcripts.word_count

# Draw the 5-bin histogram on an explicit Axes instead of the implicit pyplot state
fig, ax = plt.subplots()
ax.hist(x, bins=5)
ax.set_title('Histogram of word count')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Count the transcripts that pass the "high quality" filter.
# High quality: avg_confidence > 0.85 and word_count > 7500
# NOTE(review): this note previously said "> 10000" while the filter uses 7500 — confirm the intended cut-off.

is_confident = transcripts.avg_confidence > 0.85
is_long = transcripts.word_count > 7500
count = len(transcripts.loc[is_confident & is_long])

print(f"There are {count} instances in the dataframe with both condition1 and condition2.")

In [None]:
# Keep the rows with avg_confidence > 0.85 and word_count > 7500 as an
# independent copy, so later edits to `sample` cannot touch `transcripts`.
high_quality_mask = transcripts["avg_confidence"].gt(0.85) & transcripts["word_count"].gt(7500)
sample = transcripts.loc[high_quality_mask].copy()

print(sample.shape)
sample.head(3)

In [None]:
# Share of the sampled rows contributed by each show_name (fractions, not counts)
sample["show_name"].value_counts(normalize=True)

In [None]:
# sample.to_csv('transcript_sample_mini.csv.gz', compression='gzip')

In [None]:
def tokenize_split(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    return text.split()


# Functions for chunking transcripts on words
def word_chunk_transcript(transcripts, name_variable='transcript', chunk_size=256):
    """Split every transcript into consecutive chunks of at most ``chunk_size`` tokens.

    Parameters
    ----------
    transcripts : pandas.DataFrame
        Must contain an ``episode_id`` column and the text column named by
        ``name_variable``. The frame is only read, never modified.
    name_variable : str
        Name of the column holding the transcript text.
    chunk_size : int
        Maximum number of whitespace-separated tokens per chunk; must be >= 1.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with the columns ``episode_id``,
        ``transcript_subset`` (the chunk's tokens joined by single spaces) and
        ``words_enumerated`` (``"start - end"`` token offsets, end exclusive and
        clamped to the transcript's token count).

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    chunk_rows = []
    # Iterate over the two needed columns directly: cheaper than iterrows and
    # avoids writing a temporary token column into the caller's DataFrame.
    for episode_id, text in zip(transcripts["episode_id"], transcripts[name_variable]):
        tokens = tokenize_split(text)
        for start in range(0, len(tokens), chunk_size):
            # Clamp so the label of a short final chunk reflects its real extent.
            end = min(start + chunk_size, len(tokens))
            chunk_rows.append((episode_id, " ".join(tokens[start:end]), f"{start} - {end}"))

    word_chunked_df = pd.DataFrame(
        chunk_rows,
        columns=['episode_id', 'transcript_subset', 'words_enumerated'],
    )
    return word_chunked_df

In [None]:
# Extract columns for conversion
# Only the two columns that word_chunk_transcript reads are kept:
# the episode key and the transcript text.
cols_subset = sample.loc[: ,["episode_id", "transcript"]]

In [None]:
# Chunk every transcript using the function defaults
# (name_variable='transcript', chunk_size=256 tokens).
sample_chunk = word_chunk_transcript(cols_subset)

In [None]:
# Preview the first rows of the chunked frame.
sample_chunk.head()

In [None]:
# Collect the chunk texts into a plain Python list (one entry per chunk)
# and print how many chunks there are.
docs = sample_chunk.transcript_subset.to_list()
print(len(docs))

## BERTopic

In [None]:
!pip3 install umap-learn

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA, KernelPCA

In [None]:
# Sentence-embedding model used to encode the transcript chunks.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Dimensionality reduction applied before clustering.
# random_state is fixed because UMAP is stochastic: without a seed every run
# yields different reduced embeddings and therefore different topics.
umap_model = UMAP(
    n_neighbors=15, 
    n_components=5, 
    min_dist=0.0, 
    metric='cosine',
    random_state=42)

hdbscan_model = HDBSCAN(
    # Original note: "Limit at 400 clusters".
    # NOTE(review): min_cluster_size is the smallest group that may form a
    # cluster, not a cap on the number of clusters — confirm what was intended.
    min_cluster_size = 10,
    # Original note: "same as cosine for normalised data".
    # NOTE(review): inside BERTopic, HDBSCAN is fitted on the UMAP-reduced
    # vectors — confirm the normalisation assumption holds for them.
    metric='euclidean',
    cluster_selection_method='eom', 
    prediction_data=True)

# Bag-of-words settings for the topic representations: English stop words
# removed, 1- to 3-grams, terms below the min_df threshold dropped.
vectorizer_model = CountVectorizer(min_df=10, stop_words='english', ngram_range=(1,3))

# c-TF-IDF weighting with the option that down-weights very frequent words.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Re-rank each topic's keywords for diversity (0 = none, 1 = maximum).
representation_model = MaximalMarginalRelevance(diversity=0.7)

In [None]:
# Encode the chunks once and cache the result on disk. A later cell reloads
# 'embeddings_mini.npy', so the file has to exist after this cell has run.
try:
    embeddings_mini = np.load('embeddings_mini.npy')
except FileNotFoundError:
    embeddings_mini = None

# Recompute when there is no cache, or when the cache was built for a different
# number of chunks (a cheap guard against reusing stale embeddings).
if embeddings_mini is None or len(embeddings_mini) != len(docs):
    embeddings_mini = sentence_model.encode(docs, convert_to_numpy=True, show_progress_bar=True)
    np.save('embeddings_mini.npy', embeddings_mini)

In [None]:
# np.save('embeddings_mini.npy', embeddings_mini)

### Run model

In [None]:
# Reload the embeddings from disk. This replaces any in-memory embeddings_mini
# and requires 'embeddings_mini.npy' to exist in the working directory.
embeddings_mini = np.load('embeddings_mini.npy')

In [None]:
# Assemble the BERTopic pipeline from the sub-models configured earlier.
# The embedding model is left commented out; the document embeddings are
# supplied precomputed to fit_transform below.
topic_model = BERTopic(
    # embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    low_memory = True,
    calculate_probabilities=True, 
    verbose=True  # progress bar
)

# Fit on the chunk texts with their precomputed embeddings; returns the topic
# assigned to each document and the associated probabilities.
topics, probs = topic_model.fit_transform(docs, embeddings_mini)

# Save model
# NOTE(review): no serialization format is given, so the library default is
# used — presumably pickle-based; confirm this is the intended format.
topic_model.save("BERT_mini")

In [None]:
# Reload the saved model from "BERT_mini", replacing the in-memory instance.
# NOTE(review): if the file is pickle-based (see how it was saved), only load
# model files from a trusted source.
topic_model = BERTopic.load("BERT_mini")

In [None]:
# Summary table of the fitted topics.
topic_model.get_topic_info()

In [None]:
# Bar charts of the topic keywords, limited to 10 topics.
topic_model.visualize_barchart(top_n_topics=10)

### Segmentation

In [None]:
# Pull the topic assignments and probabilities stored on the (re)loaded model.
topics = topic_model.topics_
proba = topic_model.probabilities_

In [None]:
# One pseudo-timestamp per chunk: its row position in sample_chunk.
list_of_int = list(range(len(sample_chunk)))
# Preview only — displaying the full list would print one entry per chunk.
list_of_int[:10]

In [None]:
# Track topic representations along the chunk positions, used here as timestamps.
# NOTE(review): every chunk has its own unique timestamp, so a representation is
# computed per chunk position; consider the nr_bins argument to group them if
# this is slow — confirm against the BERTopic documentation.
topics_over_time = topic_model.topics_over_time(docs, list_of_int)

In [None]:
# Plot how the topics evolve across the chunk positions computed above.
topic_model.visualize_topics_over_time(topics_over_time)

### Fine-tuning

In [None]:
# Reduce outliers
# Reassign documents using the 'distributions' strategy; the result is a new
# list of topic assignments and is not applied to the model by this call alone.
new_topics = topic_model.reduce_outliers(docs, topics, strategy='distributions')


In [None]:
# Recompute the topic representations for the outlier-reduced assignments.
# `topics` is passed by keyword because the second positional parameter of
# update_topics is not the topic list in every BERTopic release.
# The sub-models are passed again explicitly: update_topics otherwise rebuilds
# the representation with default components, losing the stop-word removal,
# n-gram range, c-TF-IDF option and MMR re-ranking configured for this model.
topic_model.update_topics(
    docs,
    topics=new_topics,
    top_n_words=10,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

In [None]:
# Summary table of the topics after the representation update.
topic_model.get_topic_info()

In [None]:
# Bar charts of the topic keywords after the update, with default settings.
topic_model.visualize_barchart()