In [None]:
from google.colab import drive
import pandas as pd

In [None]:
# Mount Google Drive so the BERT-ready data stored there can be read from Colab
gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Git setup: configure the identity, then commit and push notebook changes

In [None]:
# Credentials helper imports are kept local to this cell so it runs on its own.
import os
from getpass import getpass

# Coordinates of the GitHub repository this notebook commits to.
username = "TechLabs-Berlin"
repository = "ss23-drop-in-to-berlin"

# Never store the access token as a literal: this notebook is itself committed
# to the repository. Read it from the GITHUB_TOKEN environment variable, or
# prompt for it (getpass does not echo the input) when the variable is unset.
git_token = os.environ.get("GITHUB_TOKEN") or getpass("GitHub token: ")

In [None]:
# Set the global git identity (author e-mail and name) used for commits made from this session
!git config --global user.email "phylanx@gmx.de"
!git config --global user.name "phylanxy"

In [None]:
# Switch to the `recommender` branch before committing.
# NOTE(review): git needs the repository as working directory, but the `cd` cell
# only appears further down in the notebook — confirm the intended execution order.
!git checkout recommender

M	combined keyBERTopic extraction.ipynb
M	keyword extraction.ipynb
Already on 'recommender'
Your branch is up to date with 'origin/recommender'.


In [None]:
# Point `origin` at an HTTPS URL that embeds the token, built from git_token/username/repository.
# NOTE(review): this stores the token in the repository's git configuration — confirm that is acceptable.
!git remote set-url origin https://{git_token}@github.com/{username}/{repository}

In [None]:
# List the local branches; the checked-out one is marked with `*`
!git branch

  main[m
* [32mrecommender[m


In [None]:
# Stage all changes in the current directory
!git add .

In [None]:
# Commit the staged changes with a fixed message
!git commit -m "combined keyBERTopic extraction"

In [None]:
# Push the current branch to its remote
!git push

In [None]:
cd /content/gdrive/MyDrive/TechLabs/{repository}

/content/gdrive/MyDrive/TechLabs/ss23-drop-in-to-berlin


# install needed packages

In [None]:
!pip install bertopic
!pip install transformers
!pip install keybert
!pip install keyphrase-vectorizers



# first build BERTopic and then build keyBERT

In [None]:
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

## build list of input documents (reviews)

In [None]:
# Load the table that holds the model input text (column "model_input") per restaurant
model_input_csv = "/content/gdrive/MyDrive/Ironhack/Final_project/df_with_model_input_no_NaNs.csv"
df = pd.read_csv(model_input_csv)

In [None]:
def preprocess_input(string, lst_to_remove):
    """Strip every occurrence of the given fragments from a text.

    The fragments are removed one after another, in list order, so an earlier
    removal can change what a later fragment still matches.

    Args:
        string: The text to clean.
        lst_to_remove: Substrings to delete from the text.

    Returns:
        The cleaned text as a ``str``.
    """
    cleaned = string
    for unwanted in lst_to_remove:
        cleaned = cleaned.replace(unwanted, "")
    return str(cleaned)

In [None]:
# Fragments that carry no meaning for the topic model: the field labels of the
# serialized reviews plus a few generic words.
# "meals" is listed before "meal" because the fragments are removed in order;
# removing "meal" first would turn every "meals" into a stray "s".
remove_lst = ["'review0': ","'review1': ","'review2': ","'review3': ","'review4': ", "'editorial_summary':", "restaurant", "place", "dinner", "meals", "meal"]


def _review_document(model_input, name):
    """Return the cleaned model input, or the restaurant name when there is no text.

    Args:
        model_input: Value of the "model_input" column for one row.
        name: Value of the "name" column for the same row.

    Returns:
        The document (``str``) to feed to the models for this row.
    """
    if pd.isna(model_input) or not str(model_input).strip():
        return str(name)
    return preprocess_input(str(model_input), remove_lst)


# One document per row. A row without text falls back to the name of that same
# row, so every entry of the list is a plain string (never a whole column).
BERTs = [_review_document(text, name) for text, name in zip(df["model_input"], df["name"])]

# build BERTopic model based on the reviews

In [None]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from sklearn.cluster import KMeans

In [None]:
# Stop words for the topic vectorizer: common English function words followed by
# a handful of domain words (good, nice, food, restaurant, place, ...).
# Kept as one comma-separated string; the backslashes join the source lines.
lst = "i, me, my, myself, we, us, our, ours, ourselves, you, you're, you've, you'll, \
you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, \
herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, \
whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, \
has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, \
while, of, at, by, for, with, about, against, between, into, through, during, before, after, \
above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, \
there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, \
only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, \
d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, \
hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mustn't, needn, \
needn't, shan, shan't, shouldn, shouldn't, was, wasn, wasn't, weren, weren't, won, won't, wouldn, wouldn't,\
good, nice, food, restaurant, nice, good, place, really, also, the, and, to, was, restaurant, place"

# Drop every blank, then cut the string at the commas to get the word list
stop_words_lst = "".join(lst.split(" ")).split(",")

In [None]:
# Define the building blocks of the BERTopic pipeline and assemble the model.

# UMAP is stochastic; a fixed seed makes the resulting topics reproducible
# between runs of the notebook.
UMAP_RANDOM_STATE = 42

#0 optional: diversify the topic words (disabled; KeyBERTInspired is used in step 6)
#representation_model = MaximalMarginalRelevance(diversity=0.6)

#1 embedding model - every document is encoded once, up front
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(BERTs, show_progress_bar=True)

#2 dimensionality reduction model
# n_components represents the level of dimensionality after reduction of dimensions with UMAP
# -> the higher this number, the more dimensions. Very high dimensionality will mess with hdbscan later
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False,
                  random_state=UMAP_RANDOM_STATE)

#3 clustering model (a KMeans alternative is kept for reference)
#hdbscan_model = KMeans(n_clusters=35)
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

#4 vectorizer model - transforming clusters into vector space (uni- and bigrams, custom stop words)
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words_lst)

#5 c-TF-IDF creating the topics from the vectorized clusters
ctfidf_model = ClassTfidfTransformer() #bm25_weighting=True, reduce_frequent_words=True)

#6 let keyBERT create the topic labels
representation_model = KeyBERTInspired()

# We reduce our embeddings to 2D as it will allows us to quickly iterate later on
#reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# assemble whole model
# NOTE: per the BERTopic documentation, min_topic_size is not used when a custom
# hdbscan_model is supplied; the effective minimum is min_cluster_size in step 3.
topic_model = BERTopic(embedding_model=sentence_model,
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       vectorizer_model=vectorizer_model,
                       representation_model=representation_model,
                       ctfidf_model=ctfidf_model,
                       min_topic_size=5,
                       verbose=True,
                       #calculate_probabilities=True
                       )

Batches:   0%|          | 0/135 [00:00<?, ?it/s]

In [None]:
from joblib import dump, load

# Fit the model and assign every document to a topic. The documents were already
# encoded with the same sentence model when the pipeline was defined, so the
# precomputed embeddings are passed in instead of encoding everything again.
topics, probs = topic_model.fit_transform(BERTs, embeddings=embeddings)

# Persist the fitted model. joblib serializes with pickle under the hood, so the
# file is no safer than a pickle: only load it from a trusted source.
dump(topic_model, '../MiniLM_eom_keyBERT_vectorized_UMAP_hdbscan_ctfidf_GPU.joblib')

Batches:   0%|          | 0/135 [00:00<?, ?it/s]

2023-09-06 17:34:18,300 - BERTopic - Transformed documents to Embeddings
2023-09-06 17:34:30,761 - BERTopic - Reduced dimensionality
2023-09-06 17:34:30,915 - BERTopic - Clustered reduced embeddings


['../MiniLM_eom_keyBERT_vectorized_UMAP_hdbscan_ctfidf_GPU.joblib']

In [None]:
# Display what the fitted model holds for topic 1
topic_model.get_topic(1)

In [None]:
# Display the overview table of all topics
topic_model.get_topic_info()

In [None]:
# Distinct words over all entries of the topic table's "Representation" column
keywords = list(set().union(*topic_model.get_topic_info()["Representation"]))

In [None]:
# Display the collected topic words
keywords

# Code snippets (scratch area)

In [None]:
# The cleaned review texts are the documents for keyword extraction
docs = BERTs

# Candidate phrases: zero or one adjective followed by one or more nouns
vectorizer = KeyphraseCountVectorizer(stop_words="english", pos_pattern='<J.*>{0,1}<N.*>+')

# Let KeyBERT pick the 6 best candidates per document
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(docs, top_n=6, vectorizer=vectorizer)

# Vocabulary: the distinct first elements (the phrases) of all extracted entries
vocabulary = list({candidate[0] for doc_keywords in keywords for candidate in doc_keywords})

2023-09-06 17:42:46,746 - KeyphraseVectorizer - INFO - It looks like you do not have downloaded a list of stopwords yet. It is attempted to download the stopwords now.
INFO:KeyphraseVectorizer:It looks like you do not have downloaded a list of stopwords yet. It is attempted to download the stopwords now.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Write the string form of `keywords` to a text file in the working directory.
# NOTE(review): `keywords` is assigned in two different cells above (topic words,
# then KeyBERT output) — confirm which one this file is meant to hold.
keywords_text = str(keywords)
with open("unique_keywords.txt", "w") as keyword_file:
    keyword_file.write(keywords_text)

In [None]:
# Display the current content of `keywords`
keywords

In [None]:
# build BERTopic model on top of keywords

# The vocabulary holds multi-word keyphrases, but CountVectorizer only counts
# single words by default (ngram_range=(1, 1)), so such phrases are never
# counted. Widen the n-gram range to the longest phrase in the vocabulary.
longest_phrase = max((len(phrase.split()) for phrase in vocabulary), default=1)
phrase_ngram_range = (1, longest_phrase)

# Drop phrases the vectorizer still never finds in any document. A term with a
# document frequency of zero makes the c-TF-IDF step divide by zero, which is
# presumably what caused the ValueError recorded for this cell.
probe_vectorizer = CountVectorizer(vocabulary=vocabulary, ngram_range=phrase_ngram_range)
document_frequency = probe_vectorizer.transform(docs).getnnz(axis=0)
matched_vocabulary = [phrase for phrase, n_docs in zip(vocabulary, document_frequency) if n_docs > 0]

vectorizer_model = CountVectorizer(vocabulary=matched_vocabulary, ngram_range=phrase_ngram_range)
keyBERT_topic_model = BERTopic(vectorizer_model=vectorizer_model)
topics, probs = keyBERT_topic_model.fit_transform(docs)

2023-09-06 18:17:32,465 - BERTopic - Transformed documents to Embeddings
2023-09-06 18:17:45,790 - BERTopic - Reduced dimensionality
2023-09-06 18:17:45,940 - BERTopic - Clustered reduced embeddings
  idf = np.log((avg_nr_samples / df)+1)


ValueError: ignored

In [None]:
# Persist the keyphrase-based model (keyBERT_topic_model), not the first topic
# model. joblib serializes with pickle under the hood, so the file is no safer
# than a pickle: only load it from a trusted source.
dump(keyBERT_topic_model, '../keyBERTopic_minimal_GPU.joblib')