In [None]:
import nltk
import json
import pandas as pd
from tqdm import tqdm
from umap import UMAP
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
 

In [None]:
# Report the Python interpreter version used by this kernel.
import sys
print(sys.version)

In [None]:
import os

# Look up the CONDA_DEFAULT_ENV environment variable; the lookup yields None
# when the variable is not set.
env_name = os.getenv('CONDA_DEFAULT_ENV')
print(f"Active Conda environment: {env_name}")

In [None]:
# Fetch the "wordnet" resource through NLTK's downloader.
nltk.download("wordnet")
# NOTE(review): the "omw-1.4" download below is disabled; some NLTK releases may
# need it for WordNet lookups — confirm before relying on the lemmatizer.
# nltk.download("omw-1.4")

In [None]:
# Show where the kernel is running and what that directory contains.

import os

current_dir = os.getcwd()  # current working directory
print(current_dir)

directory_entries = os.listdir(".")  # all entries in the current directory
print(directory_entries)


In [None]:
# Location of the input corpus; each line of the file is treated as one document.
ABSTRACTS_PATH = "I:/11_DFKI_Hiwi/Work/01_Code/Graphusion/inputs/abstracts.txt"

# Decode explicitly instead of relying on the platform's locale encoding, which
# differs between machines (e.g. cp1252 on Windows) and can corrupt non-ASCII text.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the data source.
with open(ABSTRACTS_PATH, "r", encoding="utf-8") as abstracts_file:
    # Iterate the file line by line and drop surrounding whitespace (including
    # the trailing newline) from every line.
    texts = [line.strip() for line in abstracts_file]

print(len(texts))


In [None]:
# Preview the first five loaded lines, one per output line.
for preview in texts[:5]:
    print(preview)

In [None]:
# Assemble the BERTopic extractor from explicitly configured components.

# UMAP projection: 50 output components, cosine metric, fixed random_state.
umap_model = UMAP(
    n_neighbors=20,
    n_components=50,
    min_dist=0.0,
    metric="cosine",
    random_state=37,
)

# Bag-of-n-grams over 2- to 4-word phrases with the built-in English stop-word list.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(2, 4))

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False)

# Sentence-embedding model handed to BERTopic as its embedding backend.
sentence_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

representation_model = KeyBERTInspired()

topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    nr_topics=50,
    calculate_probabilities=False,
    low_memory=True,
    verbose=True,
)

In [None]:
# Fit the topic model on the loaded texts; only the first return value is kept,
# the second one is discarded.
topics, _ = topic_model.fit_transform(documents=texts)

In [None]:
# Retrieve the fitted model's topics; the following cell iterates this as a mapping
# from topic number to (word, value) pairs.
all_topics = topic_model.get_topics()

In [None]:
# Flatten the keyword phrases of all topics into a single list.
concepts = []

for topic_num, keywords in all_topics.items():
    # Topic -1 is excluded from the concept list.
    if topic_num == -1:
        continue
    # Keep only the phrase from each (phrase, value) pair.
    concepts.extend(word for word, _score in keywords)

In [None]:
# Remove duplicates case-insensitively while keeping first-seen order.
# list(set(...)) would return the strings in an order that changes between kernel
# sessions (string hashing is randomised), so the ids assigned to concepts when
# they are enumerated would not be reproducible. dict.fromkeys de-duplicates and
# preserves insertion order.
concepts = list(dict.fromkeys(keyword.lower() for keyword in concepts))

In [None]:
# Display how many unique concepts remain.
len(concepts)

In [None]:
# Persist the concepts as "<id>|<concept>" lines, with ids starting at 1.
# NOTE: despite the .tsv extension the field separator is "|"; the cell that reads
# this file back parses it with that delimiter, so it is kept as is.
# The encoding is set explicitly so the file is written as UTF-8 on every platform,
# matching pandas' default when the file is read back.
with open("extracted_concepts.tsv", "w", encoding="utf-8") as f:
    # `concept_id` rather than `id`: a top-level loop variable named `id` would
    # shadow the builtin for the rest of the kernel session.
    for concept_id, concept in enumerate(concepts, 1):
        f.write(f"{concept_id}|{concept}\n")

In [None]:
# Read the "<id>|<concept>" file back and keep the concept column as a plain list.
# dtype=str together with keep_default_na=False stops pandas from turning phrases
# such as "null", "nan" or "n/a" into float NaN, so every entry stays a string.
concepts_table = pd.read_csv(
    "extracted_concepts.tsv",
    sep="|",
    header=None,
    dtype=str,
    keep_default_na=False,
    encoding="utf-8",
)
# With header=None the columns are numbered; column 1 holds the concept text.
extracted_concepts = concepts_table[1].tolist()

In [None]:
# Display how many concepts were read back from the file.
len(extracted_concepts)