In [None]:
import nltk
import json
import pandas as pd
from tqdm import tqdm
from umap import UMAP
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
 

In [None]:
# Record the interpreter version this notebook is executed with.
import sys
print(sys.version)

In [None]:
import os

# Look up the CONDA_DEFAULT_ENV environment variable; the result is None
# when the variable is not set.
env_name = os.getenv('CONDA_DEFAULT_ENV')
print(f"Active Conda environment: {env_name}")

In [None]:
# Fetch the WordNet corpus; the WordNetLemmatizer used further down relies on it.
nltk.download("wordnet")
# NOTE(review): some NLTK releases may additionally need the "omw-1.4" resource
# for the lemmatizer — re-enable the line below if a LookupError names it (TODO confirm).
# nltk.download("omw-1.4")

In [None]:
# Show the current working directory and its contents. The relative file
# names written later in this notebook resolve against this directory.

import os
print(os.getcwd())  # Get current working directory
print(os.listdir("."))  # List all files in the current directory


In [None]:
# Load the corpus: every line of the text file is treated as one document.
# NOTE(review): the absolute path is machine-specific; point it at your own copy.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (assumes the file is UTF-8 — TODO confirm).
with open("I:/11_DFKI_Hiwi/Work/01_Code/Graphusion/inputs/abstracts.txt", "r", encoding="utf-8") as file:
    # Strip surrounding whitespace (including the trailing newline) from each line
    texts = [line.strip() for line in file]

print(len(texts))


In [None]:
# Preview the first five loaded lines as a quick sanity check
for line in texts[0:5]:
    print(line)

In [None]:
# Assemble the BERTopic extractor from explicitly configured components.

# Dimensionality reduction with a fixed random_state
umap_model = UMAP(
    n_neighbors=20,
    n_components=50,
    min_dist=0.0,
    metric="cosine",
    random_state=37,
)

# Candidate terms are n-grams of 2 to 4 words with English stop words removed
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(2, 4))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False)

# Sentence-embedding backbone and the keyword representation built on top of it
sentence_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
representation_model = KeyBERTInspired()

topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    nr_topics=50,
    calculate_probabilities=False,
    low_memory=True,
    verbose=True,
)

In [None]:
# Fit the topic model on the loaded texts. The first return value is kept
# as `topics`; the second one is discarded.
topics, _ = topic_model.fit_transform(texts)

In [None]:
# Mapping of topic number -> keywords; each keyword entry is unpacked
# below as a (word, value) pair.
all_topics = topic_model.get_topics()

In [None]:
# Flatten the keywords of all topics into one list of candidate concepts.
concepts = []

for topic_num, keywords in all_topics.items():
    # Topic -1 is excluded (BERTopic uses that id for outlier documents)
    if topic_num == -1:
        continue
    # Skip empty keyword strings: they carry no concept, and an empty field
    # in the concept file is read back by pandas as a missing value, which
    # the string-based normalisation later in the notebook cannot handle.
    # (Presumably BERTopic pads short topics with "" — TODO confirm.)
    topic_keywords = [word for word, value in keywords if word]
    concepts.extend(topic_keywords)

In [None]:
# Remove duplicates case-insensitively. The result is sorted because the
# iteration order of a set of strings can differ between interpreter
# sessions, which would make the concept ids written to file irreproducible.
concepts = sorted({keyword.lower() for keyword in concepts})

In [None]:
# Number of unique concepts after de-duplication
len(concepts)

In [None]:
# Persist the concepts as "<id>|<concept>" lines, with ids starting at 1.
# (Despite the .tsv extension the separator is "|".)
# UTF-8 is set explicitly: pandas.read_csv decodes as UTF-8 by default, so
# writing with the platform's locale encoding could make non-ASCII concepts
# unreadable when the file is loaded back.
with open("extracted_concepts.tsv", "w", encoding="utf-8") as f:
    # `concept_id` instead of `id`: the loop variable stays in the notebook
    # namespace and would otherwise shadow the built-in id().
    for concept_id, concept in enumerate(concepts, 1):
        f.write(f"{concept_id}|{concept}\n")

In [None]:
# Load the concepts back from the pipe-delimited file (column 0 = id,
# column 1 = concept). dtype=str together with keep_default_na=False keeps
# every field a plain string; with the defaults pandas converts empty or
# NA-looking fields to float NaN, which breaks the string operations applied
# to the concepts afterwards.
extracted_concepts = pd.read_csv(
    "extracted_concepts.tsv",
    delimiter="|",
    header=None,
    dtype=str,
    keep_default_na=False,
    encoding="utf-8",
)
# Keep only the concept column, as a Python list
extracted_concepts = extracted_concepts[1].tolist()

In [None]:
# Number of concepts read back from extracted_concepts.tsv
len(extracted_concepts)

In [None]:
lemmatizer = WordNetLemmatizer()


def singularize_concept(concept):
    """Lemmatize every whitespace-separated word of `concept` as a noun.

    The lemmatized words are re-joined with single spaces and returned
    as one string.
    """
    return ' '.join(
        lemmatizer.lemmatize(token, wordnet.NOUN) for token in concept.split()
    )

In [None]:
# Normalise each concept in a single pass: noun-lemmatize its words,
# then convert the result to lowercase.
extracted_concept = [
    singularize_concept(raw_concept).lower() for raw_concept in extracted_concepts
]

In [None]:
# Size of the normalised concept list (`extracted_concept`), i.e. the list
# produced by the singularize/lowercase step rather than the raw one
len(extracted_concept)

In [None]:
# Persist the normalised (singularized, lower-cased) concepts as
# "<id>|<concept>" lines, with ids starting at 1. The loop iterates
# `extracted_concept`; iterating the raw `concepts` list here would only
# produce a copy of extracted_concepts.tsv.
# UTF-8 is set explicitly so the output does not depend on the platform locale.
with open("extracted_concepts_upd.tsv", "w", encoding="utf-8") as f:
    # `concept_id` instead of `id` to avoid shadowing the built-in id()
    for concept_id, concept in enumerate(extracted_concept, 1):
        f.write(f"{concept_id}|{concept}\n")

In [None]:
# Build the concept table. The `label` column records the source of a
# concept: 0 = extracted, 1 = gold.
df_old = pd.DataFrame(extracted_concept, columns=["concept"]).assign(label=0)

# Gold concepts are currently not merged in; the disabled steps were:
# df_new = pd.DataFrame(gold_concept, columns=["concept"])
# df_new["label"] = 1
# df = pd.concat([df_old, df_new])
# df = df.sort_values(by="label")

# Order by label, then keep only the first occurrence of each concept
df = df_old.sort_values(by="label").drop_duplicates(subset="concept", keep="first")

In [None]:
# (rows, columns) of the de-duplicated concept table
df.shape

In [None]:
# Select, for a given concept, the abstracts that fuzzily mention it

def filter_abstracts_by_term(term, abstracts, threshold=70):
    """Return the abstracts whose fuzzy partial match with `term` reaches `threshold`.

    Both sides are lower-cased before being compared.

    Args:
        term: Concept string to look for.
        abstracts: Iterable of abstracts; entries that are not `str` are skipped.
        threshold: Minimum `fuzz.partial_ratio` score for an abstract to be kept.

    Returns:
        List of the matching abstracts, unmodified and in their input order.
    """
    # Lower-case the term once instead of once per abstract
    needle = term.lower()
    return [
        abstract
        for abstract in abstracts
        if isinstance(abstract, str)
        and fuzz.partial_ratio(needle, abstract.lower()) >= threshold
    ]

# For every concept row, store its matching abstracts together with the
# row's label, keyed by the concept string.
concept_abstracts = {}
for index, row in tqdm(df.iterrows(), desc="Processing concepts", total=len(df)):
    concept, label = row["concept"], row["label"]
    filtered_abstracts = filter_abstracts_by_term(concept, texts)
    concept_abstracts[concept] = {"abstracts": filtered_abstracts, "label": label}

In [None]:
# Count the concepts whose label equals 0
label_0_count = len(
    [entry for entry in concept_abstracts.values() if entry['label'] == 0]
)
print(f"Number of concepts added through BERTopic: {label_0_count}")

In [None]:
# Count the concepts for which no abstract passed the filter
empty_abstracts_count = len(
    [entry for entry in concept_abstracts.values() if not entry['abstracts']]
)
print(f"Number of concepts with empty filtered_abstracts: {empty_abstracts_count}")

In [None]:
# Serialise the concept -> abstracts mapping as indented JSON; non-ASCII
# characters are written as-is rather than escaped.
output_file_path = "concept_abstracts.json"
with open(output_file_path, mode='w', encoding='utf-8') as json_out:
    json.dump(concept_abstracts, json_out, indent=4, ensure_ascii=False)

In [None]:
# Round-trip check: reload the JSON file and report how many concepts it holds

with open(output_file_path, mode='r', encoding='utf-8') as json_in:
    loaded_concept_abstracts = json.load(json_in)

print(f"Number of concepts in loaded file: {len(loaded_concept_abstracts)}")