In [None]:
import json
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from gensim import corpora, models
import gensim
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.tokenize import word_tokenize

print("___imported___")

# Read JSON

In [None]:
# with open('fpmsdb.json', 'r', encoding='utf-8') as f:
#     data = json.load(f)
    
with open('fpmsdoecedb.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    
# print(data['features'][4115]['properties']['title'])

research_titles = []
for i in range(0, 565):
#     research_titles.append(data['features'][i]['properties']['title'])
    research_titles.append(data[i]['title'])

print(research_titles)

# Preprocessing & NLP

In [None]:
# download necessary resources from NLTK
# nltk.download('stopwords')
# nltk.download('wordnet')

# define stop words and lemmatizer
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

# preprocess function for text data
def preprocess(text):
    # tokenize the text and remove stop words and punctuation
    tokens = [word for word in nltk.word_tokenize(text.lower()) if word not in stop and word not in exclude]
    
    # extract collocations and add them to the token list
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(2)  # only consider bigrams that occur at least twice
    collocations = finder.nbest(bigram_measures.pmi, 10)  # extract the top 10 collocations
    for collocation in collocations:
        if collocation[0] in tokens and collocation[1] in tokens:
            tokens.append('_'.join(collocation))
    
    # lemmatize the tokens
    normalized = " ".join(lemma.lemmatize(word) for word in tokens)
    return normalized

# preprocess the titles
titles_preprocessed = [preprocess(title) for title in research_titles]

# create a dictionary and corpus
dictionary = corpora.Dictionary([doc.split() for doc in titles_preprocessed])
corpus = [dictionary.doc2bow(doc.split()) for doc in titles_preprocessed]

# build the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=20,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto', # higher value => more concentrated topics
                                            per_word_topics=True)

# print the top 10 topics and their most significant words
topics = lda_model.show_topics(num_topics=20, num_words=3, formatted=False)
for topic in topics:
    print("Topic {}: {}".format(topic[0], ", ".join([word[0] for word in topic[1]])))

# Highest Probability Index

In [None]:
# categorize the research titles based on their highest probability topic
for i, title in enumerate(titles_preprocessed):
    bow = dictionary.doc2bow(preprocess(title).split())
    topic_probs = lda_model.get_document_topics(bow)
    topic_probs_sorted = sorted(topic_probs, key=lambda x: x[1], reverse=True)
    topic_num = topic_probs_sorted[0][0]
    print(research_titles[i])
    print("{} - Topic {}: {}".format(i+1, topic_num, ", ".join([word[0] for word in topics[topic_num][1]])))

In [62]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# preprocess the titles
titles_preprocessed = [preprocess(title) for title in research_titles]

# create a document-term matrix using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(titles_preprocessed)

# categorize the titles into clusters using the LDA model
corpus_lda = lda_model[corpus]
doc_lda = [max(prob, key=lambda y: y[1]) for prob in corpus_lda]
doc_lda = np.array(doc_lda)
clusters = DBSCAN(eps=0.7, min_samples=2).fit_predict(doc_lda[:, 1].reshape(-1, 1))

# print the research titles with their cluster name
for i, title in enumerate(research_titles):
    print("Title: {}\nCluster: {}\n".format(title, clusters[i]))


TypeError: '>' not supported between instances of 'tuple' and 'int'

# DBSCAN

In [None]:
# extract topic probabilities for each document
doc_topic_probs = []
for doc in corpus:
    topic_probs = lda_model.get_document_topics(doc)
    topic_probs_dict = {topic_num: prob for topic_num, prob in topic_probs}
    doc_topic_probs.append(topic_probs_dict)

# convert the topic probabilities to a numpy array
doc_topic_probs_array = np.array(doc_topic_probs)

# use DBSCAN to cluster the documents based on their topic probabilities
dbscan = DBSCAN(eps=0.15, min_samples=5)
dbscan.fit(doc_topic_probs_array)

# print the clusters
for i, label in enumerate(dbscan.labels_):
    print("{} - Cluster {}".format(i+1, label))
    print(research_titles[i])
    print()