In [25]:
import json
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from gensim import corpora, models
import gensim
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# Read JSON

In [None]:
with open('fpmsdb.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    
# print(data['features'][4115]['properties']['title'])

research_titles = []
for i in range(0, 4115):
    research_titles.append(data['features'][i]['properties']['title'])

print(research_titles)

In [29]:
# download necessary resources from NLTK
# nltk.download('stopwords')
# nltk.download('wordnet')

# define stop words and lemmatizer
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

# preprocess function for text data
def preprocess(text):
    # tokenize the text and remove stop words and punctuation
    tokens = [word for word in nltk.word_tokenize(text.lower()) if word not in stop and word not in exclude]
    
    # extract collocations and add them to the token list
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(2)  # only consider bigrams that occur at least twice
    collocations = finder.nbest(bigram_measures.pmi, 10)  # extract the top 10 collocations
    for collocation in collocations:
        if collocation[0] in tokens and collocation[1] in tokens:
            tokens.append('_'.join(collocation))
    
    # lemmatize the tokens
    normalized = " ".join(lemma.lemmatize(word) for word in tokens)
    return normalized

# preprocess the titles
titles_preprocessed = [preprocess(title) for title in research_titles]

# create a dictionary and corpus
dictionary = corpora.Dictionary([doc.split() for doc in titles_preprocessed])
corpus = [dictionary.doc2bow(doc.split()) for doc in titles_preprocessed]

# build the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=20,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto', # higher value => more concentrated topics
                                            per_word_topics=True)

# print the top 10 topics and their most significant words
topics = lda_model.show_topics(num_topics=20, num_words=3, formatted=False)
for topic in topics:
    print("Topic {}: {}".format(topic[0], ", ".join([word[0] for word in topic[1]])))
    
# categorize the research titles based on their highest probability topic
# for i, title in enumerate(titles_preprocessed):
#     bow = dictionary.doc2bow(preprocess(title).split())
#     topic_probs = lda_model.get_document_topics(bow)
#     topic_probs_sorted = sorted(topic_probs, key=lambda x: x[1], reverse=True)
#     topic_num = topic_probs_sorted[0][0]
#     print("{} - Topic {}: {}".format(i+1, topic_num, ", ".join([word[0] for word in topics[topic_num][1]])))


Topic 0: detection, cnn, classification
Topic 1: district, improvement, behavior
Topic 2: learning, deep, algorithm
Topic 3: model, web, prediction
Topic 4: assessment, satellite, smart
Topic 5: local, status, glacier
Topic 6: nepal, kathmandu, valley
Topic 7: system, communication, time
Topic 8: new, data, road
Topic 9: recognition, plant, disease
Topic 10: building, design, climate
Topic 11: analysis, case, potential
Topic 12: performance, development, thermal
Topic 13: review, application, different
Topic 14: using, estimation, vehicle
Topic 15: network, impact, neural
Topic 16: energy, management, security
Topic 17: urban, towards, ’
Topic 18: study, challenge, perspective
Topic 19: based, approach, related
