In [6]:
# Import libraries
import pickle
from gensim import corpora, models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [7]:
# Define the file path
pickle_file = 'processed_data.pkl'

In [8]:
# Read the processed data from the Pickle file
with open(pickle_file, 'rb') as file:
    processed_data = pickle.load(file)

In [9]:
# Retrieve data
title = processed_data['title']
abstract = processed_data['abstract']

In [11]:
title

0                               [reconstruct, effect, map]
1                  [rotation, invariance, neural, network]
2        [spherical, polyharmonics, poisson, kernels, p...
3        [finite, element, approximation, stochastic, m...
4        [comparative, study, discrete, wavelet, transf...
                               ...                        
20967    [contemporary, machine, learn, guide, practiti...
20968    [uniform, diamond, coat, hard, alloy, cut, ins...
20969         [analyse, soccer, game, cluster, conceptors]
20970    [efficient, simulation, sum, correlate, variates]
20971                 [optional, stop, problem, bayesians]
Name: title, Length: 20972, dtype: object

In [12]:
abstract

0        [predictive, model, allow, inference, analyze,...
1        [rotation, invariance, translation, invariance...
2        [introduce, develop, notion, spherical, polyha...
3        [stochastic, landau, lifshitz, gilbert, llg, e...
4        [ftir, spectra, sample, 7, plant, species, use...
                               ...                        
20967    [machine, learn, find, increasingly, broad, ap...
20968    [polycrystalline, diamond, coat, grow, cement,...
20969    [present, new, approach, identify, situations,...
20970    [sum, variates, encounter, many, challenge, ap...
20971    [recently, optional, stop, subject, debate, ba...
Name: abstract, Length: 20972, dtype: object

In [13]:
# Concatenate titles and abstracts
documents = [" ".join(t) + " " + " ".join(a) for t, a in zip(title, abstract)]

In [14]:
# Define the range of values for max_df and min_df to search over
param_grid = {
    'max_df': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
    'min_df': [2, 3, 4, 5]
}

In [15]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [19]:
# Perform grid search with specified scoring metric
grid_search = GridSearchCV(estimator=tfidf_vectorizer, param_grid=param_grid, cv=5, scoring='neg_log_loss')
grid_search.fit(documents)

Traceback (most recent call last):
  File "C:\Users\Ang Soo Khee\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 811, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

 nan nan nan nan nan nan]


In [20]:
# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'max_df': 0.7, 'min_df': 2}


In [21]:
# Get the best estimator
best_estimator = grid_search.best_estimator_

In [22]:
# Transform the documents using the best estimator
tfidf_matrix = best_estimator.transform(documents)

In [None]:
# Apply Truncated SVD for LSA using the best parameters
lsa_model = TruncatedSVD(n_components=5)  # Define the number of topics
lsa_matrix = lsa_model.fit_transform(tfidf_matrix)

In [None]:


# Visualization and Interpretation
terms = grid_search.best_estimator_.get_feature_names_out()

for topic_id, topic in enumerate(lsa_model.components_):
    top_indices = topic.argsort()[-10:][::-1]  # Get the indices of the top 10 terms for the topic
    top_terms = [terms[i] for i in top_indices]  # Get the top 10 terms
    print(f"Topic {topic_id}: {', '.join(top_terms)}")

    # Word Cloud Visualization
    topic_words = dict(zip(top_terms, topic[top_indices]))
    wordcloud = WordCloud(background_color='white').generate_from_frequencies(topic_words)
    plt.figure(figsize=(8, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Topic {topic_id} Word Cloud')
    plt.axis('off')
    plt.show()

In [None]:
# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Apply Truncated SVD for LSA
num_topics = 5  # Define the number of topics
lsa_model = models.LsiModel(corpus=tfidf_matrix.T, id2word=None, num_topics=num_topics)

# Visualization
topics = lsa_model.show_topics(num_topics=-1, num_words=10)
for topic_id, topic in topics:
    print(f"Topic {topic_id}: {topic}")

    # Word Cloud Visualization
    wordcloud = WordCloud(background_color='white').generate_from_frequencies(dict(topic))
    plt.figure(figsize=(8, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Topic {topic_id} Word Cloud')
    plt.axis('off')
    plt.show()

In [None]:
# Topic Modeling Algorithm (LSA)
dictionary = corpora.Dictionary(df['preprocessed_abstract'])
corpus = [dictionary.doc2bow(text) for text in df['preprocessed_abstract']]
lsa_model = models.LsiModel(corpus, id2word=dictionary, num_topics=5)

In [None]:
# Visualization
topics = lsa_model.show_topics(num_topics=-1, num_words=10)
for topic_id, topic in topics:
    print(f"Topic {topic_id}: {topic}")

In [None]:
# Word Cloud Visualization
for topic_id, topic in topics:
    wordcloud = WordCloud(background_color='white').generate_from_frequencies(dict(topic))
    plt.figure(figsize=(8, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Topic {topic_id} Word Cloud')
    plt.axis('off')
    plt.show()

In [None]:
# Topic Interpretation
def get_document_topic(document):
    bow = dictionary.doc2bow(preprocess_text(document))
    topic_distribution = dict(lsa_model[bow])
    return max(topic_distribution, key=topic_distribution.get)

df['topic'] = df['ABSTRACT'].apply(get_document_topic)

In [None]:






# Testing and Validation
# You can validate the topics against the original content of the documents and assess coherence and interpretability manually.