In [None]:
# Import necessary libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models

# Step 1: Read the text from the file
# The encoding is given explicitly so the read does not depend on the platform
# default and matches how the inference cell opens the same file.
file_path = "/content/chap.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Step 2: Text Preprocessing
# Tokenization: breaking a text down into smaller units (tokens), which can be words or phrases.
#   It is essential for understanding the structure and meaning of the text; in NLP words act
#   as features, and tokenization is what makes those features extractable.
# Stopword removal: drops very common words so the more informative words remain.
# Lemmatization: reduces words to their base form (lemma). Grouping inflected forms into a
#   single lemma shrinks the vocabulary, and a reduced feature set can improve model performance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Step 3: Corpus Generation (each non-empty line of the text file is one document)
# Tokens are lower-cased and non-alphanumeric tokens (punctuation) are dropped so the
# training vocabulary has the same shape as the lower-cased, alphanumeric-only tokens
# that the inference cell feeds to the model.
# NOTE(review): if the input file is hard-wrapped, lines are not natural documents -
# split `text` on paragraphs or sections instead.
documents = []
for line in text.splitlines():
    line_tokens = [
        lemmatizer.lemmatize(word.lower())
        for word in word_tokenize(line)
        if word.isalnum() and word.lower() not in stop_words
    ]
    if line_tokens:  # skip blank lines and lines made only of stopwords/punctuation
        documents.append(line_tokens)

if not documents:
    raise ValueError(f"No usable tokens found in {file_path}; cannot train an LDA model.")

# Flat token list over the whole file, kept for any later cell that inspects `tokens`.
tokens = [token for document in documents for token in document]

# Create dictionary and corpus
# In linguistics and NLP, a corpus is a collection of written or spoken texts used as a basis
# for linguistic analysis, language modeling, and other computational tasks. Here the corpus
# is the bag-of-words form of every document, indexed by the token ids in `dictionary`.
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Step 4: Train LDA model
# Latent Dirichlet Allocation (LDA) is a probabilistic model and a popular technique for
# topic modeling. LDA assumes that documents are mixtures of topics, and that each topic
# is a distribution over words.
num_topics = 5
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,  # fixed seed so Restart & Run All reproduces the same topics
)

# Step 5: Save the trained model to a file
model_path = "/content/lda_model"
lda_model.save(model_path)
print("Trained LDA model saved successfully to:", model_path)




Trained LDA model saved successfully to: /content/lda_model


In [None]:
# Import necessary libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
# Why gensim:
# Gensim is an open-source Python library for natural language processing (NLP) and topic modeling.
# It is particularly known for its implementations of algorithms for semantic analysis and document similarity analysis.
# Step 1: Load the raw text from disk, decoding it as UTF-8
file_path = "/content/chap.txt"
with open(file_path, mode='r', encoding='utf-8') as file:
    text = file.read()

# Step 2: Preprocess the Text
def preprocess_text(text):
    """Tokenize, normalize and lemmatize ``text``.

    Args:
        text: Raw input string.

    Returns:
        list[str]: Lower-cased, lemmatized tokens with English stopwords and
        non-alphanumeric tokens (punctuation, symbols) removed.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = []
    for word in word_tokenize(text):
        lowered = word.lower()
        # The stopword list is lower-case, so compare the lower-cased form;
        # otherwise capitalized stopwords such as "The" slip through.
        if word.isalnum() and lowered not in stop_words:
            tokens.append(lemmatizer.lemmatize(lowered))

    return tokens

# Step 3: Create Dictionary and Corpus
def create_corpus(text, dictionary=None):
    """Build a one-document bag-of-words corpus from a token list.

    Args:
        text: Preprocessed document as a list of tokens.
        dictionary: Optional existing dictionary (an object with ``doc2bow``)
            to encode ``text`` with. Pass the dictionary a model was trained
            with (e.g. ``lda_model.id2word``) when the corpus will be fed to
            that model: token ids from a freshly built dictionary do not line
            up with the ids the model learned. When omitted, a new Gensim
            dictionary is built from ``text`` alone.

    Returns:
        tuple: ``(dictionary, corpus)`` where ``corpus`` is a single-element
        list holding the bag-of-words of ``text``.
    """
    if dictionary is None:
        # No dictionary supplied: build one from this document only.
        dictionary = corpora.Dictionary([text])

    # Encode the document with whichever dictionary is in effect.
    corpus = [dictionary.doc2bow(text)]

    return dictionary, corpus

# Step 4: Infer Topic Distribution
def infer_topic_distribution(lda_model, corpus):
    """Return the model's topic distribution for the first document of ``corpus``.

    Only ``corpus[0]`` is used; any further documents are ignored. The result
    is whatever ``lda_model.get_document_topics`` returns for that document.
    """
    first_document_bow = corpus[0]
    document_topics = lda_model.get_document_topics(first_document_bow)
    return document_topics

# Step 5: Assign Topics to Text
def assign_topics_to_text(topic_distribution, num_topics):
    """Pick the ``num_topics`` most probable topics.

    Args:
        topic_distribution: Iterable of ``(topic_id, probability)`` pairs.
        num_topics: Maximum number of pairs to return.

    Returns:
        list: The pairs ordered by descending probability, truncated to
        ``num_topics`` entries. The input is not modified.
    """
    def probability_of(entry):
        return entry[1]

    ranked = list(topic_distribution)  # copy so the caller's sequence stays untouched
    ranked.sort(key=probability_of, reverse=True)
    return ranked[:num_topics]

# Preprocess the text
processed_text = preprocess_text(text)

# Load pre-trained LDA model
lda_model = models.LdaModel.load('/content/lda_model')

# Create dictionary and corpus
# The bag-of-words must be encoded with the dictionary the model was trained
# with. A dictionary rebuilt from this text alone assigns its own token ids,
# which would point at different words in the model's vocabulary.
dictionary = lda_model.id2word
corpus = [dictionary.doc2bow(processed_text)]

# Infer topic distribution
topic_distribution = infer_topic_distribution(lda_model, corpus)

# Assign topics to text
num_topics = 5  # Specify the number of topics to assign
assigned_topics = assign_topics_to_text(topic_distribution, num_topics)

# Print the assigned topics
print("Assigned Topics:", assigned_topics)


Assigned Topics: [(1, 0.96016765), (2, 0.036938675)]


In [None]:
# !pip install nltk
import nltk

# NLTK resources used by this notebook:
#   punkt     -> word_tokenize
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer
# On a fresh kernel this cell has to run before the cells that tokenize text.
# nltk.download() skips packages that are already up to date, so re-running is cheap.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import pickle

# Sanity check: can the saved model be loaded?
# Gensim's save() may spread a model over several files that share the given
# path as a prefix, so the artifact is read back with LdaModel.load() rather
# than by unpickling the main file directly.
# Loading still relies on pickle internally - only load model files you trust.
try:
    models.LdaModel.load('/content/lda_model')
except Exception as e:
    # Diagnostic cell: report the failure instead of stopping the notebook.
    print(f"Error loading the model: {e}")

Error loading the model: __randomstate_ctor() takes from 0 to 1 positional arguments but 2 were given


In [None]:
# Create dictionary and corpus
# Encode the text with the model's own training dictionary so the token ids
# match the vocabulary the model learned.
dictionary = lda_model.id2word
corpus = [dictionary.doc2bow(processed_text)]

# Infer topic distribution
topic_distribution = infer_topic_distribution(lda_model, corpus)

# Assign topics to text
assigned_topics = assign_topics_to_text(topic_distribution, num_topics)

print("Assigned Topics:", assigned_topics)

Assigned Topics: [(1, 0.9991922)]


In [None]:
# Create dictionary and corpus
# Encode the text with the model's own training dictionary so the token ids
# match the vocabulary the model learned.
dictionary = lda_model.id2word
corpus = [dictionary.doc2bow(processed_text)]

# Infer topic distribution
topic_distribution = infer_topic_distribution(lda_model, corpus)

# Assign topics to text
assigned_topics = assign_topics_to_text(topic_distribution, num_topics)

# Print assigned topics and original text
# `processed_text` is the token list of ONE document, so there is a single
# text with a single list of topics. Zipping tokens with topics would pair
# unrelated items and rebind the module-level `text` to a token.
PREVIEW_CHARS = 200  # keep the cell output short for long input files
text_preview = text[:PREVIEW_CHARS] + ("..." if len(text) > PREVIEW_CHARS else "")
print("Text 1:")
print("Original Text:", text_preview)
print("Assigned Topics:", assigned_topics)
print()


Text 1:
Original Text: social
Assigned Topics: (1, 0.99960583)

