In [22]:
# Topic modelling with gensim's LDA: discover the main topics across a folder of text files and record the top topics for each file.

import os
from gensim import corpora, models
from gensim.models import TfidfModel
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

# Directory containing your text files
#directory = '/path/to/text/files'
directory = 'C:/Users/singh/OneDrive/Desktop/NLP Project/NLPproject1/all/h14'

# Preprocessing helpers that do not depend on the file being read: build them
# once up front instead of once per file.
stop = set(stopwords.words('english'))
lemma = WordNetLemmatizer()
# Translation table that deletes ASCII punctuation characters and digits.
translator = str.maketrans('', '', string.punctuation + string.digits)

# Read and preprocess data: texts[i] is the token list of one .txt file and
# index_names[i] is that file's name without its extension.
texts = []
index_names = []
# os.listdir returns entries in arbitrary order; sorting keeps the document
# order (and therefore the CSV row order) stable from run to run.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".txt"):
        continue
    # NOTE(review): assumes the text files are UTF-8 - confirm. Undecodable
    # bytes are replaced rather than raising, so one bad file cannot abort
    # the whole run, and the result no longer depends on the platform's
    # default encoding.
    with open(os.path.join(directory, filename), 'r',
              encoding='utf-8', errors='replace') as file:
        content = file.read().lower()
    # Remove punctuation and numbers
    content = content.translate(translator)
    # Tokenize on whitespace, drop stopwords, then lemmatize what is left
    tokens = [lemma.lemmatize(word) for word in content.split() if word not in stop]
    texts.append(tokens)
    # Strip only the trailing extension; str.replace would also remove any
    # ".txt" that happens to appear in the middle of the name.
    index_names.append(os.path.splitext(filename)[0])

# Build the token dictionary from all documents, then encode every document
# as a bag-of-words vector against that dictionary.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

# Fit a TF-IDF model on the bag-of-words corpus and apply it to that corpus.
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Train LDA on the TF-IDF-weighted corpus.
num_topics = 20  # Adjust the number of topics as needed
lda = models.LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=num_topics,
    passes=20,
    random_state=42,  # fixed seed
)

# Label each topic with its two highest-ranked words (topn=2), joined by
# ", "; the list is indexed by topic id.
top_words_per_topic = [
    ", ".join(term for term, _weight in lda.show_topic(topic_id, topn=2))
    for topic_id in range(num_topics)
]

# For every document, rank all topics by probability and keep a one-line
# label for each of the three most probable ones.
results = []
# NOTE(review): the model was trained on corpus_tfidf, but inference here runs
# on the raw bag-of-words corpus - confirm this mismatch is intended.
for doc_name, bow in zip(index_names, corpus):
    # minimum_probability=0 is passed so low-probability topics are requested too.
    ranked = sorted(
        lda.get_document_topics(bow, minimum_probability=0),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Topic numbers are shown 1-based; top_words_per_topic is indexed 0-based.
    labels = [
        f"Topic {topic_id + 1}: " + top_words_per_topic[topic_id] + ";"
        for topic_id, _prob in ranked[:3]
    ]
    results.append([doc_name, "\n".join(labels)])

# Write one row per document: its name and its newline-separated topic labels.
#output_csv = '/path/to/output/topics_distribution_combined.csv'
output_csv = 'C:/Users/singh/OneDrive/Desktop/NLP Project/NLPproject1/all/h14/h14.csv'
columns = ['Index', 'Top Topics']
df = pd.DataFrame(data=results, columns=columns)
df.to_csv(output_csv, index=False)

# Optionally print each topic's four highest-weighted words.
topics = lda.print_topics(num_words=4)
for topic_summary in topics:
    print(topic_summary)


(0, '0.001*"dopamine" + 0.001*"kaffine" + 0.001*"fat" + 0.000*"muscle"')
(1, '0.000*"rem" + 0.000*"visualization" + 0.000*"apo" + 0.000*"polar"')
(2, '0.000*"aggression" + 0.000*"aggressive" + 0.000*"mounting" + 0.000*"mating"')
(3, '0.000*"headache" + 0.000*"cooling" + 0.000*"fermented" + 0.000*"microbiota"')
(4, '0.000*"dopamine" + 0.000*"carbon" + 0.000*"psychodelic" + 0.000*"apo"')
(5, '0.000*"nicotine" + 0.000*"vaping" + 0.000*"smoking" + 0.000*"adaptation"')
(6, '0.000*"salt" + 0.000*"salty" + 0.000*"noise" + 0.000*"ear"')
(7, '0.000*"sperm" + 0.000*"trough" + 0.000*"fertilization" + 0.000*"sandwich"')
(8, '0.000*"adaptogens" + 0.000*"premium" + 0.000*"chaga" + 0.000*"adaptagens"')
(9, '0.000*"lens" + 0.000*"myopia" + 0.000*"premium" + 0.000*"eyesight"')
(10, '0.000*"coling" + 0.000*"error" + 0.000*"visual" + 0.000*"anerexia"')
(11, '0.000*"carbon" + 0.000*"smell" + 0.000*"oxygen" + 0.000*"inhale"')
(12, '0.000*"sylicibon" + 0.000*"sylisibon" + 0.000*"journey" + 0.000*"mushroom"'