In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Load the merged NewsAPI export; missing values in the 'Lemmatized' column
# are replaced with empty strings before vectorizing.
df = pd.read_csv('merged_newsapi_data.csv')
text_data = df['Lemmatized'].fillna('')

# Bag-of-words counts with English stop words removed; max_df / min_df bound
# how common or rare a term may be to stay in the vocabulary.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
doc_term_matrix = vectorizer.fit_transform(text_data)

# Seeded LDA with a fixed number of topics (fit() hands back the estimator).
num_topics = 5
lda = LatentDirichletAllocation(
    n_components=num_topics,
    random_state=42,
).fit(doc_term_matrix)

def get_top_words(model, feature_names, n_top_words):
    """Return one space-separated keyword string per topic of ``model``.

    For every row of ``model.components_`` the ``n_top_words`` entries with
    the largest weights are looked up in ``feature_names`` and joined in
    descending-weight order.
    """
    def _describe(weights):
        # Ascending argsort reversed -> indices from heaviest to lightest.
        ranked = weights.argsort()[::-1][:n_top_words]
        return " ".join(feature_names[idx] for idx in ranked)

    return [_describe(weights) for weights in model.components_]

# Ten highest-weighted vocabulary terms per topic, as one string per topic.
feature_names = vectorizer.get_feature_names_out()
top_words = get_top_words(lda, feature_names, n_top_words=10)

# Per-document topic weights: keep the strongest topic and its weight.
topic_distributions = lda.transform(doc_term_matrix)
df['Dominant_Topic'] = topic_distributions.argmax(axis=1)
df['Topic_Probability'] = topic_distributions.max(axis=1)

# The keyword Series is indexed 0..n_topics-1, so joining on Dominant_Topic
# attaches each row's keyword string.
topic_keywords = pd.Series(data=top_words, name='Topic_Keywords')
df = df.join(topic_keywords, on='Dominant_Topic')

df.to_csv('lda.csv', index=False)

print("Topic Keywords:")
for i, words in enumerate(top_words):
    print(f"Topic {i}: {words}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def load_data():
    """Read 'merged_newsapi_data.csv' and extract its text column.

    Returns:
        tuple: the full DataFrame, and its 'Lemmatized' column with missing
        values replaced by empty strings.
    """
    frame = pd.read_csv('merged_newsapi_data.csv')
    return frame, frame['Lemmatized'].fillna('')

def vectorize_text(text_data):
    """Build a unigram + bigram count matrix from ``text_data``.

    Returns:
        tuple: the fitted ``CountVectorizer`` and the document-term matrix
        produced by ``fit_transform``.
    """
    options = dict(
        max_df=0.95,
        min_df=2,
        stop_words='english',
        ngram_range=(1, 2),
    )
    counter = CountVectorizer(**options)
    return counter, counter.fit_transform(text_data)

def train_lda(doc_term_matrix, num_topics=5):
    """Fit a seeded, online-learning LDA model on ``doc_term_matrix``.

    Args:
        doc_term_matrix: Document-term count matrix to fit on.
        num_topics: Number of topics (``n_components``).

    Returns:
        The fitted ``LatentDirichletAllocation`` estimator.
    """
    model = LatentDirichletAllocation(
        n_components=num_topics,
        learning_method='online',
        random_state=42,
        max_iter=10,
    )
    # fit() returns the estimator itself.
    return model.fit(doc_term_matrix)

def visualize_topics(lda, vectorizer):
    """Render one word cloud per topic and save them to 'topic_wordclouds.png'.

    Args:
        lda: Fitted topic model; only ``lda.components_`` is read (one row of
            term weights per topic).
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()`` names
            the columns of ``lda.components_``.

    Side effects:
        Writes 'topic_wordclouds.png' to the working directory and closes the
        figure afterwards.
    """
    feature_names = vectorizer.get_feature_names_out()
    n_topics = len(lda.components_)

    # Three clouds per row. At least two rows are kept so the layout for up
    # to six topics is unchanged; more rows are added on demand because
    # plt.subplot raises when the index exceeds rows * cols.
    n_cols = 3
    n_rows = max(2, -(-n_topics // n_cols))  # ceiling division

    plt.figure(figsize=(15, 5 * n_rows))
    for topic_idx, topic in enumerate(lda.components_):
        # Every vocabulary term is offered to WordCloud with its topic weight.
        word_freq = dict(zip(feature_names, topic))

        plt.subplot(n_rows, n_cols, topic_idx + 1)
        wordcloud = WordCloud(width=600, height=400,
                              background_color='white').generate_from_frequencies(word_freq)
        plt.imshow(wordcloud)
        plt.title(f'Topic {topic_idx}')
        plt.axis('off')

    plt.tight_layout()
    plt.savefig('topic_wordclouds.png')
    plt.close()

def save_results(df, lda, doc_term_matrix, vectorizer=None):
    """Annotate ``df`` with per-document topic assignments and write 'lda.csv'.

    Args:
        df: DataFrame to annotate; it is modified in place. Its length must
            match the number of rows returned by ``lda.transform``.
        lda: Fitted topic model providing ``transform`` and ``components_``.
        doc_term_matrix: Document-term matrix passed to ``lda.transform``.
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()`` names
            the columns of ``lda.components_``. When omitted, the module-level
            ``vectorizer`` variable is used, which is what this function
            previously depended on implicitly.

    Returns:
        The same ``df`` object, with ``Dominant_Topic``, ``Topic_Probability``
        and ``Topic_Keywords`` columns added.

    Raises:
        NameError: if ``vectorizer`` is omitted and no module-level
            ``vectorizer`` exists.
    """
    if vectorizer is None:
        # Backward-compatible fallback for callers that pass three arguments.
        try:
            vectorizer = globals()['vectorizer']
        except KeyError:
            raise NameError(
                "name 'vectorizer' is not defined; pass it to save_results explicitly"
            ) from None

    topic_distributions = lda.transform(doc_term_matrix)
    df['Dominant_Topic'] = np.argmax(topic_distributions, axis=1)
    df['Topic_Probability'] = np.max(topic_distributions, axis=1)

    # Ten highest-weighted terms per topic, comma-separated.
    feature_names = vectorizer.get_feature_names_out()
    top_words = [
        ", ".join(feature_names[i] for i in topic.argsort()[:-10 - 1:-1])
        for topic in lda.components_
    ]

    df['Topic_Keywords'] = df['Dominant_Topic'].map(lambda x: top_words[x])
    df.to_csv('lda.csv', index=False)

    return df

if __name__ == "__main__":
    NUM_TOPICS = 5

    # Pipeline: load -> vectorize -> fit LDA -> word clouds -> annotated CSV.
    # `vectorizer` is kept as a module-level name because save_results looks
    # it up from module scope.
    df, text_data = load_data()
    vectorizer, doc_term_matrix = vectorize_text(text_data)
    lda = train_lda(doc_term_matrix, num_topics=NUM_TOPICS)
    visualize_topics(lda, vectorizer)
    final_df = save_results(df, lda, doc_term_matrix)

    # Bar chart: number of documents per dominant topic, ordered by topic id.
    topic_counts = final_df['Dominant_Topic'].value_counts()
    plt.figure(figsize=(10, 6))
    topic_counts.sort_index().plot.bar()
    plt.title('Distribution of Topics Across Documents')
    plt.xlabel('Topic Number')
    plt.ylabel('Number of Documents')
    plt.xticks(rotation=0)
    plt.savefig('topic_distribution.png')
    plt.close()

    print(
        "Analysis complete! Created:",
        "- lda.csv (topic assignments)",
        "- topic_wordclouds.png",
        "- topic_distribution.png",
        sep="\n",
    )

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def load_data():
    """Load 'merged_newsapi_data.csv' along with its cleaned text column.

    Returns:
        tuple: ``(df, text_data)`` where ``text_data`` is the 'Lemmatized'
        column with missing values replaced by empty strings.
    """
    articles = pd.read_csv('merged_newsapi_data.csv')
    return articles, articles['Lemmatized'].fillna('')

def vectorize_text(text_data):
    """Build a unigram count matrix from ``text_data``.

    Returns:
        tuple: the fitted ``CountVectorizer`` and the document-term matrix
        produced by ``fit_transform``.
    """
    settings = {
        'max_df': 0.95,
        'min_df': 2,
        'stop_words': 'english',
        'ngram_range': (1, 1),
    }
    counter = CountVectorizer(**settings)
    return counter, counter.fit_transform(text_data)

def train_lda(doc_term_matrix, num_topics=6):
    """Fit a seeded, online-learning LDA model (20 iterations max).

    Args:
        doc_term_matrix: Document-term count matrix to fit on.
        num_topics: Number of topics (``n_components``).

    Returns:
        The fitted ``LatentDirichletAllocation`` estimator.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return LatentDirichletAllocation(
        n_components=num_topics,
        learning_method='online',
        random_state=42,
        max_iter=20,
    ).fit(doc_term_matrix)

def generate_topic_word_table(lda, vectorizer, top_n=12):
    """Tabulate the ``top_n`` highest-weighted terms of every topic.

    Returns:
        DataFrame with one column per topic ("Topic 0", "Topic 1", ...) whose
        rows list that topic's terms from highest to lowest weight.
    """
    vocab = vectorizer.get_feature_names_out()

    ranked_terms = [
        [vocab[idx] for idx in weights.argsort()[::-1][:top_n]]
        for weights in lda.components_
    ]

    # Topics are rows at construction time; transpose so they become columns.
    table = pd.DataFrame(ranked_terms).T
    table.columns = [f"Topic {k}" for k, _ in enumerate(ranked_terms)]
    return table

if __name__ == "__main__":
    df, text_data = load_data()
    vectorizer, doc_term_matrix = vectorize_text(text_data)

    # Fit with train_lda's default topic count, then tabulate the top terms.
    lda = train_lda(doc_term_matrix)
    topic_table = generate_topic_word_table(lda, vectorizer)

    # Persist the table, then echo it as a grid-formatted Markdown table.
    topic_table.to_csv('topic_word_table.csv', index=False)
    print("Topic-Word Distribution Table:")
    print(topic_table.to_markdown(tablefmt="grid", index=False))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def load_data():
    """Read the merged NewsAPI CSV and return it with its text column.

    Returns:
        tuple: the DataFrame read from 'merged_newsapi_data.csv', and its
        'Lemmatized' column with missing values replaced by empty strings.
    """
    source = pd.read_csv('merged_newsapi_data.csv')
    documents = source['Lemmatized'].fillna('')
    return (source, documents)

def vectorize_text(text_data):
    """Fit a unigram ``CountVectorizer`` on ``text_data``.

    Returns:
        tuple: the fitted vectorizer and the document-term matrix returned by
        ``fit_transform``.
    """
    counter = CountVectorizer(
        ngram_range=(1, 1),
        stop_words='english',
        min_df=2,
        max_df=0.95,
    )
    counts = counter.fit_transform(text_data)
    return counter, counts

def train_lda(doc_term_matrix, num_topics=6):
    """Fit a seeded, online-learning LDA model (20 iterations max).

    Args:
        doc_term_matrix: Document-term count matrix to fit on.
        num_topics: Number of topics (``n_components``).

    Returns:
        The fitted ``LatentDirichletAllocation`` estimator.
    """
    hyperparams = dict(
        n_components=num_topics,
        learning_method='online',
        random_state=42,
        max_iter=20,
    )
    model = LatentDirichletAllocation(**hyperparams)
    # fit() returns the estimator itself.
    return model.fit(doc_term_matrix)

def generate_topic_table(lda, vectorizer, top_n=12):
    """Collect each topic's top terms and write them as a Markdown table.

    Args:
        lda: Fitted topic model; ``lda.components_`` holds one row of term
            weights per topic.
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()`` names
            the columns of ``lda.components_``.
        top_n: Number of highest-weighted terms to keep per topic.

    Returns:
        list: ``topic_words[t]`` is the list of topic ``t``'s terms, ordered
        from highest to lowest weight.

    Side effects:
        Overwrites 'topic_word_table.txt' in the working directory.
    """
    feature_names = vectorizer.get_feature_names_out()
    topic_words = [
        [feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]]
        for topic in lda.components_
    ]

    # Markdown table: header row, separator row, then one row per rank.
    header = [f"Topic {i}" for i in range(len(topic_words))]
    lines = [
        "| " + " | ".join(header) + " |",
        "|" + "|".join(["---"] * len(topic_words)) + "|",
    ]
    # zip(*...) walks the table rank by rank and yields nothing when there
    # are no topics, where indexing topic_words[0] would raise IndexError.
    lines.extend("| " + " | ".join(row) + " |" for row in zip(*topic_words))
    table = "\n".join(lines) + "\n"

    # Terms may contain non-ASCII characters; an explicit encoding keeps the
    # write independent of the platform's default text encoding.
    with open('topic_word_table.txt', 'w', encoding='utf-8') as f:
        f.write(table)

    return topic_words

def create_visualizations(lda, vectorizer, df, doc_term_matrix=None):
    """Save per-topic word clouds and a documents-per-topic bar chart.

    Args:
        lda: Fitted topic model providing ``components_`` and ``transform``.
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()`` names
            the columns of ``lda.components_``.
        df: Not used by this function; kept so existing three-argument calls
            continue to work.
        doc_term_matrix: Document-term matrix passed to ``lda.transform`` for
            the bar chart. When omitted, the module-level ``doc_term_matrix``
            variable is used, which is what this function previously depended
            on implicitly.

    Side effects:
        Writes 'topic_wordclouds.png' and 'topic_distribution.png' to the
        working directory.

    Raises:
        NameError: if ``doc_term_matrix`` is omitted and no module-level
            ``doc_term_matrix`` exists.
    """
    if doc_term_matrix is None:
        # Backward-compatible fallback for callers that do not pass the matrix.
        try:
            doc_term_matrix = globals()['doc_term_matrix']
        except KeyError:
            raise NameError(
                "name 'doc_term_matrix' is not defined; pass it to "
                "create_visualizations explicitly"
            ) from None

    feature_names = vectorizer.get_feature_names_out()

    # Three clouds per row. At least two rows are kept so the layout for up
    # to six topics is unchanged; more rows are added on demand because
    # plt.subplot raises when the index exceeds rows * cols.
    n_cols = 3
    n_rows = max(2, -(-len(lda.components_) // n_cols))  # ceiling division

    plt.figure(figsize=(20, 6 * n_rows))
    for topic_idx, topic in enumerate(lda.components_):
        # Only the 20 highest-weighted terms feed each cloud.
        word_freq = {feature_names[i]: topic[i] for i in topic.argsort()[:-20 - 1:-1]}

        plt.subplot(n_rows, n_cols, topic_idx + 1)
        wordcloud = WordCloud(width=800, height=600,
                              background_color='white').generate_from_frequencies(word_freq)
        plt.imshow(wordcloud)
        plt.title(f'Topic {topic_idx}', fontsize=16)
        plt.axis('off')

    plt.tight_layout()
    plt.savefig('topic_wordclouds.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Dominant topic per document -> document counts per topic.
    topic_dist = np.argmax(lda.transform(doc_term_matrix), axis=1)
    plt.figure(figsize=(10, 6))
    pd.Series(topic_dist).value_counts().sort_index().plot(kind='bar')
    plt.title('Document Distribution Across Topics', fontsize=14)
    plt.xlabel('Topic Number', fontsize=12)
    plt.ylabel('Number of Documents', fontsize=12)
    plt.xticks(rotation=0)
    plt.savefig('topic_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()

if __name__ == "__main__":
    df, text_data = load_data()
    # `doc_term_matrix` is kept as a module-level name because
    # create_visualizations looks it up from module scope.
    vectorizer, doc_term_matrix = vectorize_text(text_data)
    lda = train_lda(doc_term_matrix)

    topic_words = generate_topic_table(lda, vectorizer)
    create_visualizations(lda, vectorizer, df)

    # Spreadsheet version of the topic table: one column per topic.
    pd.DataFrame(topic_words).T.to_csv(
        'topic_word_table.csv',
        index=False,
        header=[f"Topic {i}" for i, _ in enumerate(topic_words)],
    )

    print(
        "Successfully created:",
        "- topic_word_table.txt (text format)",
        "- topic_word_table.csv (spreadsheet format)",
        "- topic_wordclouds.png (visualization)",
        "- topic_distribution.png (chart)",
        sep="\n",
    )

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import MDS
from scipy.spatial.distance import pdist, squareform

def load_data():
    """Load the merged NewsAPI CSV together with its lemmatized text.

    Returns:
        tuple: the DataFrame read from 'merged_newsapi_data.csv', and its
        'Lemmatized' column with missing values replaced by empty strings.
    """
    table = pd.read_csv('merged_newsapi_data.csv')
    lemmas = table['Lemmatized'].fillna('')
    return (table, lemmas)

def vectorize_text(text_data):
    """Fit a unigram + bigram ``CountVectorizer`` on ``text_data``.

    Returns:
        tuple: the fitted vectorizer and the document-term matrix returned by
        ``fit_transform``.
    """
    counter = CountVectorizer(
        ngram_range=(1, 2),
        stop_words='english',
        min_df=2,
        max_df=0.95,
    )
    counts = counter.fit_transform(text_data)
    return counter, counts

def train_lda(doc_term_matrix, num_topics=6):
    """Fit a seeded, online-learning LDA model (20 iterations max).

    Args:
        doc_term_matrix: Document-term count matrix to fit on.
        num_topics: Number of topics (``n_components``).

    Returns:
        The fitted ``LatentDirichletAllocation`` estimator.
    """
    model = LatentDirichletAllocation(
        max_iter=20,
        random_state=42,
        learning_method='online',
        n_components=num_topics,
    )
    model.fit(doc_term_matrix)  # fit() returns this same estimator
    return model

def generate_topic_table(lda, vectorizer, top_n=12):
    """Collect each topic's top terms and write them as a Markdown table.

    Args:
        lda: Fitted topic model; ``lda.components_`` holds one row of term
            weights per topic.
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()`` names
            the columns of ``lda.components_``.
        top_n: Number of highest-weighted terms to keep per topic.

    Returns:
        list: ``topic_words[t]`` is the list of topic ``t``'s terms, ordered
        from highest to lowest weight.

    Side effects:
        Overwrites 'topic_word_table.txt' in the working directory.
    """
    feature_names = vectorizer.get_feature_names_out()
    topic_words = [
        [feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]]
        for topic in lda.components_
    ]

    # Terms may contain non-ASCII characters; an explicit encoding keeps the
    # write independent of the platform's default text encoding.
    with open('topic_word_table.txt', 'w', encoding='utf-8') as f:
        f.write("| " + " | ".join([f"Topic {i}" for i in range(len(topic_words))]) + " |\n")
        f.write("|" + "|".join(["---"] * len(topic_words)) + "|\n")
        # zip(*...) walks the table rank by rank and yields nothing when
        # there are no topics, where indexing topic_words[0] would raise
        # IndexError.
        for row in zip(*topic_words):
            f.write("| " + " | ".join(row) + " |\n")

    return topic_words

def create_intertopic_map(lda):
    """Plot topics in 2-D from their pairwise cosine distances.

    Cosine distances between the rows of ``lda.components_`` are embedded in
    two dimensions with a seeded MDS on the precomputed distance matrix. Each
    topic is drawn as its own scatter marker with a "Topic i" label, and the
    figure is saved to 'intertopic_distance_map.png'.
    """
    pairwise = squareform(pdist(lda.components_, metric='cosine'))
    embedder = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
    coords = embedder.fit_transform(pairwise)

    plt.figure(figsize=(10, 8))
    # One scatter call per topic so each marker takes the next cycle colour.
    for idx, (x, y) in enumerate(coords):
        plt.scatter(x, y, s=200)
        # Label anchor is nudged slightly up and to the right of the marker.
        plt.text(x + 0.02, y + 0.02, f'Topic {idx}',
                 fontsize=12, ha='center', va='center')

    plt.title('Intertopic Distance Map (MDS)', fontsize=16)
    plt.xlabel('Dimension 1', fontsize=14)
    plt.ylabel('Dimension 2', fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.savefig('intertopic_distance_map.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_visualizations(lda, vectorizer, doc_term_matrix=None):
    """Save per-topic word clouds and a documents-per-topic bar chart.

    Args:
        lda: Fitted topic model providing ``components_`` and ``transform``.
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()`` names
            the columns of ``lda.components_``.
        doc_term_matrix: Document-term matrix passed to ``lda.transform`` for
            the bar chart. When omitted, the module-level ``doc_term_matrix``
            variable is used, which is what this function previously depended
            on implicitly.

    Side effects:
        Writes 'topic_wordclouds.png' and 'topic_distribution.png' to the
        working directory.

    Raises:
        NameError: if ``doc_term_matrix`` is omitted and no module-level
            ``doc_term_matrix`` exists.
    """
    if doc_term_matrix is None:
        # Backward-compatible fallback for callers that do not pass the matrix.
        try:
            doc_term_matrix = globals()['doc_term_matrix']
        except KeyError:
            raise NameError(
                "name 'doc_term_matrix' is not defined; pass it to "
                "create_visualizations explicitly"
            ) from None

    feature_names = vectorizer.get_feature_names_out()

    # Three clouds per row. At least two rows are kept so the layout for up
    # to six topics is unchanged; more rows are added on demand because
    # plt.subplot raises when the index exceeds rows * cols.
    n_cols = 3
    n_rows = max(2, -(-len(lda.components_) // n_cols))  # ceiling division

    plt.figure(figsize=(20, 6 * n_rows))
    for topic_idx, topic in enumerate(lda.components_):
        # Only the 20 highest-weighted terms feed each cloud.
        word_freq = {feature_names[i]: topic[i] for i in topic.argsort()[:-20 - 1:-1]}

        plt.subplot(n_rows, n_cols, topic_idx + 1)
        wordcloud = WordCloud(width=800, height=600,
                              background_color='white').generate_from_frequencies(word_freq)
        plt.imshow(wordcloud)
        plt.title(f'Topic {topic_idx}', fontsize=16)
        plt.axis('off')

    plt.tight_layout()
    plt.savefig('topic_wordclouds.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Dominant topic per document -> document counts per topic.
    dominant_topic = np.argmax(lda.transform(doc_term_matrix), axis=1)
    plt.figure(figsize=(10, 6))
    pd.Series(dominant_topic).value_counts().sort_index().plot(kind='bar')
    plt.title('Document Distribution Across Topics', fontsize=14)
    plt.xlabel('Topic Number', fontsize=12)
    plt.ylabel('Number of Documents', fontsize=12)
    plt.xticks(rotation=0)
    plt.savefig('topic_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()

if __name__ == "__main__":
    df, text_data = load_data()
    # `doc_term_matrix` is kept as a module-level name because
    # create_visualizations looks it up from module scope.
    vectorizer, doc_term_matrix = vectorize_text(text_data)
    lda = train_lda(doc_term_matrix)

    topic_words = generate_topic_table(lda, vectorizer)
    create_visualizations(lda, vectorizer)
    create_intertopic_map(lda)

    # Spreadsheet version of the topic table: one column per topic.
    pd.DataFrame(topic_words).T.to_csv(
        'topic_word_table.csv',
        index=False,
        header=[f"Topic {i}" for i, _ in enumerate(topic_words)],
    )

    print(
        "Successfully created:",
        "- topic_word_table.txt/.csv",
        "- topic_wordclouds.png",
        "- topic_distribution.png",
        "- intertopic_distance_map.png",
        sep="\n",
    )

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the merged NewsAPI export and take the lemmatized text, with missing
# values replaced by empty strings.
df = pd.read_csv('merged_newsapi_data.csv')
texts = df['Lemmatized'].fillna('')

# TF-IDF weights over a vocabulary capped at 1000 terms.
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(texts)

# Densify into a frame with one column per vocabulary term.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Place the TF-IDF columns next to the original columns and export.
result_df = pd.concat([df, tfidf_df], axis='columns')
result_df.to_csv('tfidf.csv', index=False)

print("TF-IDF transformation completed and saved to 'tfidf.csv'.")