##### Vectorizing Clusters

In [48]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_tfidf(input_file_paths, output_file_paths, max_features=5000):
    for input_file_path, output_file_path in zip(input_file_paths, output_file_paths):
        # Load the tokenized data
        with open(input_file_path, 'r') as file:
            tokenized_data = json.load(file)
        
        # Assume tokenized_data is a list of strings
        # Initialize the TF-IDF Vectorizer
        vectorizer = TfidfVectorizer(max_features=max_features)

        # Vectorize the tokenized text
        tfidf_matrix = vectorizer.fit_transform(tokenized_data)

        # Convert to a dense matrix (optional, depending on what you need next)
        tfidf_dense = tfidf_matrix.todense()

        # Get feature names
        feature_names = vectorizer.get_feature_names_out()

        # Save as a DataFrame
        df_tfidf = pd.DataFrame(tfidf_dense, columns=feature_names)
        df_tfidf.to_csv(output_file_path, index=False)

        print(f"Vectorized cluster saved to {output_file_path}")

# Example usage
input_file_paths = ['cluster_0.json', 'cluster_1.json', 'cluster_2.json', 'cluster_3.json']
output_file_paths = ['cluster0_vectorized.csv', 'cluster1_vectorized.csv', 'cluster2_vectorized.csv', 'cluster3_vectorized.csv']
vectorize_tfidf(input_file_paths, output_file_paths)


Vectorized cluster saved to cluster0_vectorized.csv
Vectorized cluster saved to cluster1_vectorized.csv
Vectorized cluster saved to cluster2_vectorized.csv
Vectorized cluster saved to cluster3_vectorized.csv


##### Assigning Topics to Clusters 

In [1]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import csr_matrix

def display_topics_for_clusters(file_paths, num_top_words, num_topics_per_cluster=8):
    for cluster_idx, file_path in enumerate(file_paths):
        # Step 1: Load TF-IDF vectors
        tfidf_vectors = pd.read_csv(file_path)
        tfidf_matrix = csr_matrix(tfidf_vectors.values)

        lda_model = LatentDirichletAllocation(n_components=num_topics_per_cluster, random_state=42)
        lda_model.fit(tfidf_matrix)

        feature_names = tfidf_vectors.columns
        print(f"\nTopics for Cluster {cluster_idx}:")
        for topic_idx, topic in enumerate(lda_model.components_):
            print(f"Cluster {cluster_idx} - Topic {topic_idx}:")
            print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))

# Example usage
file_paths = ['cluster0_vectorized.csv', 'cluster1_vectorized.csv', 'cluster2_vectorized.csv', 'cluster3_vectorized.csv']
num_top_words = 30
display_topics_for_clusters(file_paths, num_top_words)



Topics for Cluster 0:
Cluster 0 - Topic 0:
gemini launch openai deepmind bard took wednesday microsoft google division pro startup alphabet capable leap debate intensify upping behave peril released advanced collins stake reporter chatbot pichai sundar file board
Cluster 0 - Topic 1:
epic carlin apple store special game george app sector dudesy comedy kotak european comedian law competing mutual stunt core fund ipo rally seen growth rule satellite earnings account generated sweeney
Cluster 0 - Topic 2:
google company raghavan search language microsoft firm dealbook monday pure sport query technology arctos bridgewater public testing text fund openai team bard lamda product smaller executive chatbot chatgpt release chatbots
Cluster 0 - Topic 3:
google francis ai pope image detection risk llm blogpost policy schumer doronichev video content search comment tool society walker framework text chatgpt rule access openai detect people midjourney experience time
Cluster 0 - Topic 4:
reben med

##### Visualization of Topics

In [2]:
pip install pyLDAvis


Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd

# Example data for illustration (replace with your actual topic data)
cluster_data = [
    ['Cluster 0 Topic 1', 'Cluster 0 Topic 2', 'Cluster 0 Topic 3', 'Cluster 0 Topic 4', 
     'Cluster 0 Topic 5', 'Cluster 0 Topic 6', 'Cluster 0 Topic 7', 'Cluster 0 Topic 8'],
    ['Cluster 1 Topic 1', 'Cluster 1 Topic 2', 'Cluster 1 Topic 3', 'Cluster 1 Topic 4', 
     'Cluster 1 Topic 5', 'Cluster 1 Topic 6', 'Cluster 1 Topic 7', 'Cluster 1 Topic 8'],
    ['Cluster 2 Topic 1', 'Cluster 2 Topic 2', 'Cluster 2 Topic 3', 'Cluster 2 Topic 4', 
     'Cluster 2 Topic 5', 'Cluster 2 Topic 6', 'Cluster 2 Topic 7', 'Cluster 2 Topic 8'],
    ['Cluster 3 Topic 1', 'Cluster 3 Topic 2', 'Cluster 3 Topic 3', 'Cluster 3 Topic 4', 
     'Cluster 3 Topic 5', 'Cluster 3 Topic 6', 'Cluster 3 Topic 7', 'Cluster 3 Topic 8']
]

# Creating the DataFrame
df_topics = pd.DataFrame(cluster_data, columns=[f"Topic {i+1}" for i in range(8)])

# Save as CSV
df_topics.to_csv('cluster_topics.csv', index=False)

# Display the DataFrame
print("DataFrame df_topics:")
print(df_topics)


DataFrame df_topics:
             Topic 1            Topic 2  ...            Topic 7            Topic 8
0  Cluster 0 Topic 1  Cluster 0 Topic 2  ...  Cluster 0 Topic 7  Cluster 0 Topic 8
1  Cluster 1 Topic 1  Cluster 1 Topic 2  ...  Cluster 1 Topic 7  Cluster 1 Topic 8
2  Cluster 2 Topic 1  Cluster 2 Topic 2  ...  Cluster 2 Topic 7  Cluster 2 Topic 8
3  Cluster 3 Topic 1  Cluster 3 Topic 2  ...  Cluster 3 Topic 7  Cluster 3 Topic 8

[4 rows x 8 columns]


##### Creating a Mapping_File 

In [50]:
import pandas as pd
import json

# Load the JSON file
with open('tokenizedFine_500.json', 'r') as file:
    stories = json.load(file)

# Create a unique story_ID for each story
story_ids = list(range(1, len(stories) + 1))

# Example: Assign clusters to each story
# For demonstration, we'll use a simple round-robin assignment
num_clusters = 4  # Change this to your actual number of clusters
clusters = [i % num_clusters for i in range(len(stories))]

# Create the DataFrame
mapping_df = pd.DataFrame({
    'story_ID': story_ids,
    'body_text': stories,
    'cluster': clusters
})

# Save the DataFrame to a CSV file
mapping_file_path = 'mapping_file.csv'
mapping_df.to_csv(mapping_file_path, index=False)

# Print the DataFrame to verify
print(mapping_df)


     story_ID                                          body_text  cluster
0           1  bengaluru infosys solid foundation grow back l...        0
1           2  december staff american writer artist institut...        1
2           3  bengaluru genz prioritizes professional growth...        2
3           4  new technology upend many online business comp...        3
4           5  share u tech giant buzzing fund scorching perf...        0
..        ...                                                ...      ...
495       496  google took next leap artificial intelligence ...        3
496       497  abstract startup pouring money generative tech...        0
497       498  dozen company popped offer service aimed ident...        1
498       499  abstract james mackenzie letter responds peggy...        2
499       500  new delhi transformation telecommunication com...        3

[500 rows x 3 columns]


##### Creating a DataFrame for Document Topics

In [51]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import csr_matrix

def dataframe_construct(file_paths, mapping_file_path, output_file_path):
    # Load the mapping file
    mapping_df = pd.read_csv(mapping_file_path)

    # Initialize the list to collect rows for the final DataFrame
    data = []

    # Loop through each cluster
    for cluster_idx, file_path in enumerate(file_paths):
        # Load the TF-IDF vectors
        tfidf_vectors = pd.read_csv(file_path)

        # Filter the mapping_df for the current cluster
        cluster_mapping = mapping_df[mapping_df['cluster'] == cluster_idx]

        # Initialize LDA model for the current cluster
        lda_model = LatentDirichletAllocation(n_components=10, random_state=42)

        # Exclude the document identifier column from the TF-IDF matrix
        tfidf_matrix = csr_matrix(tfidf_vectors.iloc[:, 1:].values)

        lda_model.fit(tfidf_matrix)

        # Get topic distributions for each document
        topic_distributions = lda_model.transform(tfidf_matrix)

        # Add document info and topic distribution to the data list
        for idx, row in cluster_mapping.iterrows():
            text_identifier = row['story_ID']  # Assuming 'story_ID' is the identifier
            row_data = {'text_identifier': text_identifier, 'cluster': cluster_idx}
            for topic_idx in range(8):
                column_name = f'topic_{cluster_idx}.{topic_idx}'
                # Check if idx is within the bounds of topic_distributions
                if idx < len(topic_distributions):
                    row_data[column_name] = topic_distributions[idx][topic_idx]  # Initialize with topic distribution score
                else:
                    row_data[column_name] = None  # Handle cases where idx exceeds topic_distributions size
            data.append(row_data)

    # Create the DataFrame with the collected data
    columns = ['text_identifier', 'cluster'] + [f'topic_{i}.{j}' for i in range(len(file_paths)) for j in range(8)]
    final_df = pd.DataFrame(data, columns=columns)

    # Save the DataFrame to a CSV file with NaNs replaced by '0.000000000000000000'
    final_df.to_csv(output_file_path, index=False, na_rep='0.000000000000000000')

    # Print the DataFrame to verify
    print(final_df.head(20))

# Example usage
file_paths = ['cluster0_vectorized.csv', 'cluster1_vectorized.csv', 'cluster2_vectorized.csv', 'cluster3_vectorized.csv']
mapping_file_path = 'mapping_file.csv'  # This file should contain "story_ID", "body_text", and "cluster"
output_file_path = 'df_topics.csv'

# Construct the DataFrame and create df_topics.csv
dataframe_construct(file_paths, mapping_file_path, output_file_path)


    text_identifier  cluster  topic_0.0  ...  topic_3.5  topic_3.6  topic_3.7
0                 1        0   0.005169  ...        NaN        NaN        NaN
1                 5        0   0.639780  ...        NaN        NaN        NaN
2                 9        0   0.006674  ...        NaN        NaN        NaN
3                13        0   0.004694  ...        NaN        NaN        NaN
4                17        0   0.007089  ...        NaN        NaN        NaN
5                21        0   0.006956  ...        NaN        NaN        NaN
6                25        0   0.012360  ...        NaN        NaN        NaN
7                29        0        NaN  ...        NaN        NaN        NaN
8                33        0        NaN  ...        NaN        NaN        NaN
9                37        0        NaN  ...        NaN        NaN        NaN
10               41        0        NaN  ...        NaN        NaN        NaN
11               45        0        NaN  ...        NaN        N