##### Vectorizing Clusters

In [6]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_tfidf(input_file_paths, output_file_paths, max_features=5000):
    """Fit a separate TF-IDF model per input file and save each matrix as CSV.

    Each input file is parsed as JSON and treated as a collection of
    documents. A document may be a plain string or a sequence of tokens;
    token sequences are joined with single spaces before vectorizing.

    Args:
        input_file_paths: Iterable of paths to JSON files, one per cluster.
        output_file_paths: Iterable of CSV destinations, paired by position
            with ``input_file_paths``.
        max_features: Passed to ``TfidfVectorizer(max_features=...)``.

    Raises:
        ValueError: If the two path collections differ in length.
    """
    input_file_paths = list(input_file_paths)
    output_file_paths = list(output_file_paths)
    # zip() would silently drop the unmatched tail, so fail loudly instead.
    if len(input_file_paths) != len(output_file_paths):
        raise ValueError(
            f"Expected one output path per input path, got "
            f"{len(input_file_paths)} inputs and {len(output_file_paths)} outputs"
        )

    for input_file_path, output_file_path in zip(input_file_paths, output_file_paths):
        # JSON text is UTF-8; do not rely on the platform default encoding.
        with open(input_file_path, 'r', encoding='utf-8') as file:
            tokenized_data = json.load(file)

        # TfidfVectorizer expects strings, so join token lists into one string each.
        documents = [
            doc if isinstance(doc, str) else " ".join(map(str, doc))
            for doc in tokenized_data
        ]

        # A fresh vectorizer per file: vocabularies are not shared across clusters.
        vectorizer = TfidfVectorizer(max_features=max_features)
        tfidf_matrix = vectorizer.fit_transform(documents)
        feature_names = vectorizer.get_feature_names_out()

        # toarray() yields a plain ndarray rather than the legacy np.matrix type.
        df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
        df_tfidf.to_csv(output_file_path, index=False)

        print(f"Vectorized cluster saved to {output_file_path}")

# Example usage: vectorize the four cluster files, one CSV per cluster.
input_file_paths = [f'cluster_{i}.json' for i in range(4)]
output_file_paths = [f'cluster{i}_vectorized.csv' for i in range(4)]
vectorize_tfidf(input_file_paths=input_file_paths, output_file_paths=output_file_paths)


Vectorized cluster saved to cluster0_vectorized.csv
Vectorized cluster saved to cluster1_vectorized.csv
Vectorized cluster saved to cluster2_vectorized.csv
Vectorized cluster saved to cluster3_vectorized.csv


##### Assigning Topics to Clusters 

In [14]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import csr_matrix

def display_topics_for_clusters(file_paths, num_top_words, num_topics_per_cluster=8):
    """Fit an LDA model per cluster CSV and print each topic's top words.

    Args:
        file_paths: Iterable of CSV paths; the position in the iterable is
            used as the cluster number in the printed output.
        num_top_words: Number of highest-weighted words printed per topic.
        num_topics_per_cluster: ``n_components`` for each LDA model.
    """
    for cluster_no, csv_path in enumerate(file_paths):
        # Column headers are the vocabulary; the cell values are the vectors.
        frame = pd.read_csv(csv_path)
        vocabulary = frame.columns

        # Fixed random_state keeps the fitted topics repeatable between runs.
        lda = LatentDirichletAllocation(
            n_components=num_topics_per_cluster,
            random_state=42,
        ).fit(csr_matrix(frame.values))

        print(f"\nTopics for Cluster {cluster_no}:")
        for topic_no, weights in enumerate(lda.components_):
            # Indices sorted by descending weight, cut to the requested count.
            top_ids = weights.argsort()[::-1][:num_top_words]
            print(f"Cluster {cluster_no} - Topic {topic_no}:")
            print(" ".join(vocabulary[word_id] for word_id in top_ids))

# Example usage: print the 20 top words of every topic in each cluster CSV.
file_paths = [f'cluster{i}_vectorized.csv' for i in range(4)]
num_top_words = 20
display_topics_for_clusters(file_paths=file_paths, num_top_words=num_top_words)



Topics for Cluster 0:
Cluster 0 - Topic 0:
highmark sydelabs wealth barron vulnerability illustrated suggests oracle kumari westfield vertex samtani gas clarke aaa holiday daughter drone arena syde
Cluster 0 - Topic 1:
agility hanooman shelton digit humanoid holding sml omega prasad cmu robotics seetha stealth hurst warehouse alumnus entity cassie massage bipedal
Cluster 0 - Topic 2:
chang tc tsmc murphy stanley morgan governor jersey recruitment pizzi grows adl extremism shareholder abortion audit volatility chandrasekaran wealth evil
Cluster 0 - Topic 3:
ai company technology new openai google chatgpt image india year tool generative artificial intelligence model one use microsoft time people
Cluster 0 - Topic 4:
japan desantis chandrasekhar responds noonan fcc robocalls concludes therefore exposed tom arpu telco telecom deepfakes farid hat sayeed thorn kishida
Cluster 0 - Topic 5:
drug peds athlete mit carpenter kellis xie doping cheat ped incurable molecular hurdle hunter fda alte

##### Visualization of Topics

In [None]:
pip uninstall pyLDAvis
