In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os, sys, json
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

import llm_model as utils

In [3]:
data = pd.read_json("./json.json")
data = pd.DataFrame(data)

In [4]:
from langchain.embeddings.openai import OpenAIEmbeddings

embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

embedded_documents = []
for code_content in data['code_content']:
    embedded_document = embed_model.embed_documents([code_content])
    embedded_documents.append(embedded_document)


In [5]:
import numpy as np
# Reshape job_embeddings to have two dimensions
embeddings_2d = np.reshape(embedded_documents, (len(embedded_documents), -1))

In [6]:
optimal_k = utils.silhouette_to_find_optimal_k(embeddings_2d)
optimal_k

8

In [7]:
cluster_labels, job_embeddings = utils.kmeans_clustering(optimal_k, embeddings_2d)
data['text_embedding_cluster'] = cluster_labels

In [8]:
# Create a dictionary to store the word counts per cluster and user
word_counts = {i: {} for i in range(optimal_k)}

# Iterate over the DataFrame rows
for index, row in data.iterrows():
    user_id = row['user_id']
    cluster_label = row['text_embedding_cluster']
    code_content = row['code_content']
    
    # Split the code content into words
    words = code_content.split()
    
    # Update the word counts for the cluster and user
    for word in words:
        if word in word_counts[cluster_label]:
            if user_id in word_counts[cluster_label][word]:
                word_counts[cluster_label][word][user_id] += 1
            else:
                word_counts[cluster_label][word][user_id] = 1
        else:
            word_counts[cluster_label][word] = {user_id: 1}
# Create a list to store the JSON items
json_items = []
# Print the word counts for each cluster and user
for cluster_label, words in word_counts.items():
    print(f"Cluster {cluster_label}:")
    for word, user_counts in words.items():
        # Create a dictionary for the JSON item
        json_item = {
            'Word': word,
            'Number of Users': len(user_counts),
            'User IDs and Counts': user_counts
        }
        
        # Append the JSON item to the list
        json_items.append(json_item)

# # Save the JSON items to a JSON file
# output_file = 'freq_counts.json'
# with open(output_file, 'w') as file:
#     json.dump(json_items, file)

Cluster 0:
Cluster 1:
Cluster 2:
Cluster 3:
Cluster 4:
Cluster 5:
Cluster 6:
Cluster 7:


In [9]:
from collections import Counter

# Assuming you have a list of embedded documents called 'embedded_documents'
word_frequencies = Counter()

# Iterate over the embedded documents
for embedded_document in embedded_documents:
    # Flatten the embedded document into a single list of words
    words = [word for sublist in embedded_document for word in sublist]
    
    # Update the word frequencies
    word_frequencies.update(words)

# Print the most common words and their frequencies
for word, frequency in word_frequencies.most_common():
    print(f"{word}: {frequency}")

-0.014675685789496197: 15
-0.015546969702451499: 12
0.01895042091582634: 9
-0.007950464075902669: 9
0.01838764454428501: 6
-0.013055642758610037: 6
-0.007160863204409904: 6
-0.008072988870927113: 6
0.010339687334331276: 6
-0.005489768330817317: 6
-0.013225814946749759: 6
0.02020289142428427: 6
-0.012470248792281704: 6
-0.030195427345798907: 6
0.030767207239052845: 6
-0.003614466319229662: 6
-0.019304381227825285: 6
0.03155680671356178: 6
-0.03340828712406754: 6
0.010891047049296176: 6
0.03294541609011855: 6
0.018283344373696747: 6
-0.0036485008034237336: 6
-0.004278139575921296: 6
0.024137282174334973: 6
0.015247465682750136: 6
-0.015968996654532202: 6
-0.03958895709055592: 6
0.016826668357058216: 6
-0.010203549397554988: 6
0.009121252008559334: 6
-0.005908393161613252: 6
0.014063064608341627: 6
-0.007684994773226014: 6
-0.010931888337196806: 6
0.00961134932601201: 6
-0.015737563000202813: 6
0.020461555087407556: 6
-0.023170701612504015: 6
0.02170041152279109: 6
-0.026982569430175376: 

In [10]:
# from sklearn.cluster import KMeans
# import numpy as np

# # Assuming you have the word frequencies stored in the 'word_frequencies' Counter object

# # Convert the word frequencies into a list of tuples
# word_frequency_tuples = list(word_frequencies.items())

# # Extract the words and frequencies separately
# words, frequencies = zip(*word_frequency_tuples)

# # Convert the frequencies into a numpy array
# X = np.array(frequencies).reshape(-1, 1)

# # Specify the desired number of clusters (K) for k-means
# #

# # Apply k-means clustering
# kmeans = KMeans(n_clusters=optimal_k)
# kmeans.fit(X)

# # Get the cluster labels
# cluster_labels = kmeans.labels_

# # Analyze the resulting clusters
# for cluster in range(optimal_k):
#     words_in_cluster = np.array(words)[cluster_labels == cluster]
#     words_in_cluster = [str(word) for word in words_in_cluster]  # Convert array elements to strings
#     print(f"Cluster {cluster}: {', '.join(words_in_cluster)}")


#     cluster_vectors = X[cluster_labels == cluster]  # Get the embedded vectors for the cluster
#      # Create a scatter plot for the cluster
#     plt.figure(figsize=(8, 8))
#     plt.scatter(cluster_vectors, cluster_vectors, c='blue', alpha=0.5)
#     plt.title(f"Cluster {cluster}")
#     plt.xlabel("Frequency")
#     plt.ylabel("Frequency")
#     plt.show()

# # Print the embedded words of each cluster
# for cluster in range(optimal_k):
#     cluster_words = np.array(words)[cluster_labels == cluster]
#     print(f"Words in Cluster {cluster}: {', '.join(cluster_words)}")

In [14]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np

# Assuming you have the word frequencies stored in the 'word_frequencies' Counter object

# Convert the word frequencies into a list of tuples
word_frequency_tuples = list(word_frequencies.items())

# Extract the words and frequencies separately
words, frequencies = zip(*word_frequency_tuples)

# Convert the frequencies into a numpy array
X = np.array(frequencies).reshape(-1, 1)

# Specify the desired number of clusters (K) for k-means
# optimal_k = 5

# Apply k-means clustering
kmeans = KMeans(n_clusters=optimal_k)
kmeans.fit(X)

# Get the cluster labels
cluster_labels = kmeans.labels_

# Analyze the resulting clusters
for cluster in range(optimal_k):
    words_in_cluster = np.array(words)[cluster_labels == cluster]
    words_in_cluster = [str(word) for word in words_in_cluster]  # Convert array elements to strings
    #print(f"Cluster {cluster}: {', '.join(words_in_cluster)}")

    cluster_vectors = X[cluster_labels == cluster]  # Get the embedded vectors for the cluster
    
    # Create a scatter plot for the cluster
    # plt.figure(figsize=(8, 8))
    # plt.scatter(cluster_vectors, cluster_vectors, c='blue', alpha=0.5)
    # plt.title(f"Cluster {cluster}")
    # plt.xlabel("Frequency")
    # plt.ylabel("Frequency")
    # plt.show()

    # Print the embedded words of each cluster
    # Print the embedded words of each cluster
    cluster_words = np.array(words)[cluster_labels == cluster]
    cluster_words = [str(word) for word in cluster_words if isinstance(word, str)]  # Include only strings
    print(f"Words in Cluster {cluster}: {', '.join(cluster_words)}")

Words in Cluster 0: 
Words in Cluster 1: 
Words in Cluster 2: 
Words in Cluster 3: 
Words in Cluster 4: 
Words in Cluster 5: 
Words in Cluster 6: 
Words in Cluster 7: 


In [None]:
# Filter the word counts for the specified cluster
cluster_words = result_df[result_df['Cluster'] == cluster_to_visualize]['Word']
# Print the words associated with the cluster
print("Words in the cluster:")
for word in cluster_words:
    print(word)