In [1]:
# Master switch for Weights & Biases experiment tracking; later cells check this flag before logging.
WANDB_INTEGRATION = True
if WANDB_INTEGRATION:
    # Imported only when tracking is enabled.
    import wandb

    # Authenticate this session with the W&B service.
    wandb.login()

[34m[1mwandb[0m: Network error (ConnectionError), entering retry loop.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from transformers import BertTokenizer, BertModel


In [3]:
# Read the code-review CSV (path is relative to the notebook's working directory)
csv_path = 'Code Review.csv'
data = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows to sanity-check the columns
data.head(n=5)

Unnamed: 0,Code Review Title,File,Code Changes,Comment by Developer A,Comment by Developer B
0,Security Enhancement: Input Sanitization,security_utils.py,Implemented input sanitization for user inputs...,"""Good job on adding input sanitization. It's a...","""Agreed input sanitization is a must. We sho..."
1,Adding New Feature: User Profile Pics,profile_manager.py,Implemented user profile picture upload featur...,"""Great addition! We might want to validate ima...","""Nice work! I think we should also consider a..."
2,Bug Fix in Payment Processing,payment_processor.py,Fixed a bug causing payment failures. Lines Ch...,"""Good catch on fixing the payment bug. Let's m...","""Thanks for addressing this issue. I noticed ..."
3,Refactoring Database Queries,database_utils.py,Optimized database queries for faster performa...,"""Nice job on the query optimization. This shou...","""Agreed! This will definitely help with perfo..."
4,Enhancing Search Functionality,search_manager.py,Improved search algorithm for faster results. ...,"""Great optimization! This will significantly i...","""Nice work! Did you consider adding support f..."


In [5]:
# Join both reviewers' remarks into one text field, separated by a single space.
# A missing value on either side leaves the combined value missing as well.
developer_a_comments = data['Comment by Developer A']
developer_b_comments = data['Comment by Developer B']
data['Combined Comment'] = developer_a_comments + ' ' + developer_b_comments

In [6]:
# Keep only the rows whose combined comment is present (not NaN)
has_combined_comment = data['Combined Comment'].notna()
data = data[has_combined_comment]

In [7]:
# Fetch the pretrained tokenizer and encoder from the same checkpoint so they stay in sync
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Initialize wandb run
# Start exactly one W&B run, and only when tracking is enabled. The previous
# version also called wandb.init() unconditionally (ignoring WANDB_INTEGRATION),
# which created a throwaway run before the named one below.
if WANDB_INTEGRATION:
    wandb.init(project="anomaly_detection_example", name="text_embeddings_clustering")

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

Problem at: <ipython-input-9-c57ffc608154> 2 <module>


CommError: Run initialization has timed out after 60.0 sec. 
Please refer to the documentation for additional information: https://docs.wandb.ai/guides/track/tracking-faq#initstarterror-error-communicating-with-wandb-process-

In [None]:
# Define a function to generate embeddings
def generate_embeddings(text, max_length=128):
    """Embed one text with BERT using attention-masked mean pooling.

    Args:
        text: The string to embed.
        max_length: Token length the input is padded or truncated to.

    Returns:
        A tensor with a leading batch axis of size 1 holding the average of the
        last-layer hidden states over the real (non-padding) tokens.
    """
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,  # without this, texts longer than max_length are not cut
        max_length=max_length,
        return_tensors='pt',
    )
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state
        # Zero out padding positions so they do not dilute the average.
        token_mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * token_mask).sum(dim=1)
        # clamp guards against division by zero should the mask ever be empty
        token_counts = token_mask.sum(dim=1).clamp(min=1)
        embeddings = summed / token_counts
    return embeddings

In [None]:
# Embed every combined comment, keeping one tensor per row in row order
comment_embeddings = list(map(generate_embeddings, data['Combined Comment']))

In [None]:
from sklearn.decomposition import PCA
# Stack the per-comment tensors along a new leading axis and hand them to NumPy
stacked_embeddings = torch.stack(comment_embeddings)
embeddings_np = stacked_embeddings.numpy()

# Flatten each comment to a single feature row, then project onto two principal components
n_comments = embeddings_np.shape[0]
flat_embeddings = embeddings_np.reshape(n_comments, -1)
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(flat_embeddings)



In [None]:
# Split the PCA-reduced points into two groups with k-means (fixed seed for repeatability)
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(reduced_embeddings)
cluster_labels = kmeans.labels_

In [None]:
# Log metrics to wandb
# Score the clustering on the same PCA-reduced points KMeans was fit on
# (the previously referenced `embeddings_tensor` was never defined).
if WANDB_INTEGRATION:
    clustering_metrics = {
        "Silhouette Score": silhouette_score(reduced_embeddings, cluster_labels),
        "Davies-Bouldin Index": davies_bouldin_score(reduced_embeddings, cluster_labels),
    }
    # A single log call keeps both metrics on the same W&B step.
    wandb.log(clustering_metrics)


In [None]:
# Label the clusters: whichever cluster holds the first sample is treated as "normal"
# (an assumption, not a measurement); with two clusters the other one is the anomaly.
first_sample_label = cluster_labels[0]
normal_cluster_index = first_sample_label
anomaly_cluster_index = 1 - first_sample_label

In [None]:
# Visualize clusters
# Plot the two PCA components that KMeans was fit on, one colour per cluster
# (the previously referenced `embeddings_tensor` was never defined).
fig, ax = plt.subplots(figsize=(10, 6))
colors = ['r', 'b']
for i in range(num_clusters):
    cluster_mask = cluster_labels == i
    ax.scatter(
        reduced_embeddings[cluster_mask, 0],
        reduced_embeddings[cluster_mask, 1],
        c=colors[i % len(colors)],  # wrap around so extra clusters do not raise IndexError
        label=f'Cluster {i}',
    )
ax.legend()
ax.set_title('Cluster Visualization')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
plt.show()