In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Restore the fine-tuned classifier and its tokenizer. Both are resolved from
# the same 'savedmodels' identifier so the vocabulary matches the weights.
loaded_tokenizer = BertTokenizer.from_pretrained('savedmodels')
loaded_model = BertForSequenceClassification.from_pretrained('savedmodels')


In [None]:
def requirement_to_embedding(model, tokenizer, requirement):
    """Return the final-layer [CLS] embedding of ``requirement`` as a NumPy array.

    Args:
        model: A model whose forward pass accepts ``output_hidden_states=True``
            and returns an object exposing ``hidden_states`` (a sequence of
            per-layer tensors indexed as ``[batch, token, hidden]``).
        tokenizer: Callable that turns text into a batch of ``"pt"`` tensors
            and whose result supports ``.to(device)`` and ``**`` unpacking.
        requirement: The text to embed. Only the first sequence of the batch
            is used.

    Returns:
        numpy.ndarray: The hidden vector of the first token of the first
        sequence, taken from the last entry of ``hidden_states``.
    """
    import torch  # local import: this cell is defined before the notebook imports torch

    # Run on whatever device the model lives on; fall back to CPU when the
    # model exposes no parameters to inspect.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cpu")

    inputs = tokenizer(requirement, padding=True, truncation=True, return_tensors="pt")
    inputs = inputs.to(device)

    # Inference only: skip autograd bookkeeping, and explicitly request the
    # per-layer hidden states (they are not returned unless asked for).
    with torch.no_grad():
        output = model(**inputs, output_hidden_states=True)

    hidden_states = output.hidden_states
    # Last layer -> first sequence in the batch -> first token ([CLS]).
    embedding = hidden_states[-1][0][0]
    # Move to CPU before converting; .numpy() is not available on GPU tensors.
    return embedding.detach().cpu().numpy()

In [None]:
import torch
import numpy as np

# Preferred compute device: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): `model`, `tokenizer` and `requirements` are not defined in the
# cells visible here (the loading cell binds `loaded_model` / `loaded_tokenizer`).
# Confirm which model/tokenizer pair is intended before a Restart & Run All.
embeddings = [
    requirement_to_embedding(model, tokenizer, requirement)
    for requirement in requirements
]

# Stack the per-requirement vectors into a single 2-D array (one row each).
embedd_array = np.stack(embeddings)
embedd_array.shape

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# t-SNE hyperparameters (adjust as needed).
perplexity_value = 50
learning_rate_value = 10

# Project the embeddings to 2-D; random_state is fixed so the layout is repeatable.
tsne = TSNE(n_components=2, perplexity=perplexity_value, learning_rate=learning_rate_value, random_state=42)
embeddings_2d = tsne.fit_transform(embedd_array)

# Binary class ids taken from the source frame (0 for 'requirement', 1 for 'standard').
labels = df.label.map({'standard': 1, 'requirement': 0}).tolist()

# One colour per class, circle markers for both.
colors = sns.color_palette('Set1', n_colors=2)
markers = 'o'

# Keep the 2-D coordinates in their own frame. Rebinding `df` here would make
# the cell non-idempotent: on a second run `df.label` would already hold 0/1,
# the mapping above would yield NaN, and `int(label)` below would fail.
tsne_df = pd.DataFrame(embeddings_2d, columns=['Dimension 1', 'Dimension 2'])
tsne_df['label'] = labels

# Scatter plot of the t-SNE embedding, one layer per class.
plt.figure(figsize=(10, 8))
sns.set(style='whitegrid')

class_ids = sorted(set(labels))  # deterministic draw/legend order
for label in class_ids:
    df_label = tsne_df[tsne_df['label'] == label]
    # The legend is built by hand below, so seaborn's automatic one is disabled.
    sns.scatterplot(data=df_label, x='Dimension 1', y='Dimension 2', color=colors[int(label)],
                    marker=markers, edgecolor='k', s=100, legend=False)

# Hand-built legend mapping class ids back to their original names.
class_labels = ['requirement', 'standard']
legend_handles = [plt.Line2D([0], [0], marker=markers, color='w', label=class_labels[int(label)], markersize=10,
                              markerfacecolor=colors[int(label)]) for label in class_ids]
plt.legend(handles=legend_handles, title='Classes')

plt.title("t-SNE Visualization of Embeddings with Class Labels and IDs", fontsize=16)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(False)  # Turn off gridlines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Save the plot as a PNG image.
# NOTE(review): the file name says "perp6" and "nottrained" while perplexity_value
# is 50 and the notebook loads a fine-tuned model; confirm the intended name.
plt.savefig('tsne_visualization_perp6_nottrained.png', dpi=300, bbox_inches='tight')

plt.show()