In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from setfit import SetFitModel
import os

# Location of the labelled dataset (CSV) to build prototypes from.
file_path = "/content/shreeallahujesus.csv"
data = pd.read_csv(file_path)

# Two columns are read: the text of each sample and its class label.
texts = data["concatenated_text"].tolist()
labels = data["class_value"].tolist()

# Load the fine-tuned SetFit model from a local directory.
model = SetFitModel.from_pretrained("/content/setfit-finetuned-model")

embeddings = model.encode(texts) # generate embeddings for every text in the dataset
# Directory that generate_text_prototypes() writes the per-class prototype
# files into; created up front so the writes cannot fail on a missing folder.
output_dir = "./prototypes"
os.makedirs(output_dir, exist_ok=True)

def generate_text_prototypes(embeddings, labels, texts, n_clusters=5):
    """Pick representative "prototype" texts for each class via k-means.

    For every distinct label, the embeddings of that class are clustered with
    KMeans and, for each centroid, the class text whose embedding is nearest
    (Euclidean distance) is kept as a prototype. The prototypes of each class
    are written to ``{output_dir}/class_{label}_prototypes.txt`` (``output_dir``
    is the module-level setting) as repeated blocks of::

        Prototype <i>:
        <text>
        <blank line>

    Args:
        embeddings: Array with one row per entry of ``texts``; it is indexed
            with a boolean mask, so it is assumed to be a NumPy array.
        labels: Sequence of class labels aligned with ``texts``.
        texts: Sequence of the raw texts.
        n_clusters: Maximum number of prototypes per class. A class with
            fewer samples than this gets one cluster per sample instead of
            making KMeans raise.

    Returns:
        dict mapping each label to the list of its prototype texts.
    """
    labels_arr = np.asarray(labels)
    texts_arr = np.asarray(texts)
    prototype_texts = {}

    for label in np.unique(labels_arr):
        # Compute the class mask once and reuse it for embeddings and texts.
        mask = labels_arr == label
        class_embeddings = embeddings[mask]
        class_texts = texts_arr[mask]

        # KMeans cannot build more clusters than there are samples.
        k = min(n_clusters, len(class_embeddings))
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(class_embeddings)

        # distances[i, j] = distance from sample i to centroid j, so the
        # column-wise argmin is the index of the nearest sample per centroid.
        distances = euclidean_distances(class_embeddings, kmeans.cluster_centers_)
        closest_texts = [class_texts[i] for i in distances.argmin(axis=0)]

        prototype_texts[label] = closest_texts

        with open(f"{output_dir}/class_{label}_prototypes.txt", "w") as f:
            f.writelines(
                f"Prototype {idx}:\n{text}\n\n"
                for idx, text in enumerate(closest_texts, start=1)
            )

    return prototype_texts

# Build the prototypes for every class and write one text file per class
# into output_dir (up to 5 prototypes each).
generate_text_prototypes(embeddings, labels, texts, n_clusters=5)


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from setfit import SetFitModel
# Human-readable names for the class ids; used to word the explanation.
label_to_class = {0: "not depressed", 1: "depressed"}

# Fine-tuned SetFit model, used below for both prediction and embeddings.
model = SetFitModel.from_pretrained("/content/setfit-finetuned-model")  # Replace with your model path

def _is_prototype_header(line):
    """Return True if *line* is a ``Prototype <number>:`` header line."""
    head, _, tail = line.strip().partition(" ")
    return head == "Prototype" and tail.endswith(":") and tail[:-1].isdigit()


def load_prototypes_from_file(label):
    """Load the prototype texts of *label* from its prototype file.

    The file ``./prototypes/class_{label}_prototypes.txt`` consists of blocks
    introduced by a ``Prototype <number>:`` header line. Everything between
    two headers belongs to one prototype, so a prototype text that spans
    several lines is returned as a single string (inner newlines kept), and a
    text line that merely begins with the word "Prototype" is not mistaken
    for a header.

    Args:
        label: Class label used to build the file name.

    Returns:
        list[str]: The prototype texts in file order, each stripped of
        surrounding whitespace; empty blocks are skipped.

    Raises:
        OSError: If the prototype file cannot be opened.
    """
    file_path = f"./prototypes/class_{label}_prototypes.txt"
    blocks = []
    current = []  # lines of the prototype currently being collected
    with open(file_path, "r") as f:
        for line in f:
            if _is_prototype_header(line):
                # A new header closes the block collected so far.
                blocks.append("\n".join(current).strip())
                current = []
            else:
                current.append(line.rstrip("\n"))
    blocks.append("\n".join(current).strip())
    # Drop empty blocks (e.g. the empty stretch before the first header).
    return [block for block in blocks if block]

def classify_and_explain(text):
    """Classify *text* and explain the prediction with its nearest prototype.

    The model predicts a label for the text, the prototypes stored for that
    label are loaded from disk, and the prototype whose embedding has the
    highest cosine similarity to the text's embedding is reported.

    Args:
        text: The input text to classify.

    Returns:
        tuple: ``(predicted_label, closest_prototype, closest_similarity,
        explanation)`` where ``explanation`` is a human-readable summary.

    Raises:
        KeyError: If the predicted label is not a key of ``label_to_class``.
        ValueError: If no prototypes are stored for the predicted label.
    """
    raw_label = model.predict([text])[0]
    # The model may hand back a NumPy scalar or a 0-dim tensor rather than a
    # plain int; unwrap it so the dict lookup below matches the integer keys
    # (tensor objects, as far as I recall, hash by identity and would miss them).
    predicted_label = raw_label.item() if hasattr(raw_label, "item") else raw_label
    class_name = label_to_class[predicted_label]

    prototypes = load_prototypes_from_file(predicted_label)
    if not prototypes:
        raise ValueError(f"No prototypes found for class {predicted_label!r}.")

    text_embedding = model.encode([text]).reshape(1, -1)
    # Encode all prototypes in one batch instead of one model call each.
    prototype_embeddings = model.encode(prototypes).reshape(len(prototypes), -1)
    similarities = cosine_similarity(text_embedding, prototype_embeddings)[0]

    closest_prototype_idx = int(np.argmax(similarities))
    closest_prototype = prototypes[closest_prototype_idx]
    closest_similarity = similarities[closest_prototype_idx]

    explanation = (
        f"The input text was classified as {class_name}. "
        f"It is most similar to the following prototype:\n\n"
        f"\"{closest_prototype}\"\n\n"
        f"Similarity Score: {closest_similarity:.2f}"
    )

    return predicted_label, closest_prototype, closest_similarity, explanation

# Text to classify. NOTE(review): this is an empty string here — presumably a
# placeholder to be filled in before running the cell; confirm.
input_text = ""
predicted_label, closest_prototype, similarity_score, explanation = classify_and_explain(input_text)

print(explanation)
