Generate a list of topics that will be used as reference for labeling and matching

In [1]:
import pandas as pd
import random
import csv
import re
import string

from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util

# Define initial topic list
initial_topics = [
    "Machine Learning, ML",
    "Deep Learning, DL",
    "Artificial Intelligence, AI",
    "Cybersecurity, Information Security",
    "Mobile App Development, Mobile Apps",
    "Web Development, Web Programming",
    "Internet of Things, IoT",
    "Cloud Computing",
    "Data Science, Data Analytics",
    "Big Data",
    "Blockchain, Distributed Ledger",
    "Augmented Reality, AR",
    "Virtual Reality, VR",
    "Natural Language Processing, NLP",
    "Computer Vision",
    "Robotics, Robotics Engineering",
    "Software Engineering, Software Development",
    "Networking, Computer Networks",
    "Database Management, DBMS",
    "Computer Security, Cybersecurity",
    "Game Development, Game Design",
    "Human-Computer Interaction, HCI",
    "E-commerce, Online Retail",
    "Social Media, Social Networking",
    "UI/UX Design",
    "Software Testing, Quality Assurance",
    "Chatbots, Conversational AI",
    "Recommender Systems",
    "Speech Recognition",
]

def clean_text(s, comma=False):
    s = s.lower()
    s = s.split()
    s = " ".join(s)
    # Add a comma if there is any '\' or '/' in the string
    if comma:
        s = re.sub(r'\\', ',', s)
        s = re.sub(r'/', ',', s)
    else:
        s = re.sub(f'[{re.escape(string.punctuation)}]', '', s)
    return s

def remove_stop_words(s):
    stop_words = set(stopwords.words('english'))
    s = s.split()
    s = [w for w in s if not w.lower() in stop_words]
    s = " ".join(s)
    return s

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read topics from the CSV file and append them to the topic list
with open("data\\supervisors_list.csv", mode="r", encoding="utf-8-sig") as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        # Parse the list of topics if present
        for field in ["Expertise Area 1", "Expertise Area 2", "Expertise Area 3"]:
            if row.get(field, ""):
                try:
                    # Convert the string representation of a list into an actual list
                    topic_list = eval(row.get(field, "[]"))
                    if isinstance(topic_list, list):
                        # Exclude "N/A" and add valid topics to the list
                        initial_topics.extend([topic for topic in topic_list if topic and topic != "N/A"])
                except Exception as e:
                    print(f"Error parsing topics in field '{field}': {e}")

new_topics = []

# Read topics from the staff_proflies CSV file and append them to a new list
with open("data\\staff_profiles.csv", mode="r", encoding="utf-8-sig") as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        # Parse the list of topics if present
        for field in ["research_interests", "teaching_areas", "courses_taught"]:
            if row.get(field, ""):
                try:
                    # Convert the string representation of a list into an actual list
                    topic_list = eval(row.get(field, "[]"))
                    if isinstance(topic_list, list):
                        # Exclude "N/A" and add valid topics to the list
                        new_topics.extend([topic for topic in topic_list if topic and topic != "N/A"])
                except Exception as e:
                    print(f"Error parsing topics in field '{field}': {e}")

topics = list(set(initial_topics))
new_topics = list(set(new_topics))

model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings for all topics
topic_embeddings = model.encode(topics, convert_to_tensor=True)
new_topics_embeddings = model.encode(new_topics, convert_to_tensor=True)

# Identify and remove similar topics
def find_similar_topics(embeddings, selected_topics, threshold=0.7):
    unique_topics = []
    for i, topic in enumerate(selected_topics):
        is_similar = False
        for j, unique_topic in enumerate(unique_topics):
            sim = util.cos_sim(embeddings[i], embeddings[j])
            if sim > threshold:
                is_similar = True
                break
        if not is_similar:
            unique_topics.append(topic)
    return unique_topics

# Create a loop where the threshold increments by 0.1 for each iteration
# In each loop, use find_similar_topics and save the results in a csv file with the threshold in the filename
thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
for threshold in thresholds:
    filtered_topics = find_similar_topics(topic_embeddings, topics, threshold)
    
    # Save the filtered topics to a CSV file
    filename = f"data\\filtered_topics_threshold_{threshold}.csv"
    with open(filename, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Topic"])  # Write the header
        for topic in filtered_topics:
            writer.writerow([clean_text(topic, comma=True)])  # Write each topic as a row

    filtered_new_topics = find_similar_topics(new_topics_embeddings, new_topics, threshold)
    filtered_combined_topics = list(set(filtered_topics + filtered_new_topics))
    filtered_combined_topics = find_similar_topics(model.encode(filtered_combined_topics, convert_to_tensor=True), filtered_combined_topics, threshold)

    # Save the combined filtered topics to a CSV file
    filename_combined = f"data\\filtered_combined_topics_threshold_{threshold}.csv"
    with open(filename_combined, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Topic"])
        for topic in filtered_combined_topics:
            writer.writerow([clean_text(topic, comma=True)])

    print(f"\nFiltered topics with threshold {threshold} have been saved")


Filtered topics with threshold 0.5 have been saved

Filtered topics with threshold 0.6 have been saved

Filtered topics with threshold 0.7 have been saved

Filtered topics with threshold 0.8 have been saved

Filtered topics with threshold 0.9 have been saved


Create fucntions to label students based on the list of topcis 

In [20]:
import spacy
from transformers import pipeline
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util


# Load models
sentiment_analyzer = pipeline("sentiment-analysis", device=0)
model = SentenceTransformer('all-MiniLM-L6-v2')
nlp = spacy.load("en_core_web_sm")
kw_model = KeyBERT(model)

def extract_candidate_topics(text, top_n=5):
    # KeyBERT keywords
    keywords = [kw[0] for kw in kw_model.extract_keywords(text, top_n=top_n, stop_words='english')]
    # spaCy noun chunks
    doc = nlp(text)
    noun_chunks = [chunk.text.strip() for chunk in doc.noun_chunks]
    # Combine and deduplicate
    candidates = list(set(keywords + noun_chunks))
    return candidates

def extract_topic_context(text, topic, window_size=10):
    """Extract the context around a topic mention for better sentiment analysis"""
    words = text.lower().split()
    topic_words = topic.lower().split()
    for i in range(len(words)):
        if all(tw in words[i:i+len(topic_words)] for tw in topic_words):
            start = max(0, i - window_size)
            end = min(len(words), i + len(topic_words) + window_size)
            return " ".join(words[start:end])
    return None

def calculate_similarities(student_text, topic_embs):
    """Embed student text and lecturer topics for similarity comparison"""
    clean_text_full = clean_text(student_text)
    student_emb = model.encode(clean_text_full, convert_to_tensor=True).cuda()
    # topic_embs = model.encode(lecturer_topics, convert_to_tensor=True).cuda()
    similarities = util.cos_sim(student_emb, topic_embs)[0].cpu().numpy()
    return similarities

def predict_match(strong_matches, candidate_matches, candidate_topics, student_text, lecturer_topics, threshold, sentiment_threshold, top_n=3):
    positive_topics = set()
    negative_topics = set()
    # Compute embeddings for all topics at once
    # student_emb = model.encode(clean_text_full, convert_to_tensor=True)
    # topic_embs = model.encode(lecturer_topics, convert_to_tensor=True)
    # similarities = util.cos_sim(student_emb, topic_embs)[0].cpu().numpy()
    # Get top-N most similar topics
    top_indices = strong_matches.argsort()[-top_n:][::-1]
    for idx in top_indices:
        sim = strong_matches[idx]
        if sim > threshold:
            topic = lecturer_topics[idx]
            topic_context = extract_topic_context(student_text, topic)
            if topic_context:
                sentiment = sentiment_analyzer(topic_context)[0]
                if sentiment["label"] == "NEGATIVE" and sentiment["score"] > sentiment_threshold:
                    negative_topics.add(topic)
                elif sentiment["label"] == "POSITIVE" and sentiment["score"] > sentiment_threshold:
                    positive_topics.add(topic)
                # If neutral or low confidence, skip
    # If no strong matches, try candidate topics
    if not positive_topics and not negative_topics:
        # candidate_embs = model.encode(candidate_topics, convert_to_tensor=True)
        # candidate_sims = util.cos_sim(student_emb, candidate_embs)[0].cpu().numpy()
        for i, sim in enumerate(candidate_matches):
            if sim > threshold:
                topic = candidate_topics[i]
                topic_context = extract_topic_context(student_text, topic)
                if topic_context:
                    sentiment = sentiment_analyzer(topic_context)[0]
                    if sentiment["label"] == "NEGATIVE" and sentiment["score"] > sentiment_threshold:
                        negative_topics.add(clean_text(topic))
                    elif sentiment["label"] == "POSITIVE" and sentiment["score"] > sentiment_threshold:
                        positive_topics.add(clean_text(topic))
    return {
        "positive_topics": list(positive_topics),
        "negative_topics": list(negative_topics)
    }

"""def predict_match(student_text, lecturer_topics, threshold=0.5):
    student_sentences = student_text.split(". ")  # Split student text into sentences
    positive_topics = []
    negative_topics = []

    for sentence in student_sentences:
        clean_sentence = clean_text(sentence)
        student_emb = model.encode(clean_sentence, convert_to_tensor=True)
        topic_embs = model.encode(lecturer_topics, convert_to_tensor=True)
        
        # Compute similarities between the sentence and lecturer topics
        similarities = util.cos_sim(student_emb, topic_embs)
        max_sim = float(similarities.max())  # Get the maximum similarity score

        # Check if there is any similarity in existing topics
        # If not, extract a new topic
        # and classify it as positive or negative
        if max_sim < threshold:
            new_topics = extract_candidate_topics(clean_sentence)
            for kw in new_topics:
                kw_emb = model.encode(kw, convert_to_tensor=True)
                similarities = util.cos_sim(kw_emb, topic_embs)
                max_sim_kw = float(similarities.max())  # Get the maximum similarity score
                if max_sim_kw < threshold:
                    lecturer_topics.append(kw)
                    # print(f"New topic added: {kw}")
                    # print(f"Sentence: {sentence}")
                else:
                    topic = lecturer_topics[similarities.argmax()]
                    sentiment = sentiment_analyzer(clean_sentence)[0]
                    if sentiment["label"] == "NEGATIVE":
                        negative_topics.append(clean_text(topic))
                    else:
                        positive_topics.append(clean_text(topic))
            # sentiment = sentiment_analyzer(clean_sentence)[0]
            # if sentiment["label"] == "NEGATIVE":
            #     negative_topics.append(new_topic)
            # else:
            #     positive_topics.append(new_topic)
            # continue

        # Match all relevant topics in the sentence
        for idx, sim in enumerate(similarities[0]):
            if sim > threshold:
                topic = lecturer_topics[idx]

                # Use sentiment analysis to classify the sentence
                sentiment = sentiment_analyzer(clean_sentence)[0]
                if sentiment["label"] == "NEGATIVE":
                    negative_topics.append(clean_text(topic))
                else:
                    positive_topics.append(clean_text(topic))

    return {
        "positive_topics": list(set(positive_topics)),  # Remove duplicates
        "negative_topics": list(set(negative_topics))   # Remove duplicates
    }
    """

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


'def predict_match(student_text, lecturer_topics, threshold=0.5):\n    student_sentences = student_text.split(". ")  # Split student text into sentences\n    positive_topics = []\n    negative_topics = []\n\n    for sentence in student_sentences:\n        clean_sentence = clean_text(sentence)\n        student_emb = model.encode(clean_sentence, convert_to_tensor=True)\n        topic_embs = model.encode(lecturer_topics, convert_to_tensor=True)\n\n        # Compute similarities between the sentence and lecturer topics\n        similarities = util.cos_sim(student_emb, topic_embs)\n        max_sim = float(similarities.max())  # Get the maximum similarity score\n\n        # Check if there is any similarity in existing topics\n        # If not, extract a new topic\n        # and classify it as positive or negative\n        if max_sim < threshold:\n            new_topics = extract_candidate_topics(clean_sentence)\n            for kw in new_topics:\n                kw_emb = model.encode(kw, con

Utilise LLM generated sentences to test and determine accuracy of model, hyper-tuning begins here.

In [24]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score

# List of CSV files to process
sentence_files = [
    # "data/cgpt_sentences.csv",
    "data/claude_sentences.csv",
    # "data/deepseek_sentences.csv",
    # "data/gemini_sentences.csv",
    # "data/linkedin_sentences.csv"
]
topic_files = [ 
    "data/filtered_combined_topics_threshold_0.5.csv",
    "data/filtered_combined_topics_threshold_0.6.csv",
    "data/filtered_combined_topics_threshold_0.7.csv",
    "data/filtered_combined_topics_threshold_0.8.csv",
    "data/filtered_combined_topics_threshold_0.9.csv",
    "data/filtered_topics_threshold_0.5.csv",
    "data/filtered_topics_threshold_0.6.csv",
    "data/filtered_topics_threshold_0.7.csv",
    "data/filtered_topics_threshold_0.8.csv",
    "data/filtered_topics_threshold_0.9.csv",
]

# Threshold for topic matching
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
sentiment_thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]

# Lecturer topics
lecturer_topics = open("data/filtered_topics.csv", mode="r", encoding="utf-8").read().splitlines()[1:]  # Skip header

def evaluate_predictions(true_positive_topics, true_negative_topics, 
                        predicted_positive_topics, predicted_negative_topics,
                        true_topic_emb_dict):
    true_labels = []
    predicted_labels = []
    topic_coverage = []
    irrelevant_predictions = []
    
    for i in range(len(true_positive_topics)):
        row_predictions = set()
        total_true_topics = len(true_positive_topics[i]) + len(true_negative_topics[i])
        matched_topics = 0
        
        # Evaluate positive topics
        for topic in true_positive_topics[i]:
            true_labels.append(1)
            found = False
            true_topic_emb = true_topic_emb_dict.get(topic.strip())
            for pred_topic in predicted_positive_topics[i]:
                pred_topic_emb = model.encode(pred_topic, convert_to_tensor=True)
                if true_topic_emb is not None:
                    similarity = util.cos_sim(pred_topic_emb, true_topic_emb)
                    if similarity > 0.8:
                        predicted_labels.append(1)
                        found = True
                        row_predictions.add(pred_topic)
                        matched_topics += 1
                        break
            if not found:
                predicted_labels.append(0)

        # Evaluate negative topics
        for topic in true_negative_topics[i]:
            true_labels.append(0)
            found = False
            true_topic_emb = true_topic_emb_dict.get(topic.strip())
            for pred_topic in predicted_negative_topics[i]:
                pred_topic_emb = model.encode(pred_topic, convert_to_tensor=True)
                if true_topic_emb is not None:
                    similarity = util.cos_sim(pred_topic_emb, true_topic_emb)
                    if similarity > 0.8:
                        predicted_labels.append(0)
                        found = True
                        row_predictions.add(pred_topic)
                        matched_topics += 1
                        break
            if not found:
                predicted_labels.append(1)
        
        # Calculate topic coverage for this row
        topic_coverage.append(matched_topics / total_true_topics if total_true_topics > 0 else 1.0)
        
        # Count irrelevant predictions for this row
        current_predictions = predicted_positive_topics[i] + predicted_negative_topics[i]
        irrelevant_count = 0
        for pred_topic in current_predictions:
            if pred_topic not in row_predictions:
                true_labels.append(0)
                predicted_labels.append(1)
                irrelevant_count += 1
        
        irrelevant_predictions.append(irrelevant_count)

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1_pos = f1_score(true_labels, predicted_labels, pos_label=1)
    f1_neg = f1_score(true_labels, predicted_labels, pos_label=0)
    precision_pos = precision_score(true_labels, predicted_labels, pos_label=1)
    precision_neg = precision_score(true_labels, predicted_labels, pos_label=0)
    recall_pos = recall_score(true_labels, predicted_labels, pos_label=1)
    recall_neg = recall_score(true_labels, predicted_labels, pos_label=0)
    avg_topic_coverage = sum(topic_coverage) / len(topic_coverage)
    avg_irrelevant_rate = sum(irrelevant_predictions) / len(irrelevant_predictions)

    metrics = {
        'accuracy': accuracy,
        'f1_positive': f1_pos,
        'f1_negative': f1_neg,
        'precision_positive': precision_pos,
        'precision_negative': precision_neg,
        'recall_positive': recall_pos,
        'recall_negative': recall_neg,
        'topic_coverage': avg_topic_coverage,
        'irrelevant_prediction_rate': avg_irrelevant_rate,
        'true_labels': true_labels,
        'predicted_labels': predicted_labels
    }

    return metrics

# Function to evaluate accuracy and generate classification report
def evaluate_predict_match(file_path, lecturer_topics, thresholds, sentiment_thresholds):
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Ensure the CSV has the required columns
    if "sentence" not in df.columns or "positive_topic" not in df.columns or "negative_topic" not in df.columns:
        print(f"Error: {file_path} does not contain the required columns.")
        return

    # Extract true labels for positive and negative topics
    true_positive_topics = df["positive_topic"].apply(lambda x: x.split(",") if pd.notna(x) else [])
    true_negative_topics = df["negative_topic"].apply(lambda x: x.split(",") if pd.notna(x) else [])

    # Initialize nested dictionaries
    predicted_positive_topics = {st: {t: [] for t in thresholds} for st in sentiment_thresholds}
    predicted_negative_topics = {st: {t: [] for t in thresholds} for st in sentiment_thresholds}
    lecturer_topics_embeddings = model.encode(open(lecturer_topics, mode="r", encoding="utf-8").read().splitlines(), convert_to_tensor=True).cuda()

    for _, row in df.iterrows():
        student_text = row["sentence"]
        strong_matches = calculate_similarities(student_text, lecturer_topics_embeddings)
        candidate_topics = extract_candidate_topics(student_text, top_n=5)
        candidate_topics_embeddings = model.encode(candidate_topics, convert_to_tensor=True).cuda()
        candidate_matches = calculate_similarities(student_text, candidate_topics_embeddings)

        for sentiment_threshold in sentiment_thresholds:
            for threshold in thresholds:
                # Use the predict_match function to get positive and negative topics
                result = predict_match(strong_matches, candidate_matches, candidate_topics, student_text, open(lecturer_topics, mode="r", encoding="utf-8").read().splitlines(), threshold, sentiment_threshold)
                # Append predicted topics
                predicted_positive_topics[sentiment_threshold][threshold].append(result["positive_topics"])
                predicted_negative_topics[sentiment_threshold][threshold].append(result["negative_topics"])

    # Extract all unique true topics
    all_true_topics = set()
    for topics in true_positive_topics:
        all_true_topics.update(topics)
    for topics in true_negative_topics:
        all_true_topics.update(topics)
    all_true_topics = {t.strip() for t in all_true_topics if t and t.strip()}

    # Compute embeddings for all unique true topics
    true_topic_emb_dict = {}
    if all_true_topics:
        topic_list = list(all_true_topics)
        topic_embs = model.encode(topic_list, convert_to_tensor=True)
        for idx, topic in enumerate(topic_list):
            true_topic_emb_dict[topic] = topic_embs[idx]

    for sentiment_threshold in sentiment_thresholds:
            for threshold in thresholds:
                metrics = evaluate_predictions(
                        true_positive_topics.tolist(),
                        true_negative_topics.tolist(),
                        predicted_positive_topics[sentiment_threshold][threshold],
                        predicted_negative_topics[sentiment_threshold][threshold],
                        true_topic_emb_dict
                    )

                create_summary_table(file_path, lecturer_topics, threshold, sentiment_threshold, metrics)

                report = classification_report(
                    metrics['true_labels'], 
                    metrics['predicted_labels'],
                    target_names=["Negative Interest", "Positive Interest"]
                )

                # Print results
                print(f"Results for {file_path} and {lecturer_topics} at threshold ({threshold}, {sentiment_threshold}):")
                print(f"Accuracy: {metrics['accuracy']:.3f}")
                print("Classification Report:")
                print(report)
                print("-" * 50)

                # Store report in a text file - using append mode
                report_filename = f"results/report_{file_path.split('/')[-1].split('.')[0]}.txt"
                
                report_header = f"\n{'='*50}\n"
                report_header += f"Results for lecturer topics: {lecturer_topics}\n"
                report_header += f"Threshold: {threshold}\n"
                report_header += f"{'='*50}\n"
                
                with open(report_filename, mode="a", encoding="utf-8") as report_file:
                    report_file.write(report_header)
                    report_file.write(f"Accuracy: {metrics['accuracy']:.3f}\n")
                    report_file.write("Classification Report:\n")
                    report_file.write(report)
                    report_file.write(f"\n{'-'*50}\n")

                # Store results in a CSV file
                results_df = pd.DataFrame({
                    "sentence": df["sentence"],
                    "predicted_positive_topics": predicted_positive_topics[sentiment_threshold][threshold],
                    "predicted_negative_topics": predicted_negative_topics[sentiment_threshold][threshold],
                    "true_positive_topics": true_positive_topics,
                    "true_negative_topics": true_negative_topics
                })
                results_df.to_csv(f"results/results_{file_path.split('/')[-1].split('.')[0]}_{lecturer_topics}_threshold_{threshold}_{sentiment_threshold}.csv", index=False)

                # Print the predicted topics for each sentence
                # Print the true topics for each sentence
                # for i, row in df.iterrows():
                #     student_text = row["sentence"]
                #     print(f"Sentence: {student_text}")
                #     print(f"Predicted Positive Topics: {predicted_positive_topics[i]}")
                #     print(f"Predicted Negative Topics: {predicted_negative_topics[i]}")
                #     print(f"True Positive Topics: {true_positive_topics[i]}")
                #     print(f"True Negative Topics: {true_negative_topics[i]}")
                #     print("-" * 50)

def create_summary_table(file_path, lecturer_topics, threshold, sentiment_threshold, metrics):
    summary_file = f"results/summary_{file_path.split('/')[-1].split('.')[0]}.csv"
    
    new_row = pd.DataFrame({
        'Lecture Topics': [lecturer_topics],
        'Threshold': [threshold],
        'Sentiment Threshold': [sentiment_threshold],
        'Accuracy': [f"{metrics['accuracy']:.3f}"],
        'F1_Positive': [f"{metrics['f1_positive']:.3f}"],
        'F1_Negative': [f"{metrics['f1_negative']:.3f}"],
        'Precision_Positive': [f"{metrics['precision_positive']:.3f}"],
        'Precision_Negative': [f"{metrics['precision_negative']:.3f}"],
        'Recall_Positive': [f"{metrics['recall_positive']:.3f}"],
        'Recall_Negative': [f"{metrics['recall_negative']:.3f}"],
        'Topic_Coverage': [f"{metrics['topic_coverage']:.3f}"],
        'Irrelevant_Rate': [f"{metrics['irrelevant_prediction_rate']:.3f}"]
    })
    
    try:
        summary_df = pd.read_csv(summary_file)
        summary_df = pd.concat([summary_df, new_row], ignore_index=True)
    except FileNotFoundError:
        summary_df = new_row
    
    summary_df.to_csv(summary_file, index=False)

# Process each CSV file

for lecturer_topics in topic_files:
    for sentence_file in sentence_files:
        evaluate_predict_match(sentence_file, lecturer_topics, thresholds, sentiment_thresholds)
                

Results for data/claude_sentences.csv and data/filtered_combined_topics_threshold_0.5.csv at threshold (0.3, 0.3):
Accuracy: 0.218
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.22      0.07      0.11      1004
Positive Interest       0.22      0.49      0.30       525

         accuracy                           0.22      1529
        macro avg       0.22      0.28      0.21      1529
     weighted avg       0.22      0.22      0.18      1529

--------------------------------------------------
Results for data/claude_sentences.csv and data/filtered_combined_topics_threshold_0.5.csv at threshold (0.4, 0.3):
Accuracy: 0.232
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.12      0.06      0.08       678
Positive Interest       0.27      0.46      0.34       525

         accuracy                           0.23      1203
        macro avg       0.20      0.26      0.