Generate a list of topics that will be used as reference for labeling and matching

In [13]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import random
from sklearn.metrics import accuracy_score, classification_report
import csv

# Define initial topic list
initial_topics = [
    "Machine Learning", "Deep Learning", "Artificial Intelligence", "Cybersecurity",
    "Mobile App Development", "Web Development", "IoT", "Internet of Things",
    "Blockchain", "Data Science", "Big Data", "Natural Language Processing", "NLP",
    "Robotics", "Computer Vision", "Cloud Computing", "Edge Computing", "Quantum Computing",
    "Augmented Reality", "Virtual Reality", "Game Development", "Software Engineering",
    "Embedded Systems", "Autonomous Vehicles", "Digital Marketing", "E-Commerce",
    "Social Media Analytics", "Financial Technology", "Cryptography", "Network Security",
    "Ethical Hacking", "DevOps", "Agile Methodologies", "Computer Networks",
    "Distributed Systems", "High-Performance Computing", "Data Visualization",
    "Information Retrieval", "Multimedia Processing", "Cyber-Physical Systems",
    "Explainable AI", "Federated Learning", "Reinforcement Learning", "Predictive Analytics",
    "Knowledge Graphs", "Semantic Web", "Ontology Engineering"
]

In [None]:
# Read topics from the CSV file and append them to the topic list
with open("data\\staff_profiles.csv", mode="r", encoding="utf-8") as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        # Parse the list of topics if present
        for field in ["research_interests", "teaching_areas", "courses_taught"]:
            if row.get(field, ""):
                try:
                    # Convert the string representation of a list into an actual list
                    topic_list = eval(row.get(field, "[]"))
                    if isinstance(topic_list, list):
                        # Exclude "N/A" and add valid topics to the list
                        initial_topics.extend([topic for topic in topic_list if topic and topic != "N/A"])
                except Exception as e:
                    print(f"Error parsing topics in field '{field}': {e}")

topics = list(set(initial_topics))

model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings for all topics
topic_embeddings = model.encode(topics, convert_to_tensor=True)

# Identify and remove similar topics
threshold = 0.7
unique_topics = []
for i, topic in enumerate(topics):
    is_similar = False
    for j, unique_topic in enumerate(unique_topics):
        sim = util.cos_sim(topic_embeddings[i], model.encode(unique_topic, convert_to_tensor=True))
        if sim > threshold:
            is_similar = True
            break
    if not is_similar:
        unique_topics.append(topic)

# Replace the original topics list with unique topics
topics = unique_topics

# Print the filtered topics
print("Filtered Topics:")
print(topics)

# Save the filtered topics to a CSV file
with open("data\\filtered_topics.csv", mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Topic"])  # Write the header
    for topic in topics:
        writer.writerow([topic])  # Write each topic as a row

print("\nFiltered topics have been saved to 'data\\filtered_topics.csv'.")

Create a function to label students based on the list of topcis

In [14]:
from transformers import pipeline

# Load sentiment analysis model
sentiment_analyzer = pipeline("sentiment-analysis")

model = SentenceTransformer('all-mpnet-base-v2')

def predict_match(student_text, lecturer_topics, threshold=0.5):
    student_sentences = student_text.split(". ")  # Split student text into sentences
    positive_topics = []
    negative_topics = []

    # Expand negative keywords
    negative_keywords = ["not interested", "dislike", "hate", "avoid", "do not like", "no interest", "don't want", "not want", "uninterested"]

    for sentence in student_sentences:
        student_emb = model.encode(sentence, convert_to_tensor=True)
        topic_embs = model.encode(lecturer_topics, convert_to_tensor=True)
        
        # Compute similarities between the sentence and lecturer topics
        similarities = util.cos_sim(student_emb, topic_embs)
        max_sim_index = int(similarities.argmax())  # Get the index of the most similar topic
        max_sim = float(similarities.max())  # Get the maximum similarity score

        # Skip ambiguous or neutral sentences
        if max_sim < threshold:
            continue

        # Match all relevant topics in the sentence
        for idx, sim in enumerate(similarities[0]):
            if sim > threshold:
                topic = lecturer_topics[idx]

                # Use sentiment analysis to classify the sentence
                sentiment = sentiment_analyzer(sentence)[0]
                if sentiment["label"] == "NEGATIVE":
                    negative_topics.append(topic)
                else:
                    positive_topics.append(topic)
            else:
                # Add new topic if no match is found
                new_topic = sentence.strip()  # Use the sentence as the new topic
                lecturer_topics.append(new_topic)
                positive_topics.append(new_topic)
                print(f"New topic added: {new_topic}")

    return {
        "positive_topics": list(set(positive_topics)),  # Remove duplicates
        "negative_topics": list(set(negative_topics))   # Remove duplicates
    }

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Utilise LLM generated sentences to test and determine accuracy of model, hyper-tuning begins here.

In [15]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# List of CSV files to process
csv_files = [
    "data/cgpt_sentences.csv",
    "data/claude_sentences.csv",
    "data/deepseek_sentences.csv",
    "data/gemini_sentences.csv"
]

# Lecturer topics
lecturer_topics = open("data/filtered_topics.csv", mode="r", encoding="utf-8").read().splitlines()[1:]  # Skip header

# Function to evaluate accuracy and generate classification report
def evaluate_predict_match(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Ensure the CSV has the required columns
    if "sentence" not in df.columns or "positive_topic" not in df.columns or "negative_topic" not in df.columns:
        print(f"Error: {file_path} does not contain the required columns.")
        return

    # Extract true labels for positive and negative topics
    true_positive_topics = df["positive_topic"].apply(lambda x: x.split(",") if pd.notna(x) else [])
    true_negative_topics = df["negative_topic"].apply(lambda x: x.split(",") if pd.notna(x) else [])

    # Predicted labels
    predicted_positive_topics = []
    predicted_negative_topics = []

    for _, row in df.iterrows():
        student_text = row["sentence"]

        # Use the predict_match function to get positive and negative topics
        result = predict_match(student_text, lecturer_topics)

        # Append predicted topics
        predicted_positive_topics.append(result["positive_topics"])
        predicted_negative_topics.append(result["negative_topics"])

    # Flatten true and predicted labels for evaluation
    true_labels = []
    predicted_labels = []

    for i in range(len(df)):
        # Positive topics
        for topic in true_positive_topics[i]:
            true_labels.append(1)  # 1 for positive interest
            predicted_labels.append(1 if topic in predicted_positive_topics[i] else 0)

        # Negative topics
        for topic in true_negative_topics[i]:
            true_labels.append(0)  # 0 for negative interest
            predicted_labels.append(0 if topic in predicted_negative_topics[i] else 1)

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predicted_labels)

    # Generate classification report
    report = classification_report(true_labels, predicted_labels, target_names=["Negative Interest", "Positive Interest"])

    # Print results
    print(f"Results for {file_path}:")
    print(f"Accuracy: {accuracy:.3f}")
    print("Classification Report:")
    print(report)
    print("-" * 50)

# Process each CSV file
for csv_file in csv_files:
    evaluate_predict_match(csv_file)

New topic added: I'm not particularly interested in Enterprise Systems and e-Commerce Application and would rather focus on other topics.
New topic added: I'm not particularly interested in Enterprise Systems and e-Commerce Application and would rather focus on other topics.
New topic added: I'm not particularly interested in Enterprise Systems and e-Commerce Application and would rather focus on other topics.
New topic added: I'm not particularly interested in Enterprise Systems and e-Commerce Application and would rather focus on other topics.
New topic added: I'm not particularly interested in Enterprise Systems and e-Commerce Application and would rather focus on other topics.
New topic added: I'm not particularly interested in Enterprise Systems and e-Commerce Application and would rather focus on other topics.
New topic added: I'm not particularly interested in Enterprise Systems and e-Commerce Application and would rather focus on other topics.
New topic added: I'm not particula

KeyboardInterrupt: 