Generate a list of topics that will be used as reference for labeling and matching

In [1]:
import pandas as pd
import random
import csv
import re
import string

from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util

# Define initial topic list
initial_topics = [
    "Machine Learning", "Deep Learning", "Artificial Intelligence", "Cybersecurity",
    "Mobile App Development", "Web Development", "IoT, Internet of Things",
    "Cloud Computing", "Data Science", "Big Data", "Blockchain", "Augmented Reality",
    "Virtual Reality", "Natural Language Processing", "Computer Vision", "Robotics",
    "Networking", "Database Management", "Computer Security",
]

def clean_text(s, comma=False):
    s = s.lower()
    s = s.split()
    s = " ".join(s)
    # Add a comma if there is any '\' or '/' in the string
    if comma:
        s = re.sub(r'\\', ',', s)
        s = re.sub(r'/', ',', s)
    else:
        s = re.sub(f'[{re.escape(string.punctuation)}]', '', s)
    return s

def remove_stop_words(s):
    stop_words = set(stopwords.words('english'))
    s = s.split()
    s = [w for w in s if not w.lower() in stop_words]
    s = " ".join(s)
    return s

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Read topics from the CSV file and append them to the topic list
with open("data\\supervisors_list.csv", mode="r", encoding="utf-8-sig") as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        # Parse the list of topics if present
        for field in ["Expertise Area 1", "Expertise Area 2", "Expertise Area 3"]:
            if row.get(field, ""):
                try:
                    # Convert the string representation of a list into an actual list
                    topic_list = eval(row.get(field, "[]"))
                    if isinstance(topic_list, list):
                        # Exclude "N/A" and add valid topics to the list
                        initial_topics.extend([topic for topic in topic_list if topic and topic != "N/A"])
                except Exception as e:
                    print(f"Error parsing topics in field '{field}': {e}")

new_topics = []

# Read topics from the staff_proflies CSV file and append them to a new list
with open("data\\staff_profiles.csv", mode="r", encoding="utf-8-sig") as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        # Parse the list of topics if present
        for field in ["research_interests", "teaching_areas", "courses_taught"]:
            if row.get(field, ""):
                try:
                    # Convert the string representation of a list into an actual list
                    topic_list = eval(row.get(field, "[]"))
                    if isinstance(topic_list, list):
                        # Exclude "N/A" and add valid topics to the list
                        new_topics.extend([topic for topic in topic_list if topic and topic != "N/A"])
                except Exception as e:
                    print(f"Error parsing topics in field '{field}': {e}")

topics = list(set(initial_topics))
new_topics = list(set(new_topics))

model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings for all topics
topic_embeddings = model.encode(topics, convert_to_tensor=True)
new_topics_embeddings = model.encode(new_topics, convert_to_tensor=True)

# Identify and remove similar topics
def find_similar_topics(embeddings, selected_topics, threshold=0.7):
    unique_topics = []
    for i, topic in enumerate(selected_topics):
        is_similar = False
        for j, unique_topic in enumerate(unique_topics):
            sim = util.cos_sim(embeddings[i], embeddings[j])
            if sim > threshold:
                is_similar = True
                break
        if not is_similar:
            unique_topics.append(topic)
    return unique_topics

# Create a loop where the threshold increments by 0.1 for each iteration
# In each loop, use find_similar_topics and save the results in a csv file with the threshold in the filename
thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
for threshold in thresholds:
    filtered_topics = find_similar_topics(topic_embeddings, topics, threshold)
    
    # Save the filtered topics to a CSV file
    filename = f"data\\filtered_topics_threshold_{threshold}.csv"
    with open(filename, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Topic"])  # Write the header
        for topic in filtered_topics:
            writer.writerow([clean_text(topic, comma=True)])  # Write each topic as a row

    filtered_new_topics = find_similar_topics(new_topics_embeddings, new_topics, threshold)
    filtered_combined_topics = list(set(filtered_topics + filtered_new_topics))
    filtered_combined_topics = find_similar_topics(model.encode(filtered_combined_topics, convert_to_tensor=True), filtered_combined_topics, threshold)

    # Save the combined filtered topics to a CSV file
    filename_combined = f"data\\filtered_combined_topics_threshold_{threshold}.csv"
    with open(filename_combined, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Topic"])
        for topic in filtered_combined_topics:
            writer.writerow([clean_text(topic, comma=True)])

    print(f"\nFiltered topics with threshold {threshold} have been saved")


Filtered topics with threshold 0.5 have been saved

Filtered topics with threshold 0.6 have been saved

Filtered topics with threshold 0.7 have been saved

Filtered topics with threshold 0.8 have been saved

Filtered topics with threshold 0.9 have been saved


Create fucntions to label students based on the list of topcis 

In [7]:
import spacy
import nltk
import re
import string

from transformers import pipeline
from keybert import KeyBERT


# Load sentiment analysis model
sentiment_analyzer = pipeline("sentiment-analysis", device=0)  # Use GPU if available

model = SentenceTransformer('all-MiniLM-L6-v2')

nlp = spacy.load("en_core_web_sm")

kw_model = KeyBERT(model)

def extract_candidate_topics(sentence, top_n=1):
    # Use KeyBERT to extract top N keywords/phrases from the sentence
    keywords = kw_model.extract_keywords(sentence, top_n=top_n, stop_words='english')
    # Return keywords as a list of strings
    return [kw[0] for kw in keywords]

def predict_match(student_text, lecturer_topics, threshold=0.5):
    student_sentences = student_text.split(". ")  # Split student text into sentences
    positive_topics = []
    negative_topics = []

    for sentence in student_sentences:
        clean_sentence = clean_text(sentence)
        student_emb = model.encode(clean_sentence, convert_to_tensor=True)
        topic_embs = model.encode(lecturer_topics, convert_to_tensor=True)
        
        # Compute similarities between the sentence and lecturer topics
        similarities = util.cos_sim(student_emb, topic_embs)
        max_sim = float(similarities.max())  # Get the maximum similarity score

        # Check if there is any similarity in existing topics
        # If not, extract a new topic
        # and classify it as positive or negative
        if max_sim < threshold:
            new_topics = extract_candidate_topics(clean_sentence)
            for kw in new_topics:
                kw_emb = model.encode(kw, convert_to_tensor=True)
                similarities = util.cos_sim(kw_emb, topic_embs)
                max_sim_kw = float(similarities.max())  # Get the maximum similarity score
                if max_sim_kw < threshold:
                    lecturer_topics.append(kw)
                    # print(f"New topic added: {kw}")
                    # print(f"Sentence: {sentence}")
                else:
                    topic = lecturer_topics[similarities.argmax()]
                    sentiment = sentiment_analyzer(clean_sentence)[0]
                    if sentiment["label"] == "NEGATIVE":
                        negative_topics.append(clean_text(topic))
                    else:
                        positive_topics.append(clean_text(topic))
            # sentiment = sentiment_analyzer(clean_sentence)[0]
            # if sentiment["label"] == "NEGATIVE":
            #     negative_topics.append(new_topic)
            # else:
            #     positive_topics.append(new_topic)
            # continue

        # Match all relevant topics in the sentence
        for idx, sim in enumerate(similarities[0]):
            if sim > threshold:
                topic = lecturer_topics[idx]

                # Use sentiment analysis to classify the sentence
                sentiment = sentiment_analyzer(clean_sentence)[0]
                if sentiment["label"] == "NEGATIVE":
                    negative_topics.append(clean_text(topic))
                else:
                    positive_topics.append(clean_text(topic))

    return {
        "positive_topics": list(set(positive_topics)),  # Remove duplicates
        "negative_topics": list(set(negative_topics))   # Remove duplicates
    }

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


Utilise LLM generated sentences to test and determine accuracy of model, hyper-tuning begins here.

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# List of CSV files to process
sentence_files = [
    "data/cgpt_sentences.csv",
    "data/claude_sentences.csv",
    "data/deepseek_sentences.csv",
    "data/gemini_sentences.csv",
    "data/linkedin_sentences.csv"
]
topic_files = [
    "data/filtered_combined_topics_threshold_0.5.csv",
    "data/filtered_combined_topics_threshold_0.6.csv", 
    "data/filtered_combined_topics_threshold_0.7.csv",
    "data/filtered_combined_topics_threshold_0.8.csv",
    "data/filtered_combined_topics_threshold_0.9.csv",
]

# Threshold for topic matching
thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]

# Lecturer topics
lecturer_topics = open("data/filtered_topics.csv", mode="r", encoding="utf-8").read().splitlines()[1:]  # Skip header

# Function to evaluate accuracy and generate classification report
def evaluate_predict_match(file_path, lecturer_topics, threshold):
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Ensure the CSV has the required columns
    if "sentence" not in df.columns or "positive_topic" not in df.columns or "negative_topic" not in df.columns:
        print(f"Error: {file_path} does not contain the required columns.")
        return

    # Extract true labels for positive and negative topics
    true_positive_topics = df["positive_topic"].apply(lambda x: x.split(",") if pd.notna(x) else [])
    true_negative_topics = df["negative_topic"].apply(lambda x: x.split(",") if pd.notna(x) else [])

    # Predicted labels
    predicted_positive_topics = []
    predicted_negative_topics = []

    for _, row in df.iterrows():
        student_text = row["sentence"]

        # Use the predict_match function to get positive and negative topics
        result = predict_match(student_text, open(lecturer_topics, mode="r", encoding="utf-8").read().splitlines(), threshold)

        # Append predicted topics
        predicted_positive_topics.append(result["positive_topics"])
        predicted_negative_topics.append(result["negative_topics"])

    # Flatten true and predicted labels for evaluation
    true_labels = []
    predicted_labels = []

    for i in range(len(df)):
        # Positive topics
        for topic in true_positive_topics[i]:
            true_labels.append(1)
            found = False
            for pred_topic in predicted_positive_topics[i]:
                pred_topic_emb = model.encode(pred_topic, convert_to_tensor=True)
                true_topic_emb = model.encode(topic, convert_to_tensor=True)
                similarity = util.cos_sim(pred_topic_emb, true_topic_emb)
                if similarity > 0.8:
                    predicted_labels.append(1)
                    found = True
                    break
            if not found:
                predicted_labels.append(0)

        # Negative topics
        for topic in true_negative_topics[i]:
            true_labels.append(0)
            found = False
            for pred_topic in predicted_negative_topics[i]:
                pred_topic_emb = model.encode(pred_topic, convert_to_tensor=True)
                true_topic_emb = model.encode(topic, convert_to_tensor=True)
                similarity = util.cos_sim(pred_topic_emb, true_topic_emb)
                if similarity > 0.8:
                    predicted_labels.append(0)
                    found = True
                    break
            if not found:
                predicted_labels.append(1)

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predicted_labels)

    # Generate classification report
    report = classification_report(true_labels, predicted_labels, target_names=["Negative Interest", "Positive Interest"])

    # Print results
    print(f"Results for {file_path} and {lecturer_topics} at threshold ({threshold}):")
    print(f"Accuracy: {accuracy:.3f}")
    print("Classification Report:")
    print(report)
    print("-" * 50)

    # store report in a text file
    with open(f"results/report_{file_path.split('/')[-1].split('.')[0]}_{lecturer_topics}_threshold_{threshold}.txt", "w") as report_file:
        report_file.write(f"Results for {file_path} and {lecturer_topics} at threshold ({threshold}):\n")
        report_file.write(f"Accuracy: {accuracy:.3f}\n")
        report_file.write("Classification Report:\n")
        report_file.write(report)
        report_file.write("-" * 50 + "\n")

    # Store results in a CSV file
    results_df = pd.DataFrame({
        "sentence": df["sentence"],
        "predicted_positive_topics": predicted_positive_topics,
        "predicted_negative_topics": predicted_negative_topics,
        "true_positive_topics": true_positive_topics,
        "true_negative_topics": true_negative_topics
    })
    results_df.to_csv(f"results/results_{file_path.split('/')[-1].split('.')[0]}_{lecturer_topics}_threshold_{threshold}.csv", index=False)

    # Print the predicted topics for each sentence
    # Print the true topics for each sentence
    # for i, row in df.iterrows():
    #     student_text = row["sentence"]
    #     print(f"Sentence: {student_text}")
    #     print(f"Predicted Positive Topics: {predicted_positive_topics[i]}")
    #     print(f"Predicted Negative Topics: {predicted_negative_topics[i]}")
    #     print(f"True Positive Topics: {true_positive_topics[i]}")
    #     print(f"True Negative Topics: {true_negative_topics[i]}")
    #     print("-" * 50)

# Process each CSV file
for threshold in thresholds:
    for lecturer_topics in topic_files:
        for sentence_file in sentence_files:
            evaluate_predict_match(sentence_file, lecturer_topics,threshold)

Results for data/cgpt_sentences.csv and data/filtered_combined_topics_threshold_0.5.csv at threshold (0.5):
Accuracy: 0.445
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.43      0.35      0.39       200
Positive Interest       0.45      0.54      0.49       200

         accuracy                           0.45       400
        macro avg       0.44      0.45      0.44       400
     weighted avg       0.44      0.45      0.44       400

--------------------------------------------------
Results for data/claude_sentences.csv and data/filtered_combined_topics_threshold_0.5.csv at threshold (0.5):
Accuracy: 0.033
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.00      0.00      0.00       264
Positive Interest       0.09      0.05      0.06       525

         accuracy                           0.03       789
        macro avg       0.04      0.02      0.03       789

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results for data/linkedin_sentences.csv and data/filtered_combined_topics_threshold_0.5.csv at threshold (0.5):
Accuracy: 0.034
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.00      0.00      0.00         0
Positive Interest       1.00      0.03      0.07       118

         accuracy                           0.03       118
        macro avg       0.50      0.02      0.03       118
     weighted avg       1.00      0.03      0.07       118

--------------------------------------------------
Results for data/cgpt_sentences.csv and data/filtered_combined_topics_threshold_0.6.csv at threshold (0.5):
Accuracy: 0.625
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.61      0.71      0.66       200
Positive Interest       0.65      0.54      0.59       200

         accuracy                           0.62       400
        macro avg       0.63      0.62      0.62       4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results for data/linkedin_sentences.csv and data/filtered_combined_topics_threshold_0.6.csv at threshold (0.5):
Accuracy: 0.059
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.00      0.00      0.00         0
Positive Interest       1.00      0.06      0.11       118

         accuracy                           0.06       118
        macro avg       0.50      0.03      0.06       118
     weighted avg       1.00      0.06      0.11       118

--------------------------------------------------
Results for data/cgpt_sentences.csv and data/filtered_combined_topics_threshold_0.7.csv at threshold (0.5):
Accuracy: 0.490
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.49      0.71      0.58       200
Positive Interest       0.48      0.27      0.34       200

         accuracy                           0.49       400
        macro avg       0.49      0.49      0.46       4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results for data/linkedin_sentences.csv and data/filtered_combined_topics_threshold_0.7.csv at threshold (0.5):
Accuracy: 0.110
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.00      0.00      0.00         0
Positive Interest       1.00      0.11      0.20       118

         accuracy                           0.11       118
        macro avg       0.50      0.06      0.10       118
     weighted avg       1.00      0.11      0.20       118

--------------------------------------------------
Results for data/cgpt_sentences.csv and data/filtered_combined_topics_threshold_0.8.csv at threshold (0.5):
Accuracy: 0.672
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.66      0.71      0.69       200
Positive Interest       0.69      0.63      0.66       200

         accuracy                           0.67       400
        macro avg       0.67      0.67      0.67       4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results for data/linkedin_sentences.csv and data/filtered_combined_topics_threshold_0.8.csv at threshold (0.5):
Accuracy: 0.144
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.00      0.00      0.00         0
Positive Interest       1.00      0.14      0.25       118

         accuracy                           0.14       118
        macro avg       0.50      0.07      0.13       118
     weighted avg       1.00      0.14      0.25       118

--------------------------------------------------
Results for data/cgpt_sentences.csv and data/filtered_combined_topics_threshold_0.9.csv at threshold (0.5):
Accuracy: 0.833
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.93      0.71      0.81       200
Positive Interest       0.77      0.95      0.85       200

         accuracy                           0.83       400
        macro avg       0.85      0.83      0.83       4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results for data/linkedin_sentences.csv and data/filtered_combined_topics_threshold_0.9.csv at threshold (0.5):
Accuracy: 0.161
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.00      0.00      0.00         0
Positive Interest       1.00      0.16      0.28       118

         accuracy                           0.16       118
        macro avg       0.50      0.08      0.14       118
     weighted avg       1.00      0.16      0.28       118

--------------------------------------------------
Results for data/cgpt_sentences.csv and data/filtered_combined_topics_threshold_0.5.csv at threshold (0.6):
Accuracy: 0.203
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.19      0.19      0.19       200
Positive Interest       0.21      0.21      0.21       200

         accuracy                           0.20       400
        macro avg       0.20      0.20      0.20       4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results for data/linkedin_sentences.csv and data/filtered_combined_topics_threshold_0.5.csv at threshold (0.6):
Accuracy: 0.025
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.00      0.00      0.00         0
Positive Interest       1.00      0.03      0.05       118

         accuracy                           0.03       118
        macro avg       0.50      0.01      0.02       118
     weighted avg       1.00      0.03      0.05       118

--------------------------------------------------
Results for data/cgpt_sentences.csv and data/filtered_combined_topics_threshold_0.6.csv at threshold (0.6):
Accuracy: 0.403
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.43      0.59      0.50       200
Positive Interest       0.34      0.21      0.26       200

         accuracy                           0.40       400
        macro avg       0.39      0.40      0.38       4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results for data/linkedin_sentences.csv and data/filtered_combined_topics_threshold_0.6.csv at threshold (0.6):
Accuracy: 0.034
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.00      0.00      0.00         0
Positive Interest       1.00      0.03      0.07       118

         accuracy                           0.03       118
        macro avg       0.50      0.02      0.03       118
     weighted avg       1.00      0.03      0.07       118

--------------------------------------------------
Results for data/cgpt_sentences.csv and data/filtered_combined_topics_threshold_0.7.csv at threshold (0.6):
Accuracy: 0.388
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.42      0.59      0.49       200
Positive Interest       0.31      0.18      0.23       200

         accuracy                           0.39       400
        macro avg       0.37      0.39      0.36       4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results for data/linkedin_sentences.csv and data/filtered_combined_topics_threshold_0.7.csv at threshold (0.6):
Accuracy: 0.076
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.00      0.00      0.00         0
Positive Interest       1.00      0.08      0.14       118

         accuracy                           0.08       118
        macro avg       0.50      0.04      0.07       118
     weighted avg       1.00      0.08      0.14       118

--------------------------------------------------
Results for data/cgpt_sentences.csv and data/filtered_combined_topics_threshold_0.8.csv at threshold (0.6):
Accuracy: 0.530
Classification Report:
                   precision    recall  f1-score   support

Negative Interest       0.53      0.59      0.56       200
Positive Interest       0.53      0.47      0.50       200

         accuracy                           0.53       400
        macro avg       0.53      0.53      0.53       4