In [2]:
# !pip install sentence-transformers pandas scikit-learn

from sentence_transformers import SentenceTransformer, util
import pandas as pd
import random
from sklearn.metrics import accuracy_score, classification_report
import csv

# Define initial topic list
initial_topics = [
    "Machine Learning", "Deep Learning", "Artificial Intelligence", "Cybersecurity",
    "Mobile App Development", "Web Development", "IoT", "Internet of Things",
    "Blockchain", "Data Science", "Big Data", "Natural Language Processing", "NLP",
    "Robotics", "Computer Vision", "Cloud Computing", "Edge Computing", "Quantum Computing",
    "Augmented Reality", "Virtual Reality", "Game Development", "Software Engineering",
    "Embedded Systems", "Autonomous Vehicles", "Digital Marketing", "E-Commerce",
    "Social Media Analytics", "Financial Technology", "Cryptography", "Network Security",
    "Ethical Hacking", "DevOps", "Agile Methodologies", "Computer Networks",
    "Distributed Systems", "High-Performance Computing", "Data Visualization",
    "Information Retrieval", "Multimedia Processing", "Cyber-Physical Systems",
    "Explainable AI", "Federated Learning", "Reinforcement Learning", "Predictive Analytics",
    "Knowledge Graphs", "Semantic Web", "Ontology Engineering"
]

# Read topics from the CSV file and append them to the topic list
with open("data\\staff_profiles.csv", mode="r", encoding="utf-8") as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        # Parse the list of topics if present
        for field in ["research_interests", "teaching_areas", "courses_taught"]:
            if row.get(field, ""):
                try:
                    # Convert the string representation of a list into an actual list
                    topic_list = eval(row.get(field, "[]"))
                    if isinstance(topic_list, list):
                        # Exclude "N/A" and add valid topics to the list
                        initial_topics.extend([topic for topic in topic_list if topic and topic != "N/A"])
                except Exception as e:
                    print(f"Error parsing topics in field '{field}': {e}")

topics = list(set(initial_topics))

model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings for all topics
topic_embeddings = model.encode(topics, convert_to_tensor=True)

# Identify and remove similar topics
threshold = 0.7
unique_topics = []
for i, topic in enumerate(topics):
    is_similar = False
    for j, unique_topic in enumerate(unique_topics):
        sim = util.cos_sim(topic_embeddings[i], model.encode(unique_topic, convert_to_tensor=True))
        if sim > threshold:
            is_similar = True
            break
    if not is_similar:
        unique_topics.append(topic)

# Replace the original topics list with unique topics
topics = unique_topics

# Print the filtered topics
print("Filtered Topics:")
print(topics)

# Save the filtered topics to a CSV file
with open("data\\filtered_topics.csv", mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Topic"])  # Write the header
    for topic in topics:
        writer.writerow([topic])  # Write each topic as a row

print("\nFiltered topics have been saved to 'data\\filtered_topics.csv'.")

student_templates = [
    "I am passionate about {topic} and want to explore more in this field.",
    "I am looking to enhance my skills in {topic}.",
    "I want to work on projects related to {topic}.",
    "I am interested in learning about {topic} and its applications.",
    "I want to be inspired by {topic} and its potential to solve real-world problems.",
    "I am eager to build a career in {topic}.",
    "I am fascinated by the opportunities in {topic} and want to contribute to its growth.",
    "I want to collaborate on research in {topic}.",
    "I want to gain hands-on experience in {topic}.",
    "I am excited to explore the latest advancements in {topic}.",
    "I want to contribute to innovative solutions in {topic}.",
    "I am curious about the challenges in {topic} and want to address them.",
    "I want to deepen my understanding of {topic} through practical experience.",
    "I am motivated to learn about {topic} and its impact on society.",
    "I want to develop cutting-edge solutions in {topic}.",
    "I am inspired by the transformative potential of {topic}.",
    "I want to apply my knowledge of {topic} to real-world scenarios.",
    "I am committed to advancing my expertise in {topic}.",
    "I want to explore interdisciplinary applications of {topic}.",
    "I am driven to innovate in the field of {topic}.",
    "I want to understand the theoretical and practical aspects of {topic}.",
    "I am excited to contribute to groundbreaking research in {topic}.",
    "I want to explore how {topic} can address global challenges.",
    "I am passionate about using {topic} to create meaningful change.",
    "I want to stay updated on the trends and developments in {topic}.",
    "I am eager to participate in projects that involve {topic}.",
    "I want to leverage {topic} to solve complex problems.",
    "I am enthusiastic about the future possibilities in {topic}.",
    "I want to learn how {topic} can be applied across industries."
]

Filtered Topics:
['Statistical methods in data science', 'Deep Learning for Data Science', 'Machine Learning', 'Artificial Intelligent and Big Data Analytics', 'Deep and Machine Learning', 'Human-Computer Interaction / Future Interaction Design', 'Programming for Data Science', 'Application tools for the Learning Disabilities Children', 'Technology intervention for the Learning Disabilities and Neurodivergent', 'Project Management', 'Visible Light Positioning', 'Multimedia Computing', 'Computer Networks', 'Energy Informatics', 'Electronic Testing – Instrumentation and Measurement', 'Sustainability', 'DevOps', 'Empirical Software Engineering', 'Malware analysis', 'Information Technology Operation Management', 'Statistics', 'Algorithm Analysis and Design', 'Electronic Design and Testing', 'Federated Learning', 'Fluid Drive System', 'IoT, its applications, and IoT Security', 'Computer vision and image processing', 'Augmented Reality', 'Information Systems and Design', 'Data Communication 

In [14]:
# Generate dataset with noise and multiple sentences per student
random.seed(42)
data = []

for _ in range(1000):  # Generate 1000 samples
    num_sentences = random.randint(2, 5)  # Each student has 2 to 5 sentences
    topic_true = random.sample(topics, num_sentences) # Randomly selects true topic
    student_topic_preference = topic_true.copy()

    # Generate multiple sentences for the same student
    
    student_sentences = [
        random.choice(student_templates).format(topic=student_topic_preference.pop(0))
        for _ in range(num_sentences)
    ]
    student_text = " ".join(student_sentences)  # Combine sentences into a single string

    # Introduce noise in 10% of the data
    if random.random() < 0.1:
        # Add noise by including irrelevant topics or mismatched topics
        lecturer_topics = random.sample(topics, 3)  # Random unrelated topics
        match = 0  # Mark as no match
    else:
        # 50% match
        if random.random() < 0.5:
            lecturer_topics = random.sample(topics, 2) + [topic_true]
            match = 1
        else:
            lecturer_topics = random.sample([t for t in topics if t != topic_true], 3)
            match = 0

    # Append the data
    data.append({
        "student_preference": student_text,
        "lecturer_topics": lecturer_topics,
        "match": match
    })

df = pd.DataFrame(data)

# Save the dataset to a CSV file
df.to_csv("data\\data_all_topics.csv", index=False)

# Print a sample of the dataset
print("\nSample dataset using all topics:")
print(df.sample(5))


Sample dataset using all topics:
                                    student_preference  \
779  I am fascinated by the opportunities in Commun...   
834  I want to explore how Device-to-Device Communi...   
406  I want to explore interdisciplinary applicatio...   
438  I am excited to contribute to groundbreaking r...   
193  I want to understand the theoretical and pract...   

                                       lecturer_topics  match  
779  [IoT, Computer System Engineering, [Communicat...      1  
834  [Intelligent transportation systems, Data Mini...      0  
406  [Identification Technology, Evolutionary Compu...      0  
438  [E-learning, Theory of Automata, Dynamics and ...      0  
193  [Wireless sensor networks, Internet of Things,...      1  


In [3]:
# Generate dataset using only initial topics
random.seed(42)
data_initial_topics = []

for _ in range(1000):  # 10000 samples
    topic_true = random.choice(initial_topics)  # Randomly select a true topic from the initial topic list
    student_text = random.choice(student_templates).format(topic=topic_true)  # Insert the topic into the template
    
    # 50% match
    if random.random() < 0.5:
        lecturer_topics = random.sample(initial_topics, 2) + [topic_true]  # Ensure lecturer topics are from initial_topics
        match = 1
    else:
        lecturer_topics = random.sample([t for t in initial_topics if t != topic_true], 3)  # Exclude the true topic
        match = 0

    data_initial_topics.append({
        "student_preference": student_text,
        "lecturer_topics": lecturer_topics,
        "match": match
    })

# Convert to DataFrame
df_initial_topics = pd.DataFrame(data_initial_topics)

# Save the dataset to a CSV file (optional)
df_initial_topics.to_csv("data\\data_initial_topics.csv", index=False)

# Print a sample of the dataset
print("\nSample dataset using initial topics:")
print(df_initial_topics.sample(5))


Sample dataset using initial topics:
                                    student_preference  \
798  I am fascino collaborate on research in Data S...   
523  I am excited to explore the latest advancement...   
378  I want to gain hands-on experience in Social M...   
499  I want to contribute to innovative solutions i...   
793  I want to gain hands-on experience in Statisti...   

                                       lecturer_topics  match  
798  [Programming Principles, Nanomaterial for Ultr...      0  
523  [Computer Science, Information System Developm...      0  
378  [Machine Learning, e-learning, e-commerce, e-h...      0  
499  [E-Commerce, Data Mining/ Data Science, Image ...      1  
793  [Quantum Computing, STEM Education, Big Data A...      0  


In [16]:
# Load Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to compute match
def predict_match(student_text, lecturer_topics, threshold=0.5):
    student_emb = model.encode(student_text, convert_to_tensor=True)
    topic_embs = model.encode(lecturer_topics, convert_to_tensor=True)
    
    similarities = util.cos_sim(student_emb, topic_embs)
    max_sim = float(similarities.max())
    
    return 1 if max_sim > threshold else 0

# Apply predict_match to the dataset using all topics
df["predicted_match"] = df.apply(
    lambda row: predict_match(row["student_preference"], row["lecturer_topics"]), axis=1
)

# Evaluate accuracy for the dataset using all topics
print("Accuracy for dataset using all topics:", accuracy_score(df["match"], df["predicted_match"]))
print("\nClassification Report for dataset using all topics:\n", classification_report(df["match"], df["predicted_match"]))

# # Apply predict_match to the dataset using only initial topics
# df_initial_topics["predicted_match"] = df_initial_topics.apply(
#     lambda row: predict_match(row["student_preference"], row["lecturer_topics"]), axis=1
# )

# # Evaluate accuracy for the dataset using only initial topics
# print("\nAccuracy for dataset using only initial topics:", accuracy_score(df_initial_topics["match"], df_initial_topics["predicted_match"]))
# print("\nClassification Report for dataset using only initial topics:\n", classification_report(df_initial_topics["match"], df_initial_topics["predicted_match"]))

Accuracy for dataset using all topics: 0.913

Classification Report for dataset using all topics:
               precision    recall  f1-score   support

           0       0.88      0.97      0.93       554
           1       0.96      0.84      0.90       446

    accuracy                           0.91      1000
   macro avg       0.92      0.91      0.91      1000
weighted avg       0.92      0.91      0.91      1000

