In [9]:
# !pip install sentence-transformers pandas scikit-learn

from sentence_transformers import SentenceTransformer, util
import pandas as pd
import random
from sklearn.metrics import accuracy_score, classification_report
import csv

# Define initial topic list
initial_topics = [
    "Machine Learning", "Deep Learning", "Artificial Intelligence", "Cybersecurity",
    "Mobile App Development", "Web Development", "IoT", "Internet of Things",
    "Healthcare", "Blockchain", "Data Science", "Big Data",
    "Natural Language Processing", "NLP", "Robotics", "Computer Vision",
    "Cloud Computing", "Edge Computing", "Quantum Computing", "Augmented Reality",
    "Virtual Reality", "Game Development", "Software Engineering", "Embedded Systems",
    "Autonomous Vehicles", "Renewable Energy", "Sustainable Development",
    "Bioinformatics", "Genomics", "Medical Imaging", "Human-Computer Interaction",
    "Digital Marketing", "E-Commerce", "Social Media Analytics", "Financial Technology",
    "Cryptography", "Network Security", "Ethical Hacking", "DevOps",
    "Agile Methodologies", "Natural Resource Management", "Environmental Science",
    "Geospatial Analysis", "Urban Planning", "Supply Chain Management",
    "Operations Research", "Industrial Automation", "Control Systems",
    "Signal Processing", "Speech Recognition", "Sentiment Analysis",
    "Recommender Systems", "Knowledge Graphs", "Semantic Web", "Ontology Engineering",
    "Educational Technology", "Digital Transformation", "Smart Cities",
    "Wearable Technology", "Human-Robot Interaction", "Predictive Analytics",
    "Explainable AI", "Federated Learning", "Reinforcement Learning",
    "Computer Networks", "Distributed Systems", "High-Performance Computing",
    "Data Visualization", "Information Retrieval", "Multimedia Processing",
    "Cyber-Physical Systems", "Energy Optimization", "Additive Manufacturing",
    "3D Printing", "Nanotechnology", "Material Science", "Astrophysics",
    "Space Exploration", "Climate Change", "Agricultural Technology",
    "Food Security", "Behavioral Science", "Cognitive Science", "Psychometrics",
    "Public Health", "Epidemiology", "Pharmaceutical Technology", "Sports Analytics",
    "Transportation Systems", "Aviation Technology", "Marine Engineering",
    "Construction Technology", "Structural Engineering", "Chemical Engineering",
    "Thermodynamics", "Fluid Mechanics", "Heat Transfer", "Process Optimization"
]

# Read topics from the CSV file and append them to the topic list
with open("data\\staff_profiles.csv", mode="r", encoding="utf-8") as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        # Parse the list of topics if present
        for field in ["research_interests", "teaching_areas", "courses_taught"]:
            if row.get(field, ""):
                try:
                    # Convert the string representation of a list into an actual list
                    topic_list = eval(row.get(field, "[]"))
                    if isinstance(topic_list, list):
                        # Exclude "N/A" and add valid topics to the list
                        initial_topics.extend([topic for topic in topic_list if topic and topic != "N/A"])
                except Exception as e:
                    print(f"Error parsing topics in field '{field}': {e}")

topics = list(set(initial_topics))

model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings for all topics
topic_embeddings = model.encode(topics, convert_to_tensor=True)

# Identify and remove similar topics
threshold = 0.8
unique_topics = []
for i, topic in enumerate(topics):
    is_similar = False
    for j, unique_topic in enumerate(unique_topics):
        sim = util.cos_sim(topic_embeddings[i], model.encode(unique_topic, convert_to_tensor=True))
        if sim > threshold:
            is_similar = True
            break
    if not is_similar:
        unique_topics.append(topic)

# Replace the original topics list with unique topics
topics = unique_topics

# Print the filtered topics
print("Filtered Topics:")
print(topics)

student_templates = [
    "I am passionate about {topic} and want to explore more in this field.",
    "I am looking to enhance my skills in {topic}.",
    "I want to work on projects related to {topic}.",
    "I am interested in learning about {topic} and its applications.",
    "I want to bated by {topic} and its potential to solve real-world problems.",
    "I am eager tuild a career in {topic}.",
    "I am fascino collaborate on research in {topic}.",
    "I want to gain hands-on experience in {topic}.",
    "I am excited to explore the latest advancements in {topic}.",
    "I want to contribute to innovative solutions in {topic}."
]

Filtered Topics:
['Artificial intelligence for renewable energy', 'Agile Methodologies', 'Numerical Methods', 'Materials & Manufacturing', 'Low carbon cements', 'Data Structures and Algorithms', 'Smart Energy-Efficient Buildings', 'Energy storage technologies', 'Mobile system', 'OOP', 'Electrical machines', 'Electrochemical Energy storage Devices \xa0(Supercapacitor, Fuel Cell, Batteries)', 'Enterprise Systems and e-Commerce Application', 'Concurrent Programming', 'Data Analytics using Python', 'Affordable housing', 'Information Security tools', 'Biochemical/ Bioprocess Engineering', 'Data Mining', 'IST2024 Applied Statistics', 'Mathematics', 'IT Project Management', 'Intelligent Control: Neural Networks, Fuzzy Controller, expert Systems', 'Sports Engineering and Performance Analysis', 'Process Design and Simulation', 'Solid state', 'Movement Detection', 'AI for Healthcare', 'Explainable AI', 'Optical Fiber for sensing', 'CHE1034 Organic Chemistry', 'Python Programming', 'Battery manag

In [10]:
# Generate dataset
random.seed(42)
data = []

for _ in range(10000):  # 10000 samples
    topic_true = random.choice(topics)  # Randomly select a true topic
    student_text = random.choice(student_templates).format(topic=topic_true)  # Insert the topic into the template
    
    # 50% match
    if random.random() < 0.5:
        lecturer_topics = random.sample(topics, 2) + [topic_true]
        match = 1
    else:
        lecturer_topics = random.sample([t for t in topics if t != topic_true], 3)
        match = 0

    data.append({
        "student_preference": student_text,
        "lecturer_topics": lecturer_topics,
        "match": match
    })

df = pd.DataFrame(data)

# Save the dataset to a CSV file
df.to_csv("data\\data_all_topics.csv", index=False)

# Print a sample of the dataset
print("\nSample dataset using all topics:")
print(df.sample(5))


Sample dataset using all topics:
                                     student_preference  \
8745  I want to contribute to innovative solutions i...   
211   I am eager tuild a career in Fixed Wing Drone ...   
2018  I am passionate about Data Structures and Algo...   
3915  I want to bated by Generative AI, Prompt Engin...   
7921  I am excited to explore the latest advancement...   

                                        lecturer_topics  match  
8745  [Engineering Materials, Climate Change, Requir...      0  
211   [Information Technology Operation Management, ...      0  
2018  [Vibrations, Process design and optimization, ...      1  
3915  [Genomics, Information Systems and Design, Gen...      1  
7921  [Energy Engineering, Modern Physics, Object-or...      0  


In [11]:
# Generate dataset using only initial topics
random.seed(42)
data_initial_topics = []

for _ in range(10000):  # 10000 samples
    topic_true = random.choice(initial_topics)  # Randomly select a true topic from the initial topic list
    student_text = random.choice(student_templates).format(topic=topic_true)  # Insert the topic into the template
    
    # 50% match
    if random.random() < 0.5:
        lecturer_topics = random.sample(initial_topics, 2) + [topic_true]  # Ensure lecturer topics are from initial_topics
        match = 1
    else:
        lecturer_topics = random.sample([t for t in initial_topics if t != topic_true], 3)  # Exclude the true topic
        match = 0

    data_initial_topics.append({
        "student_preference": student_text,
        "lecturer_topics": lecturer_topics,
        "match": match
    })

# Convert to DataFrame
df_initial_topics = pd.DataFrame(data_initial_topics)

# Save the dataset to a CSV file (optional)
df_initial_topics.to_csv("data\\data_initial_topics.csv", index=False)

# Print a sample of the dataset
print("\nSample dataset using initial topics:")
print(df_initial_topics.sample(5))


Sample dataset using initial topics:
                                     student_preference  \
3585  I want to gain hands-on experience in Mass Tra...   
3124  I am eager tuild a career in Earthquake engine...   
6128  I am passionate about Business Intelligence an...   
7848  I am eager tuild a career in Model checking an...   
3317  I want to work on projects related to Robotic,...   

                                        lecturer_topics  match  
3585  [Digital image processing and computer vision,...      1  
3124  [Computer Hacking Forensic Investigator, Robot...      1  
6128  [Data Analytics, C++, Python, Computational Fl...      0  
7848  [Sensors and embedded systems, Thermodynamics,...      0  
3317  [Programming for Engineers, Evolutionary Compu...      1  


In [12]:
# Load Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to compute match
def predict_match(student_text, lecturer_topics, threshold=0.6):
    student_emb = model.encode(student_text, convert_to_tensor=True)
    topic_embs = model.encode(lecturer_topics, convert_to_tensor=True)
    
    similarities = util.cos_sim(student_emb, topic_embs)
    max_sim = float(similarities.max())
    
    return 1 if max_sim > threshold else 0

# Apply predict_match to the dataset using all topics
df["predicted_match"] = df.apply(
    lambda row: predict_match(row["student_preference"], row["lecturer_topics"]), axis=1
)

# Evaluate accuracy for the dataset using all topics
print("Accuracy for dataset using all topics:", accuracy_score(df["match"], df["predicted_match"]))
print("\nClassification Report for dataset using all topics:\n", classification_report(df["match"], df["predicted_match"]))

# Apply predict_match to the dataset using only initial topics
df_initial_topics["predicted_match"] = df_initial_topics.apply(
    lambda row: predict_match(row["student_preference"], row["lecturer_topics"]), axis=1
)

# Evaluate accuracy for the dataset using only initial topics
print("\nAccuracy for dataset using only initial topics:", accuracy_score(df_initial_topics["match"], df_initial_topics["predicted_match"]))
print("\nClassification Report for dataset using only initial topics:\n", classification_report(df_initial_topics["match"], df_initial_topics["predicted_match"]))

Accuracy for dataset using all topics: 0.9439

Classification Report for dataset using all topics:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95      4939
           1       1.00      0.89      0.94      5061

    accuracy                           0.94     10000
   macro avg       0.95      0.94      0.94     10000
weighted avg       0.95      0.94      0.94     10000


Accuracy for dataset using only initial topics: 0.9079

Classification Report for dataset using only initial topics:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91      4940
           1       1.00      0.82      0.90      5060

    accuracy                           0.91     10000
   macro avg       0.92      0.91      0.91     10000
weighted avg       0.92      0.91      0.91     10000

