# USE GEMINI TO LABEL STUDENT PREFERENCES ACCORDING TO TOPICS

## Import supervisor list and remove duplicate topics

In [1]:
import csv
import pandas as pd

def import_supervisors(file_path):
    """Read a supervisor CSV file into a list of row dictionaries.

    Each data row becomes a dict keyed by the CSV header row
    (``csv.DictReader``). Problems are reported with ``print`` instead of
    being raised; any rows read before an error occurred are still returned.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list with one dict per data row. The list is empty when the file
        does not exist or could not be read at all.
    """
    supervisors = []
    try:
        # newline='' is what the csv module expects of the file object:
        # without it, universal-newline translation rewrites line breaks
        # that are embedded inside quoted fields before the reader sees them.
        with open(file_path, mode='r', encoding='utf-8', newline='') as file:
            for row in csv.DictReader(file):
                supervisors.append(row)
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
    return supervisors


def supervisors_to_dataframe(supervisors_csv):
    """Build a pandas DataFrame from the imported supervisor rows.

    Args:
        supervisors_csv: Row data accepted by ``pd.DataFrame`` (for example
            the list of dicts produced by ``import_supervisors``).

    Returns:
        The resulting DataFrame, or ``None`` when construction fails (the
        error is printed rather than raised).
    """
    try:
        frame = pd.DataFrame(supervisors_csv)
    except Exception as e:
        print(f"An error occurred while converting to DataFrame: {e}")
        return None
    return frame
    

def combine_expertise_topics(row, expertise_columns):
    """Merge the topics of several expertise columns into one string.

    Each cell may hold a list/tuple of topics, the string representation of
    such a list (as read back from a CSV), a plain topic string, or a missing
    value (``None`` / NaN / empty string).

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by column
            name.
        expertise_columns: Names of the columns whose topics are combined.

    Returns:
        The unique, whitespace-stripped topics joined with ``', '`` in
        first-seen order; an empty string when there are none.
    """
    # Local import keeps this helper usable from cells that have not
    # imported ``ast`` themselves.
    from ast import literal_eval

    all_topics = []
    for col in expertise_columns:
        value = row[col]
        if isinstance(value, (list, tuple)):
            topics = value
        elif value is None or pd.isna(value):
            # Missing cell: nothing to contribute.
            continue
        else:
            text = str(value).strip()
            if not text:
                continue
            # SECURITY: cell text comes from a data file, so it is parsed
            # with literal_eval (literals only) and never with eval().
            try:
                parsed = literal_eval(text)
            except (ValueError, SyntaxError, TypeError):
                # Not a Python literal: treat the text as a single topic.
                parsed = text
            topics = parsed if isinstance(parsed, (list, tuple)) else [parsed]

        for topic in topics:
            # Skip missing items inside a list (None or float NaN).
            if topic is None or (isinstance(topic, float) and topic != topic):
                continue
            cleaned = str(topic).strip()
            if cleaned:
                all_topics.append(cleaned)

    # dict.fromkeys removes duplicates while preserving first-seen order.
    return ', '.join(dict.fromkeys(all_topics))

# Generate supervisor ID and a randomised capacity
# Add a 'topics' column that is based on the 'Expertise Area 1', 'Expertise Area 2', and 'Expertise Area 3' columns
def generate_supervisor_data(supervisors_df):
    """Add ``supervisor_id``, ``capacity`` and ``topics`` columns in place.

    ``supervisor_id`` numbers the rows from 1, ``capacity`` is a random
    integer between 3 and 10 (inclusive) per row, and ``topics`` is the
    combined text of the three 'Expertise Area' columns.

    Args:
        supervisors_df: DataFrame of supervisors; it is modified in place.

    Returns:
        The same DataFrame with the new columns, or ``None`` when the input
        is ``None`` or empty.
    """
    import random

    if supervisors_df is None or supervisors_df.empty:
        print("No data to process.")
        return None

    row_count = len(supervisors_df)
    supervisors_df['supervisor_id'] = range(1, row_count + 1)

    # One random capacity per supervisor row.
    capacities = []
    for _ in range(row_count):
        capacities.append(random.randint(3, 10))
    supervisors_df['capacity'] = capacities

    expertise_columns = ['Expertise Area 1', 'Expertise Area 2', 'Expertise Area 3']

    def merge_row_topics(row):
        return combine_expertise_topics(row, expertise_columns)

    supervisors_df['topics'] = supervisors_df.apply(merge_row_topics, axis=1)
    return supervisors_df


## Use Gemini to standardise supervisor topics for easier labeling

In [2]:
import google.generativeai as genai
import json
import os
from IPython.display import display, Markdown # For better display in notebooks
from ast import literal_eval

# --- Configuration ---
# SECURITY: the API key must never be hard-coded in the notebook. Supply it
# through the GOOGLE_API_KEY environment variable (for example, export it in
# the shell that launches Jupyter). A key that was previously committed to
# this file should be treated as compromised and revoked.
try:
    # Configure the client from the environment variable only.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print("Warning: GOOGLE_API_KEY environment variable not set.")
    genai.configure(api_key=api_key)
except Exception as e:
    print(f"Error configuring Gemini API: {e}")
    print("Please ensure your GOOGLE_API_KEY is correctly set.")
    exit(1)

# `model` stays None when initialisation fails so later code can check it
# before calling the API.
try:
    model = genai.GenerativeModel('gemini-2.0-flash')
except Exception as e:
    print(f"Error initializing Gemini model: {e}")
    model = None


# --- Helper Functions ---
def _clean_expertise_terms(values):
    """Return the stripped, non-empty string form of each usable item in *values*."""
    cleaned = []
    for value in values:
        # Skip missing entries (None or float NaN) instead of emitting 'None'/'nan'.
        if value is None or (isinstance(value, float) and value != value):
            continue
        text = str(value).strip()
        if text:
            cleaned.append(text)
    return cleaned


def extract_unique_expertise_terms(df, expertise_cols):
    """Extract all unique, non-empty expertise terms from the given columns.

    A cell may hold a list of terms, a string that looks like a Python list
    literal (starts with ``[``), or any other single value. Missing cells
    (NaN/None) are ignored. A column name that is not in ``df`` only
    produces a printed warning.

    Args:
        df: DataFrame containing the expertise columns.
        expertise_cols: Names of the columns to scan.

    Returns:
        Sorted list of the unique terms, each converted to ``str`` and
        stripped of surrounding whitespace.
    """
    all_terms = set()
    for col in expertise_cols:
        if col not in df.columns:
            print(f"Warning: Column '{col}' not found in DataFrame.")
            continue

        for item in df[col].dropna():
            if isinstance(item, list):
                all_terms.update(_clean_expertise_terms(item))
                continue

            parsed = item
            if isinstance(item, str) and item.strip().startswith("["):
                # SECURITY: cell text comes from a data file, so it is parsed
                # with literal_eval (literals only) and never with eval().
                try:
                    parsed = literal_eval(item.strip())
                except (ValueError, SyntaxError, TypeError):
                    # Not a valid list literal: keep the raw text as one term.
                    parsed = item

            if isinstance(parsed, list):
                all_terms.update(_clean_expertise_terms(parsed))
            else:
                all_terms.update(_clean_expertise_terms([parsed]))
    return sorted(all_terms)

def get_standardisation_map_from_gemini(unique_terms_list):
    """
    Ask Gemini for a mapping from each expertise term to a standardised term.

    Uses the module-level ``model``. Returns a dictionary
    {"original_term": "standardised_term"}; terms that Gemini leaves out are
    mapped to themselves. Returns {} when no terms are given and None when
    the model is unavailable, the API call fails, or the reply is not a JSON
    object.
    """
    if not model:
        print("Gemini model not initialized. Cannot proceed.")
        return None
    if not unique_terms_list:
        print("No unique terms provided to standardise.")
        return {}

    prompt = f"""
    You are an expert academic research field categorizer and data normalizer.
    I have a list of expertise areas extracted from a dataset of supervisors.
    Many of these terms are variations of the same concept (e.g., "IoT", "Internet of Things", "Industrial IoT")
    or very closely related.

    Your task is to analyze the following list of unique expertise terms and create a JSON object
    that maps each original term to a single, consistent, standardised "umbrella" term.
    Your aim is to reduce redundancy and ensure that similar or synonymous terms are grouped under 
    a single standardised term to be used for labeling and categorization of student's preferences in a university database.

    Guidelines:
    1. The standardised term should be a concise and commonly understood representation of the concept.
    2. If an original term is already a good standard, it can map to itself.
    3. Group synonymous or similar terms under ONE standardised term. For example, if "Machine Learning", "ML", and "Deep Learning" are present, they might all map to "Machine Learning" or you might decide "Deep Learning" should map to "Deep Learning" if it's distinct enough, while "ML" maps to "Machine Learning". Use your best judgment to create meaningful umbrella terms.
    4. The output MUST be a single JSON object where keys are the *original* expertise terms from the input list, and values are their corresponding *standardised* umbrella terms. Every term from the input list must be a key in the output JSON.
    5. Do not include any explanatory text outside the JSON object. Just the JSON.

    List of unique expertise terms:
    {json.dumps(unique_terms_list)}

    Please provide the JSON mapping:
    """

    print("Sending request to Gemini API...")
    try:
        reply = model.generate_content(prompt)
        # The reply may be wrapped in a markdown code fence; peel it off.
        payload = reply.text.strip()
        payload = payload.removeprefix("```json").removeprefix("```")
        payload = payload.removesuffix("```").strip()

        try:
            mapping = json.loads(payload)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from Gemini: {e}")
            print("Raw response text from Gemini:")
            print(reply.text) # print the raw response for debugging
            return None

        # Only a JSON object (dict) is an acceptable mapping.
        if not isinstance(mapping, dict):
            print("Error: Gemini did not return a valid JSON dictionary.")
            print("Raw response:", reply.text)
            return None

        # Any input term Gemini left out falls back to mapping to itself.
        missing_keys = [term for term in unique_terms_list if term not in mapping]
        if missing_keys:
            print(f"Warning: Gemini's map is missing keys for: {missing_keys}")
            mapping.update({term: term for term in missing_keys})
        return mapping
    except Exception as e:
        print(f"Error calling Gemini API: {e}")
        details = getattr(e, 'response', None)
        if details: # More detailed error if available
            print(f"Gemini API Error Details: {details}")
        return None

# --- Main Processing ---

# 1. Load CSV
csv_file_path = 'data\\supervisors_list.csv' # <--- CHANGE FILENAME
expertise_columns = ['Expertise Area 1', 'Expertise Area 2', 'Expertise Area 3']


def _parse_expertise_cell(cell):
    """Turn a CSV cell holding a list literal back into a list.

    Non-string cells (e.g. NaN for an empty cell) and strings that are not
    valid Python literals are returned unchanged, so one bad cell does not
    abort the load.
    """
    if not isinstance(cell, str):
        return cell
    try:
        return literal_eval(cell)
    except (ValueError, SyntaxError):
        return cell


try:
    supervisors_df = pd.read_csv(csv_file_path)
    # Collect the columns that exist in a new list: removing items from
    # `expertise_columns` while iterating over it would skip the column
    # that follows each removed one.
    available_columns = []
    for col in expertise_columns:
        if col not in supervisors_df.columns:
            print(f"Warning: Column '{col}' not found in CSV. Skipping standardisation for this column.")
            continue
        # Ensure expertise columns are treated as lists
        supervisors_df[col] = supervisors_df[col].apply(_parse_expertise_cell)
        available_columns.append(col)
    expertise_columns = available_columns
except FileNotFoundError:
    print(f"Error: '{csv_file_path}' not found. Using dummy data for demonstration.")
    data = {
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Department': ['CS', 'CS', 'AI', 'CS', 'EE'],
        'Preferred Programme for Supervision (1st Choice)': ['PhD CS', 'MSc AI', 'PhD AI', 'MSc DS', 'PhD EE'],
        'Preferred Programme for Supervision (2nd Choice)': ['MSc AI', 'PhD CS', 'MSc DS', 'PhD CS', 'MSc CS'],
        'Expertise Area 1': ['Machine Learning', 'Software Architecture', 'Natural Language Processing', 'Data Mining', 'IoT'],
        'Expertise Area 2': ['Deep Learning', 'Agile Development', pd.NA, 'Big Data Analytics', 'Internet of Things'],
        'Expertise Area 3': ['Computer Vision', pd.NA, 'Ethics in AI', 'Cloud Computing', 'Industrial IoT']
    }
    supervisors_df = pd.DataFrame(data)

# Show the first rows of the loaded (or dummy) supervisor data.
print("Original DataFrame sample:")
display(supervisors_df.head())

# 2. Extract All Unique Expertise Terms
unique_terms = extract_unique_expertise_terms(supervisors_df, expertise_columns)
if not unique_terms:
    print("No expertise terms found to process. Exiting.")
    exit()
else:
    print(f"\nFound {len(unique_terms)} unique expertise terms to standardise:")
    print(unique_terms)

    # 3. Get standardisation Map from Gemini (only if model initialized and terms exist)
    standardisation_dictionary = None
    if model and unique_terms:
        standardisation_dictionary = get_standardisation_map_from_gemini(unique_terms)

    # Everything below runs only when a non-empty mapping was obtained.
    if standardisation_dictionary:
        print("\n--- standardisation Map from Gemini (Review this carefully!) ---")
        # Pretty print the dictionary for review
        display(Markdown("```json\n" + json.dumps(standardisation_dictionary, indent=2) + "\n```"))

        # --- Maybe include manual review here? ---

        # 4. Apply Mapping to Create standardised Expertise Columns
        print("\nApplying standardisation map to DataFrame...")
        for i, col_name in enumerate(expertise_columns):
            if col_name in supervisors_df.columns:
                standardised_col_name = f'standardised Expertise {i+1}'
                # Each cell becomes a list of standardised terms:
                #   - list cell: every (stripped) term is looked up in the map,
                #     falling back to the stripped term itself when unmapped;
                #   - other non-missing, non-blank cell: treated as one term;
                #   - anything else: an empty list.
                supervisors_df[standardised_col_name] = supervisors_df[col_name].apply(
                    lambda topics: [standardisation_dictionary.get(t.strip(), t.strip()) for t in topics] if isinstance(topics, list)
                    else [standardisation_dictionary.get(str(topics), str(topics))] if pd.notna(topics) and str(topics).strip()
                    else []
                )
                print(supervisors_df[standardised_col_name])
            else:
                print(f"Skipping standardisation for non-existent column: {col_name}")


        # 5. Combine standardised Expertise into a single column
        # Keep only the standardised column names that were actually created above.
        standardised_expertise_cols = [f'standardised Expertise {i+1}' for i in range(len(expertise_columns)) if f'standardised Expertise {i+1}' in supervisors_df.columns]

        if standardised_expertise_cols: # only proceed if standardised columns were created
            # De-duplicated, comma-separated topics per supervisor.
            supervisors_df['standardised Topics'] = supervisors_df.apply(
                lambda x: combine_expertise_topics(x, standardised_expertise_cols),
                axis=1
            )

            print("\nDataFrame with standardised Expertise:")
            display(supervisors_df[['Name'] + expertise_columns + standardised_expertise_cols + ['standardised Topics']].head())

            # 6. Save Outputs
            # Save the standardisation map to a JSON file
            map_output_path = 'data\\gemini_standardisation_map.json'
            with open(map_output_path, 'w') as f:
                json.dump(standardisation_dictionary, f, indent=4)
            print(f"\nstandardisation map saved to: {map_output_path}")

            # Save the augmented DataFrame to CSV
            csv_output_path = 'data\\supervisors_standardised_gemini.csv'
            supervisors_df.to_csv(csv_output_path, index=False)
            print(f"Augmented DataFrame saved to CSV: {csv_output_path}")

            # Example: Further manipulation - unique standardised topics
            if 'standardised Topics' in supervisors_df.columns:
                # Split the combined string back into single terms, one per row,
                # then drop blanks and duplicates.
                unique_standardised_topics_list = supervisors_df['standardised Topics'].str.split(', ').explode().str.strip()
                unique_standardised_topics_list = unique_standardised_topics_list[unique_standardised_topics_list != ''].unique()
                print("\nUnique individual standardised topic terms found across all supervisors:")
                print(sorted(list(unique_standardised_topics_list)))

                # Save unique standardised topics to CSV
                unique_topics_df = pd.DataFrame({'standardised Topic': sorted(list(unique_standardised_topics_list))})
                unique_topics_df.to_csv('data\\unique_standardised_topics.csv', index=False)
                print("Unique standardised topics saved to: data\\unique_standardised_topics.csv")
        else:
            print("\nNo standardised expertise columns were created. Skipping combination and saving of DataFrame.")

    else:
        print("\nFailed to get standardisation map from Gemini. No changes applied to DataFrame.")

Original DataFrame sample:


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Name,Department,Preferred Programme for Supervision (1st Choice),Preferred Programme for Supervision (2nd Choice),Expertise Area 1,Expertise Area 2,Expertise Area 3
0,Ali Afzalian Mand,DDSAI,No Preference,No Preference,[Machine Learning Theory],[AI for Healthcare],"[Deep Learning, Neural Networks]"
1,Assoc. Prof. Dr Anwar P.P. Abdul Majeed,DDSAI,BSDA,BCS / BSE / BIT,"[Machine Learning, Deep Learning]",[Data Analytics],[Robotics]
2,Assoc. Prof. Dr Azam Che Idris,DDSAI,BSDA,BCS / BSE / BIT,"[DEEP LEARNING, MACHINE LEARNING]",[TIME SERIES ANALYSIS],[COMPUTER VISION]
3,Assoc. Prof. Dr Muhammed Basheer Jasser,DDSAI,BCS / BSE / BIT,BSDA,"[Machine Learning, Artificial Intelligence]",[Swarm and Evolutionary Computing],"[Software Engineering, Software Modeling]"
4,Assoc. Prof. Dr Aslina Baharum,DDSAI,BCS / BSE / BIT,BSDA,"[AI-UX, UX/UI Research & Design, HCI, Interact...","[Software Engineering & Development, Informati...",[Information and Communication Technology (ICT...



Found 116 unique expertise terms to standardise:
['AI', 'AI applications in Robotics', 'AI for Healthcare', 'AI-UX', 'AR', 'Agentic AI', 'Antenna Design', 'Application Development', 'Application development', 'Applied AI', 'Applied Generative AI', 'Applied Machine Learning', 'Applied machine learning', 'Artificial Intelligence', 'Automated Test and Measurement Systems', 'Battery Energy Storage Management', 'Big Data Analysis', 'Blockchain', 'COMPUTER VISION', 'Chatbots', 'Cloud Computing', 'Clustering Algorithms & Optimization', 'Commercial Projects', 'Computational Intelligence', 'Computer Engineering', 'Computer Graphic', 'Computer Networking', 'Computer Networks', 'Computer Science', 'Computer Vision', 'Computer Vision & Image Processing', 'Computing study with qualitative & quantitative data (survey,interview)', 'Cybersecurity', 'DEEP LEARNING', 'Data Analytics', 'Data Mining', 'Databases', 'Deep Learning', 'Deep learning', 'Development', 'Digital Image Processing', 'Distributed S

```json
{
  "AI": "Artificial Intelligence",
  "AI applications in Robotics": "Robotics",
  "AI for Healthcare": "AI for Healthcare",
  "AI-UX": "UX/UI Design",
  "AR": "Augmented Reality",
  "Agentic AI": "Artificial Intelligence",
  "Antenna Design": "Antenna Design",
  "Application Development": "Application Development",
  "Application development": "Application Development",
  "Applied AI": "Artificial Intelligence",
  "Applied Generative AI": "Generative AI",
  "Applied Machine Learning": "Machine Learning",
  "Applied machine learning": "Machine Learning",
  "Artificial Intelligence": "Artificial Intelligence",
  "Automated Test and Measurement Systems": "Automated Test and Measurement Systems",
  "Battery Energy Storage Management": "Renewable Energy System Management",
  "Big Data Analysis": "Data Analytics",
  "Blockchain": "Blockchain",
  "COMPUTER VISION": "Computer Vision",
  "Chatbots": "Natural Language Processing",
  "Cloud Computing": "Cloud Computing",
  "Clustering Algorithms & Optimization": "Clustering Algorithms & Optimization",
  "Commercial Projects": "Commercial Projects",
  "Computational Intelligence": "Artificial Intelligence",
  "Computer Engineering": "Computer Engineering",
  "Computer Graphic": "Computer Graphics",
  "Computer Networking": "Computer Networking",
  "Computer Networks": "Computer Networking",
  "Computer Science": "Computer Science",
  "Computer Vision": "Computer Vision",
  "Computer Vision & Image Processing": "Computer Vision",
  "Computing study with qualitative & quantitative data (survey,interview)": "Qualitative Research",
  "Cybersecurity": "Cybersecurity",
  "DEEP LEARNING": "Deep Learning",
  "Data Analytics": "Data Analytics",
  "Data Mining": "Data Mining",
  "Databases": "Databases",
  "Deep Learning": "Deep Learning",
  "Deep learning": "Deep Learning",
  "Development": "Software Engineering",
  "Digital Image Processing": "Image Processing",
  "Distributed System": "Distributed Systems",
  "Distributed haptics": "Distributed Haptics",
  "E-commerce games": "E-commerce games",
  "Electronics": "Electronics",
  "Embedded System": "Embedded Systems",
  "Embedded System Development": "Embedded Systems",
  "Embedded system applications": "Embedded Systems",
  "Embeded Systems": "Embedded Systems",
  "Environment": "Environment",
  "Extended reality (VR,AR,MR)": "Extended Reality",
  "Fiber Optic Sensor": "Fiber Optic Sensor",
  "GenAI": "Generative AI",
  "Generative AI Usage Ethics": "Generative AI",
  "Green computing": "Green Computing",
  "HCI": "Human-Computer Interaction",
  "High-speed computer and Telecommunications networks": "Computer Networking",
  "Image Processing": "Image Processing",
  "Image and computer vision": "Computer Vision",
  "Industrial IoT": "Internet of Things",
  "Information Security": "Cybersecurity",
  "Information System": "Information Systems",
  "Information Visualization & Analytics": "Information Visualization",
  "Information and Communication Technology (ICT)/ Information Technology (IT)/ Multimedia/ Information System (IS)": "Information Technology",
  "Interaction Design": "Interaction Design",
  "Internet of Things (IoT)": "Internet of Things",
  "IoT": "Internet of Things",
  "IoT Applications": "Internet of Things",
  "MACHINE LEARNING": "Machine Learning",
  "Machine": "Machine Learning",
  "Machine Learning": "Machine Learning",
  "Machine Learning Theory": "Machine Learning",
  "Machine Learning\\Deep learning": "Deep Learning",
  "Machine learning": "Machine Learning",
  "Mining": "Data Mining",
  "Mixed Reality": "Mixed Reality",
  "Mobile Application Development": "Mobile Development",
  "Mobile Cellular Networks": "Wireless Communication",
  "Nanomaterial for Ultrashort Fiber Laser": "Nanomaterials",
  "Natural Language Processing": "Natural Language Processing",
  "Network": "Computer Networking",
  "Network Coding": "Network Coding",
  "Network Security": "Network Security",
  "Network architectures and protocols": "Computer Networking",
  "Neural Networks": "Neural Networks",
  "Neuroscience": "Neuroscience",
  "Operational optimisation for sustainability": "Operational Optimisation",
  "Pattern Recognition": "Pattern Recognition",
  "Photonic Devices": "Photonic Devices",
  "Product/Service Design": "Product Design",
  "Qualitative study": "Qualitative Research",
  "Renewable Energy System Management": "Renewable Energy System Management",
  "Robotics": "Robotics",
  "Signal Processing": "Signal Processing",
  "Smart transportation system": "Smart Cities",
  "Software Engineering": "Software Engineering",
  "Software Engineering & Development": "Software Engineering",
  "Software Modeling": "Software Engineering",
  "Statistical methods in data science": "Data Science",
  "Sustainable smart city": "Smart Cities",
  "Swarm and Evolutionary Computing": "Swarm and Evolutionary Computing",
  "TIME SERIES ANALYSIS": "Time Series Analysis",
  "Time Series Analysis": "Time Series Analysis",
  "TinyML": "Embedded Systems",
  "UI and UX": "UX/UI Design",
  "UX/UI Research & Design": "UX/UI Design",
  "Ultrasound Indoor Localization": "Ultrasound Indoor Localization",
  "VR": "Virtual Reality",
  "Wireless Communication": "Wireless Communication",
  "Wireless Networks": "Wireless Communication",
  "data analytics": "Data Analytics",
  "deep learning": "Deep Learning",
  "distributed systems": "Distributed Systems",
  "health": "Healthcare",
  "mobile development": "Mobile Development",
  "signal processing": "Signal Processing"
}
```


Applying standardisation map to DataFrame...
0                                    [Machine Learning]
1                     [Machine Learning, Deep Learning]
2                     [Deep Learning, Machine Learning]
3           [Machine Learning, Artificial Intelligence]
4     [UX/UI Design, UX/UI Design, Human-Computer In...
5               [Neuroscience, Healthcare, Environment]
6                     [Deep Learning, Machine Learning]
7                             [Artificial Intelligence]
8                                     [Computer Vision]
9                             [Artificial Intelligence]
10    [Mixed Reality, Augmented Reality, Virtual Rea...
11                            [Artificial Intelligence]
12                                   [Machine Learning]
13                                        [Data Mining]
14                            [Artificial Intelligence]
15    [Virtual Reality, Augmented Reality, Computer ...
16                            [Artificial Intelligence]
17

Unnamed: 0,Name,Expertise Area 1,Expertise Area 2,Expertise Area 3,standardised Expertise 1,standardised Expertise 2,standardised Expertise 3,standardised Topics
0,Ali Afzalian Mand,[Machine Learning Theory],[AI for Healthcare],"[Deep Learning, Neural Networks]",[Machine Learning],[AI for Healthcare],"[Deep Learning, Neural Networks]","Machine Learning, AI for Healthcare, Deep Lear..."
1,Assoc. Prof. Dr Anwar P.P. Abdul Majeed,"[Machine Learning, Deep Learning]",[Data Analytics],[Robotics],"[Machine Learning, Deep Learning]",[Data Analytics],[Robotics],"Machine Learning, Deep Learning, Data Analytic..."
2,Assoc. Prof. Dr Azam Che Idris,"[DEEP LEARNING, MACHINE LEARNING]",[TIME SERIES ANALYSIS],[COMPUTER VISION],"[Deep Learning, Machine Learning]",[Time Series Analysis],[Computer Vision],"Deep Learning, Machine Learning, Time Series A..."
3,Assoc. Prof. Dr Muhammed Basheer Jasser,"[Machine Learning, Artificial Intelligence]",[Swarm and Evolutionary Computing],"[Software Engineering, Software Modeling]","[Machine Learning, Artificial Intelligence]",[Swarm and Evolutionary Computing],"[Software Engineering, Software Engineering]","Machine Learning, Artificial Intelligence, Swa..."
4,Assoc. Prof. Dr Aslina Baharum,"[AI-UX, UX/UI Research & Design, HCI, Interact...","[Software Engineering & Development, Informati...",[Information and Communication Technology (ICT...,"[UX/UI Design, UX/UI Design, Human-Computer In...","[Software Engineering, Information Visualization]","[Information Technology, Computer Science]","UX/UI Design, Human-Computer Interaction, Inte..."



standardisation map saved to: data\gemini_standardisation_map.json
Augmented DataFrame saved to CSV: data\supervisors_standardised_gemini.csv

Unique individual standardised topic terms found across all supervisors:
['AI for Healthcare', 'Antenna Design', 'Application Development', 'Artificial Intelligence', 'Augmented Reality', 'Automated Test and Measurement Systems', 'Blockchain', 'Cloud Computing', 'Clustering Algorithms & Optimization', 'Commercial Projects', 'Computer Engineering', 'Computer Graphics', 'Computer Networking', 'Computer Science', 'Computer Vision', 'Cybersecurity', 'Data Analytics', 'Data Mining', 'Data Science', 'Databases', 'Deep Learning', 'Distributed Haptics', 'Distributed Systems', 'E-commerce games', 'Electronics', 'Embedded Systems', 'Environment', 'Extended Reality', 'Fiber Optic Sensor', 'Generative AI', 'Green Computing', 'Healthcare', 'Human-Computer Interaction', 'Image Processing', 'Information Systems', 'Information Technology', 'Information Visuali

## Test labeling accuracy

In [3]:
import google.generativeai as genai
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
import os
import io
import json
import time
import math

# --- Configuration ---
GEMINI_MODEL_NAME = "gemini-2.0-flash"
STUDENT_PREFERENCES_CSV = "data\\claude_sentences.csv"
STANDARDIZED_TOPICS_CSV = "data\\unique_standardised_topics.csv"
OUTPUT_CSV_WITH_GEMINI_LABELS = "data\\gemini_labeled_preferences.csv"
API_RETRY_LIMIT = 3               # attempts per batch before the batch is skipped
API_RETRY_DELAY_SECONDS = 5       # pause before retrying a failed batch
BATCH_SIZE = 50                   # sentences sent to Gemini per request
DELAY_BETWEEN_BATCHES_SECONDS = 2 # presumably a pause between batches; not used in the visible code
# SECURITY: the API key must never be hard-coded in the notebook. Supply it
# through the GOOGLE_API_KEY environment variable (for example, export it in
# the shell that launches Jupyter). A key that was previously committed to
# this file should be treated as compromised and revoked.

# --- 1. Configure Gemini API ---
try:
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY environment variable not set.")
    genai.configure(api_key=api_key)
except Exception as e:
    print(f"Error configuring Gemini API: {e}")
    exit()

# --- 2. Load Data ---
# Read the student preference sentences and the standardized topic list;
# stop the cell with a message when either file is missing or empty.
try:
    student_df_full = pd.read_csv(STUDENT_PREFERENCES_CSV)
    topics_df = pd.read_csv(STANDARDIZED_TOPICS_CSV)
except FileNotFoundError as e:
    print(f"Error: {e}. Make sure your CSV files are in the correct path.")
    exit()
except pd.errors.EmptyDataError as e:
    print(f"Error: {e}. One of your CSV files might be empty.")
    exit()


# The student file must provide the sentence text and the human labels.
required_cols = ['Entry', 'Positive_Topics', 'Negative_Topics']
if any(col not in student_df_full.columns for col in required_cols):
    print(f"Error: {STUDENT_PREFERENCES_CSV} must contain columns: {', '.join(required_cols)}")
    exit()

# Rename to the column names used by the rest of this cell.
student_df_full.rename(
    columns={
        'Entry': 'SentenceText',
        'Positive_Topics': 'Human_Positive_Topics',
        'Negative_Topics': 'Human_Negative_Topics',
    },
    inplace=True,
)

# Create a 1-based SentenceID from the index when the file has none.
if 'SentenceID' not in student_df_full.columns:
    student_df_full['SentenceID'] = student_df_full.index + 1

# The first column of the topics file holds the standardized topics.
if topics_df.empty or topics_df.columns.empty:
    print(f"Error: {STANDARDIZED_TOPICS_CSV} is empty or has no columns. It should have one column with topics.")
    exit()
standardized_topic_list = (
    topics_df.iloc[:, 0]
    .astype(str)
    .str.strip()
    .unique()
    .tolist()
)
print(f"Loaded {len(standardized_topic_list)} standardized topics: {standardized_topic_list[:5]}...") # Print first 5

# --- 3. Helper function to create prompt for a batch ---
def create_prompt_for_batch(batch_sentences_list, all_standardized_topics):
    sentences_json_for_prompt = json.dumps(batch_sentences_list, indent=2)
    prompt = f"""
You are an expert AI assistant specialized in classifying student project preferences.
Your task is to label a list of student preference sentences with relevant project topics, both positive and negative.
You MUST use ONLY the topics from the provided standardized list.

Standardized Topics List:
{', '.join(all_standardized_topics)}

Input Sentences for this batch (as a JSON array of objects):
{sentences_json_for_prompt}

Instructions:
1.  For each sentence object in the input JSON array, analyze the "SentenceText".
2.  Identify topics the student expresses a POSITIVE preference for.
3.  Identify topics the student expresses a NEGATIVE preference for.
4.  Topics MUST be chosen EXACTLY from the 'Standardized Topics List' above. Do not invent new topics or use variations. IF there is NO MATCH, label it as 'No Match'.
5.  Your output MUST be a valid JSON array of objects.
6.  Each object in your output array should correspond to an input sentence and have the following keys:
    *   "SentenceID": (string) The ID from the input sentence object.
    *   "Gemini_Positive_Topics": (array of strings) A list of positive topics. If no positive topics, label it as 'No Match'.
    *   "Gemini_Negative_Topics": (array of strings) A list of negative topics. If no negative topics, label it as 'No Match'.
7.  Ensure every SentenceID from the input batch is present in your output JSON array.
8.  Do NOT include the original 'SentenceText' in your output JSON, only the specified keys.

Example of expected output JSON format:
[
  {{
    "SentenceID": "S001",
    "Gemini_Positive_Topics": ["Machine Learning", "Artificial Intelligence"],
    "Gemini_Negative_Topics": ["Web Development"]
  }},
  {{
    "SentenceID": "S002",
    "Gemini_Positive_Topics": ["Data Science"],
    "Gemini_Negative_Topics": []
  }}
]

Begin your JSON output now (ensure it's a single, valid JSON array for this batch):
"""
    return prompt

# --- 4. Process Sentences in Batches ---
model = genai.GenerativeModel(GEMINI_MODEL_NAME)
# Parsed label records accumulated across every successfully processed batch.
all_gemini_results = []

num_batches = math.ceil(len(student_df_full) / BATCH_SIZE)
print(f"Processing in {num_batches} batches of size up to {BATCH_SIZE}.")

for i in range(num_batches):
    start_index = i * BATCH_SIZE
    end_index = start_index + BATCH_SIZE
    batch_df = student_df_full.iloc[start_index:end_index]

    print(f"\n--- Processing Batch {i+1}/{num_batches} ({len(batch_df)} sentences) ---")

    if batch_df.empty:
        print("Batch is empty, skipping.")
        continue

    # Sentences of this batch in the shape expected by the prompt builder.
    batch_sentences_to_label_list = [
        {"SentenceID": str(row['SentenceID']), "SentenceText": row['SentenceText']}
        for _, row in batch_df.iterrows()
    ]

    batch_prompt = create_prompt_for_batch(batch_sentences_to_label_list, standardized_topic_list)

    gemini_output_json_str = None
    current_batch_results = None

    for attempt in range(API_RETRY_LIMIT):
        # Reset on every attempt: if generate_content() itself raises, the
        # handlers below must not hit an unbound name (first attempt) or report
        # the prompt feedback of an earlier attempt/batch (later attempts).
        response = None
        try:
            print(f"Attempt {attempt + 1}/{API_RETRY_LIMIT} for batch {i+1}...")
            response = model.generate_content(
                batch_prompt,
                generation_config=genai.types.GenerationConfig(
                    # temperature=0.1
                )
            )
            if not response.parts:
                # No content came back; report why if the API told us.
                if response.prompt_feedback and response.prompt_feedback.block_reason:
                    print(f"Warning: Prompt for batch {i+1} was blocked. Reason: {response.prompt_feedback.block_reason}")
                else:
                    print(f"Warning: Gemini response for batch {i+1} has no parts.")
                if attempt < API_RETRY_LIMIT - 1:
                    print(f"Retrying batch {i+1} in {API_RETRY_DELAY_SECONDS} seconds...")
                    time.sleep(API_RETRY_DELAY_SECONDS)
                    continue
                else:
                    print(f"Max retries reached for problematic response for batch {i+1}. Skipping this batch.")
                    break # Break from retry loop for this batch

            gemini_output_json_str = response.text.strip()

            # Drop a surrounding markdown code fence if the model added one.
            if gemini_output_json_str.startswith("```json"):
                gemini_output_json_str = gemini_output_json_str[len("```json"):].strip()
            if gemini_output_json_str.endswith("```"):
                gemini_output_json_str = gemini_output_json_str[:-len("```")].strip()

            # If the text is still not a bare JSON array, cut out the span from
            # the first '[' to the last ']' and try to parse that instead.
            first_char = gemini_output_json_str[0] if gemini_output_json_str else ''
            last_char = gemini_output_json_str[-1] if gemini_output_json_str else ''
            if not ((first_char == '[' and last_char == ']')):
                json_start_index = gemini_output_json_str.find('[')
                json_end_index = gemini_output_json_str.rfind(']')
                if json_start_index != -1 and json_end_index > json_start_index :
                    gemini_output_json_str = gemini_output_json_str[json_start_index : json_end_index+1]
                else:
                    raise ValueError("Could not reliably extract JSON array from Gemini response for this batch.")

            current_batch_results = json.loads(gemini_output_json_str)
            if not isinstance(current_batch_results, list):
                raise ValueError("Gemini's output for batch was not a JSON list as expected.")

            print(f"Successfully processed batch {i+1}. Received {len(current_batch_results)} results.")
            all_gemini_results.extend(current_batch_results)
            break # Successful processing of this batch

        except json.JSONDecodeError as e:
            print(f"Error parsing Gemini's JSON output for batch {i+1} (attempt {attempt+1}): {e}")
            print("Raw output snippet:", gemini_output_json_str[:200] if gemini_output_json_str else "None")
            if attempt < API_RETRY_LIMIT - 1:
                time.sleep(API_RETRY_DELAY_SECONDS)
            else:
                print(f"Failed to parse JSON for batch {i+1} after {API_RETRY_LIMIT} attempts. Skipping this batch.")
        except Exception as e:
            print(f"Error during Gemini API call or processing for batch {i+1} (attempt {attempt+1}): {e}")
            # response stays None when the API call itself failed.
            if response is not None and getattr(response, 'prompt_feedback', None):
                 print(f"Prompt Feedback: {response.prompt_feedback}")
            if attempt < API_RETRY_LIMIT - 1:
                time.sleep(API_RETRY_DELAY_SECONDS)
            else:
                print(f"Failed to process batch {i+1} after {API_RETRY_LIMIT} attempts. Skipping this batch.")

    # Optional: Add a small delay between batch calls to be polite to the API
    if i < num_batches - 1: # Don't sleep after the last batch
        print(f"Waiting {DELAY_BETWEEN_BATCHES_SECONDS}s before next batch...")
        time.sleep(DELAY_BETWEEN_BATCHES_SECONDS)


# Stop here when not a single batch produced usable labels.
if not all_gemini_results:
    print("\nNo results were successfully processed from Gemini. Exiting.")
    exit()

# --- 5. Convert All Gemini Results to DataFrame ---
gemini_df = pd.DataFrame(all_gemini_results)
if gemini_df.empty:
    print("\nGemini DataFrame is empty after processing all batches. Exiting.")
    exit()

# Columns the later merge/report steps read from Gemini's output.
expected_gemini_cols = ["SentenceID", "Gemini_Positive_Topics", "Gemini_Negative_Topics"]
missing_cols = [name for name in expected_gemini_cols if name not in gemini_df.columns]
if missing_cols:
    print(f"Warning: Gemini's combined JSON output is missing columns: {', '.join(missing_cols)}. Will try to proceed.")
    # Give every row its own empty list so later steps find the column.
    for mc in missing_cols:
        gemini_df[mc] = [[] for _ in gemini_df.index]


print(f"\nSuccessfully parsed all Gemini batch outputs. Total results: {len(gemini_df)}")
print(gemini_df.head())


# --- 6. Merge Gemini Labels with Ground Truth ---
# Compare IDs as strings on both sides so the join key types agree.
for frame in (student_df_full, gemini_df):
    frame['SentenceID'] = frame['SentenceID'].astype(str)

if gemini_df.duplicated(subset=['SentenceID']).any():
    print("Warning: Gemini's combined output contains duplicate SentenceIDs. Keeping first occurrence.")
    gemini_df = gemini_df.drop_duplicates(subset=['SentenceID'], keep='first')

# Left join: every ground-truth sentence is kept, labelled or not.
merged_df = student_df_full.merge(gemini_df, on="SentenceID", how="left")

# Sentences without a Gemini row (or with a non-list value) get an empty list.
for col in ['Gemini_Positive_Topics', 'Gemini_Negative_Topics']:
    merged_df[col] = merged_df[col].apply(
        lambda cell: cell if isinstance(cell, list) else []
    )

# --- 7. Prepare for Classification Report ---
def preprocess_topics_from_list(topic_list_series, all_known_topics):
    """Keep only the recognised topics of each entry.

    Args:
        topic_list_series: Iterable whose items are expected to be lists of
            topic names; any item that is not a list yields an empty list.
        all_known_topics: Container of accepted topic names.

    Returns:
        One list per input item, holding the whitespace-stripped string
        members that appear in ``all_known_topics``, in their original order.
    """
    def _known_only(entry):
        if not isinstance(entry, list):
            return []
        stripped = (str(name).strip() for name in entry if isinstance(name, str))
        return [name for name in stripped if name in all_known_topics]

    return [_known_only(entry) for entry in topic_list_series]

def human_str_to_list(topic_series, all_known_topics):
    """Parse human-annotated topic cells into lists of recognised topics.

    Each cell may be a real list, the string form of a Python list
    (``"['A', 'B']"``), a ``;``-separated string, or empty/"none".

    Args:
        topic_series: pandas Series of raw topic cells (missing values allowed).
        all_known_topics: Container of accepted topic names.

    Returns:
        One list per cell containing only the topics found in
        ``all_known_topics``.
    """
    import ast  # local import so the function does not rely on a file-level one

    processed_list = []
    for item in topic_series.fillna(""):
        # If already a list, use it directly
        if isinstance(item, list):
            topics = [str(t).strip() for t in item if str(t).strip()]
        # If it's a string representation of a list, parse it as a literal.
        # ast.literal_eval only accepts Python literals, so text coming from a
        # CSV cell can never execute code (the previous eval() could).
        elif isinstance(item, str) and item.strip().startswith("[") and item.strip().endswith("]"):
            try:
                parsed = ast.literal_eval(item.strip())
                if isinstance(parsed, list):
                    topics = [str(t).strip() for t in parsed if str(t).strip()]
                else:
                    topics = [str(parsed).strip()] if str(parsed).strip() else []
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a valid literal: keep the raw text as a single candidate.
                topics = [item.strip()] if item.strip() else []
        elif pd.isna(item) or str(item).lower() == "none" or str(item).strip() == "":
            topics = []
        else:
            topics = [t.strip() for t in str(item).split(';') if t.strip()]
        valid_topics = [t for t in topics if t in all_known_topics]
        processed_list.append(valid_topics)
    return processed_list

# Turn each topic column into a cleaned list-of-lists restricted to the
# standardised topics.
human_pos_topics_list, human_neg_topics_list = (
    human_str_to_list(merged_df[column], standardized_topic_list)
    for column in ('Human_Positive_Topics', 'Human_Negative_Topics')
)
gemini_pos_topics_list, gemini_neg_topics_list = (
    preprocess_topics_from_list(merged_df[column], standardized_topic_list)
    for column in ('Gemini_Positive_Topics', 'Gemini_Negative_Topics')
)

# One binarizer with a fixed class list is used for all four matrices, so
# their columns line up topic-for-topic.
mlb = MultiLabelBinarizer(classes=standardized_topic_list)
y_human_pos = mlb.fit_transform(human_pos_topics_list)
y_human_neg = mlb.transform(human_neg_topics_list)
y_gemini_pos = mlb.transform(gemini_pos_topics_list)
y_gemini_neg = mlb.transform(gemini_neg_topics_list)

# --- 8. Generate Classification Reports ---
print("\n--- Classification Report for POSITIVE Topics ---")

# y_human_pos / y_gemini_pos are indicator matrices with one column per entry
# of mlb.classes_. A column is "active" when at least one human or Gemini
# label falls in it.
pos_label_totals = y_human_pos.sum(axis=0) + y_gemini_pos.sum(axis=0)
active_pos_class_indices = [
    column for column, total in enumerate(pos_label_totals) if total > 0
]

if active_pos_class_indices:
    # Names of the active columns, in the same order as the indices.
    active_pos_target_names = [mlb.classes_[column] for column in active_pos_class_indices]

    print(f"Debug: Number of active positive class indices: {len(active_pos_class_indices)}")
    print(f"Debug: Number of active positive target names: {len(active_pos_target_names)}")
else:
    print("No positive topics found in either human or Gemini labels. Skipping positive report or reporting on all.")
    # Fall back to reporting on every known class.
    active_pos_class_indices = list(range(len(mlb.classes_)))
    if active_pos_class_indices:
        active_pos_target_names = mlb.classes_

if active_pos_class_indices:
    report_pos_str = classification_report(
        y_human_pos,
        y_gemini_pos,
        labels=active_pos_class_indices,
        target_names=active_pos_target_names,
        zero_division=0,
        output_dict=False
    )
else:
    # Only reachable when the binarizer has no classes at all.
    report_pos_str = "No topics defined in MLB, cannot generate report."
print(report_pos_str)


print("\n--- Classification Report for NEGATIVE Topics ---")
# A column is "active" when at least one human or Gemini negative label falls
# in it; only active columns are reported.
neg_label_totals = y_human_neg.sum(axis=0) + y_gemini_neg.sum(axis=0)
active_neg_class_indices = [
    column for column, total in enumerate(neg_label_totals) if total > 0
]

if active_neg_class_indices:
    # Names of the active columns, in the same order as the indices.
    active_neg_target_names = [mlb.classes_[column] for column in active_neg_class_indices]

    print(f"Debug: Number of active negative class indices: {len(active_neg_class_indices)}")
    print(f"Debug: Number of active negative target names: {len(active_neg_target_names)}")
else:
    print("No negative topics found in either human or Gemini labels. Skipping negative report or reporting on all.")
    # Fall back to reporting on every known class.
    active_neg_class_indices = list(range(len(mlb.classes_)))
    if active_neg_class_indices:
        active_neg_target_names = mlb.classes_

if active_neg_class_indices:
    report_neg_str = classification_report(
        y_human_neg,
        y_gemini_neg,
        labels=active_neg_class_indices,
        target_names=active_neg_target_names,
        zero_division=0,
        output_dict=False
    )
else:
    # Only reachable when the binarizer has no classes at all.
    report_neg_str = "No topics defined in MLB, cannot generate report."
print(report_neg_str)

# --- 9. Save Output ---
def list_to_str(lst):
    """Join a list into a ';'-separated string of its sorted, unique items.

    Anything that is not a list yields an empty string.
    """
    if not isinstance(lst, list):
        return ""
    unique_sorted = sorted(set(lst))  # de-duplicate, then order for stable output
    return ";".join(unique_sorted)

# Columns written to the output CSV, in order.
output_columns = [
    'SentenceID', 'SentenceText',
    'Human_Positive_Topics', 'Human_Negative_Topics',
    'Gemini_Positive_Topics_Str', 'Gemini_Negative_Topics_Str'
]

# NOTE(review): the *_Str columns receive the topic lists unchanged
# (list_to_str is not applied), so the CSV stores them in Python list notation.
# assign() returns a new frame, leaving merged_df itself untouched.
df_to_save = merged_df.assign(
    Gemini_Positive_Topics_Str=merged_df['Gemini_Positive_Topics'],
    Gemini_Negative_Topics_Str=merged_df['Gemini_Negative_Topics'],
)[output_columns]

try:
    df_to_save.to_csv(OUTPUT_CSV_WITH_GEMINI_LABELS, index=False)
    print(f"\nSuccessfully saved Gemini's labels and comparison to '{OUTPUT_CSV_WITH_GEMINI_LABELS}'")
except Exception as e:
    # Best effort: report the failure and still reach the final banner.
    print(f"Error saving output CSV: {e}")

print("\n--- Script Finished ---")

Loaded 64 standardized topics: ['AI for Healthcare', 'Antenna Design', 'Application Development', 'Artificial Intelligence', 'Augmented Reality']...
Processing in 4 batches of size up to 50.

--- Processing Batch 1/4 (50 sentences) ---
Attempt 1/3 for batch 1...
Successfully processed batch 1. Received 50 results.
Waiting 2s before next batch...

--- Processing Batch 2/4 (50 sentences) ---
Attempt 1/3 for batch 2...
Successfully processed batch 2. Received 50 results.
Waiting 2s before next batch...

--- Processing Batch 3/4 (50 sentences) ---
Attempt 1/3 for batch 3...
Successfully processed batch 3. Received 50 results.
Waiting 2s before next batch...

--- Processing Batch 4/4 (30 sentences) ---
Attempt 1/3 for batch 4...
Successfully processed batch 4. Received 30 results.

Successfully parsed all Gemini batch outputs. Total results: 180
  SentenceID                             Gemini_Positive_Topics  \
0          1         [Artificial Intelligence, Neural Networks]   
1          2 

# OPTIMAL MATCHING BEGINS HERE

## Setting up dataframes

In [4]:
import random
import pandas as pd

# Load the supervisor table. Forward slashes are used in every path because
# they work on Windows as well as on Linux/macOS, whereas 'data\\...' only
# resolves on Windows.
supervisors_df = pd.read_csv('data/supervisors_standardised_gemini.csv')

# Update supervisors_df to ensure it has the required columns
if 'supervisor_id' not in supervisors_df.columns:
    supervisors_df['supervisor_id'] = range(1, len(supervisors_df) + 1)

if 'capacity' not in supervisors_df.columns:
    # No capacity supplied: draw one at random between 3 and 10 per supervisor.
    supervisors_df['capacity'] = [random.randint(3, 10) for _ in range(len(supervisors_df))]

# Load the Gemini-labelled student preferences and derive the matching fields
students_df = pd.read_csv('data/gemini_labeled_preferences.csv')
students_df['student_id'] = students_df['SentenceID'].apply(lambda x: f'student_{x}')  # Create unique student IDs
# Each student is given a randomly chosen programme.
students_df['programme'] = students_df['student_id'].apply(
    lambda x: random.choice(['BCS', 'BSE', 'BIT', 'BSDA', 'BCNS'])
)

standardized_topics = pd.read_csv('data/unique_standardised_topics.csv')['standardised Topic'].to_list()

def preprocess_topics_from_list(topic_list_series, all_known_topics):
    """Keep only the recognised topics of each entry.

    Args:
        topic_list_series: Iterable whose items are expected to be lists of
            topic names; any item that is not a list yields an empty list.
        all_known_topics: Container of accepted topic names.

    Returns:
        One list per input item, holding the whitespace-stripped string
        members that appear in ``all_known_topics``, in their original order.
    """
    def _known_only(entry):
        if not isinstance(entry, list):
            return []
        stripped = (str(name).strip() for name in entry if isinstance(name, str))
        return [name for name in stripped if name in all_known_topics]

    return [_known_only(entry) for entry in topic_list_series]

def human_str_to_list(topic_series, all_known_topics):
    """Parse human-annotated topic cells into lists of recognised topics.

    Each cell may be a real list, the string form of a Python list
    (``"['A', 'B']"``), a ``;``-separated string, or empty/"none".

    Args:
        topic_series: pandas Series of raw topic cells (missing values allowed).
        all_known_topics: Container of accepted topic names.

    Returns:
        One list per cell containing only the topics found in
        ``all_known_topics``.
    """
    import ast  # local import so the function does not rely on a file-level one

    processed_list = []
    for item in topic_series.fillna(""):
        # If already a list, use it directly
        if isinstance(item, list):
            topics = [str(t).strip() for t in item if str(t).strip()]
        # If it's a string representation of a list, parse it as a literal.
        # ast.literal_eval only accepts Python literals, so text coming from a
        # CSV cell can never execute code (the previous eval() could).
        elif isinstance(item, str) and item.strip().startswith("[") and item.strip().endswith("]"):
            try:
                parsed = ast.literal_eval(item.strip())
                if isinstance(parsed, list):
                    topics = [str(t).strip() for t in parsed if str(t).strip()]
                else:
                    topics = [str(parsed).strip()] if str(parsed).strip() else []
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a valid literal: keep the raw text as a single candidate.
                topics = [item.strip()] if item.strip() else []
        elif pd.isna(item) or str(item).lower() == "none" or str(item).strip() == "":
            topics = []
        else:
            topics = [t.strip() for t in str(item).split(';') if t.strip()]
        valid_topics = [t for t in topics if t in all_known_topics]
        processed_list.append(valid_topics)
    return processed_list

def safe_list(val):
    """Coerce a topic cell into a list of topic strings.

    Args:
        val: A list (returned unchanged), a missing value (NaN/None/float),
            the string form of a Python list, or a comma- or
            semicolon-separated string.

    Returns:
        A list of stripped, non-empty topic strings; ``[]`` for missing or
        unsupported values.
    """
    import ast  # local import so the function does not rely on a file-level one

    if isinstance(val, list):
        return val
    if isinstance(val, float) or pd.isna(val):
        return []
    if isinstance(val, str):
        text = val.strip()
        try:
            # Stringified list: parse as a literal only. ast.literal_eval cannot
            # execute code, unlike eval(), so CSV content stays inert.
            if text.startswith("[") and text.endswith("]"):
                parsed = ast.literal_eval(text)
                if isinstance(parsed, list):
                    return [str(t).strip() for t in parsed if str(t).strip()]
            # Otherwise, split by comma or semicolon
            return [t.strip() for t in val.replace(';', ',').split(',') if t.strip()]
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Looked like a list but is not a valid literal: keep the raw text.
            return [text] if text else []
    return []

# Show the first rows of each input table under its heading
for heading, frame in (
    ("Students Dataset:", students_df),
    ("\nSupervisors Dataset:", supervisors_df),
):
    print(heading)
    print(frame.head())

Students Dataset:
   SentenceID                                       SentenceText  \
0           1  I'm really passionate about developing intelli...   
1           2  For my capstone project, I'm drawn to creating...   
2           3  I'm eager to work on sustainable computing sol...   
3           4  My interests lie in developing mobile apps and...   
4           5  I'm fascinated by computer vision applications...   

                               Human_Positive_Topics  \
0     ['Artificial Intelligence', 'Neural Networks']   
1  ['Virtual Reality', 'Augmented Reality', 'Exte...   
2              ['Green Computing', 'Sustainability']   
3  ['Mobile Application Development', 'UI/UX Desi...   
4            ['Computer Vision', 'Image Processing']   

                               Human_Negative_Topics  \
0     ['Distributed Systems', 'Computer Networking']   
1         ['Fiber Optic Sensor', 'Photonic Devices']   
2  ['Blockchain', 'Cybersecurity', 'Information S...   
3  ['Big Dat

## Linear Programming

In [12]:
from pulp import LpProblem, LpVariable, LpMaximize, lpSum, LpBinary

# Score contribution of each programme-preference outcome. Shared by the
# objective function and the reported match_score so the two cannot drift apart.
_PROGRAMME_MATCH_SCORES = {"First Choice": 10, "Second Choice": 5, "No Match": 0}

# (label, supervisor column) pairs, checked in priority order.
_PROGRAMME_PREFERENCE_COLUMNS = (
    ("First Choice", 'Preferred Programme for Supervision (1st Choice)'),
    ("Second Choice", 'Preferred Programme for Supervision (2nd Choice)'),
)


def _programme_match_type(student, supervisor):
    """Return "First Choice", "Second Choice" or "No Match" for one student/supervisor pair."""
    programme = student.get('programme', '')
    for label, column in _PROGRAMME_PREFERENCE_COLUMNS:
        preference = supervisor[column]
        # A blank CSV cell is read as NaN (a float); skip it instead of letting
        # the substring test below raise TypeError.
        if not isinstance(preference, str):
            continue
        if programme in preference or "No Preference" in preference:
            return label
    return "No Match"


def optimal_matching(students_df, supervisors_df, balancing_penalty_weight=0.5):
    """Assign every student to one supervisor by solving a binary linear programme.

    Each student/supervisor pair is scored as: 10 or 5 points for a first- or
    second-choice programme match, +2 per student positive topic the supervisor
    covers, -1 per student negative topic the supervisor covers. The objective
    maximises the total score minus ``balancing_penalty_weight`` times the sum
    of every supervisor's absolute deviation from the average load.

    Constraints: each student gets exactly one supervisor, and no supervisor
    exceeds their ``capacity`` (5 when the column is absent).

    Side effects: prints the assignments and summary statistics, writes the
    assignments to ``results/OMA/optimal_student_supervisor_assignments_<weight>.csv``
    and appends a summary row to ``results/OMA/supervisor_assignments_statistics.csv``.

    Args:
        students_df: DataFrame with ``student_id``, ``Gemini_Positive_Topics_Str``,
            ``Gemini_Negative_Topics_Str`` and optionally ``programme``.
        supervisors_df: DataFrame with ``supervisor_id``, ``Name``,
            ``standardised Topics``, the two "Preferred Programme for
            Supervision" columns and optionally ``capacity``.
        balancing_penalty_weight: Weight of the load-balancing penalty.

    Returns:
        None.
    """
    import os
    from pulp import LpStatus

    # Materialise the rows once; iterrows() is slow and was previously re-run
    # for every constraint and every objective term.
    students = [row for _, row in students_df.iterrows()]
    supervisors = [row for _, row in supervisors_df.iterrows()]
    supervisor_ids = [supervisor['supervisor_id'] for supervisor in supervisors]

    # Parse every topic cell exactly once instead of once per pair and topic.
    positive_topics = {s['student_id']: safe_list(s['Gemini_Positive_Topics_Str']) for s in students}
    negative_topics = {s['student_id']: safe_list(s['Gemini_Negative_Topics_Str']) for s in students}
    supervisor_topics = {
        supervisor['supervisor_id']: set(safe_list(supervisor['standardised Topics']))
        for supervisor in supervisors
    }

    # Score every pair once; reused for the objective and for the result rows.
    pair_details = {}
    for student in students:
        student_id = student['student_id']
        for supervisor in supervisors:
            supervisor_id = supervisor['supervisor_id']
            expertise = supervisor_topics[supervisor_id]
            programme_match_type = _programme_match_type(student, supervisor)
            matching_topics = [t for t in positive_topics[student_id] if t in expertise]
            conflicting_topics = [t for t in negative_topics[student_id] if t in expertise]
            pair_details[(student_id, supervisor_id)] = {
                'programme_match': programme_match_type,
                'matching_topics': matching_topics,
                'conflicting_topics': conflicting_topics,
                'match_score': (
                    _PROGRAMME_MATCH_SCORES[programme_match_type]
                    + 2 * len(matching_topics)
                    - len(conflicting_topics)
                ),
            }

    # Create the optimization problem
    problem = LpProblem("Optimal_Matching", LpMaximize)

    # One binary decision variable per student-supervisor pair
    decision_vars = {
        (student_id, supervisor_id): LpVariable(f"x_{student_id}_{supervisor_id}", 0, 1, LpBinary)
        for (student_id, supervisor_id) in pair_details
    }

    # --- Soft Balancing Setup ---
    num_students_total = len(students)
    num_supervisors_total = len(supervisors)

    if num_supervisors_total == 0: # Avoid division by zero
        target_load_per_supervisor = 0
    else:
        target_load_per_supervisor = num_students_total / num_supervisors_total

    print(f"Target load per supervisor (for soft balancing): {target_load_per_supervisor:.2f}")

    # Auxiliary variables for deviation from target load
    supervisor_over_target = LpVariable.dicts(
        "SupervisorOverTarget", supervisor_ids, lowBound=0, cat='Continuous'
    )
    supervisor_under_target = LpVariable.dicts(
        "SupervisorUnderTarget", supervisor_ids, lowBound=0, cat='Continuous'
    )

    # Constraints linking actual load to deviation variables:
    # load - target == over - under, with over, under >= 0.
    for supervisor_id in supervisor_ids:
        actual_load_expr = lpSum(
            decision_vars[(student['student_id'], supervisor_id)] for student in students
        )
        problem += (
            actual_load_expr - target_load_per_supervisor ==
            supervisor_over_target[supervisor_id] - supervisor_under_target[supervisor_id],
            f"Define_Deviation_Supervisor_{supervisor_id}"
        )

    # Objective: total match score minus the balancing penalty. The penalty
    # covers deviation in BOTH directions (over- and under-loaded supervisors).
    problem += (
        lpSum(decision_vars[pair] * details['match_score'] for pair, details in pair_details.items())
        - balancing_penalty_weight * lpSum(
            supervisor_over_target[supervisor_id] + supervisor_under_target[supervisor_id]
            for supervisor_id in supervisor_ids
        )
    )

    # Constraint: Each student is assigned to exactly one supervisor
    for student in students:
        problem += lpSum(
            decision_vars[(student['student_id'], supervisor_id)]
            for supervisor_id in supervisor_ids
        ) == 1

    # Constraint: Each supervisor does not exceed their capacity
    for supervisor in supervisors:
        capacity = supervisor.get('capacity', 5)  # Default capacity of 5
        problem += lpSum(
            decision_vars[(student['student_id'], supervisor['supervisor_id'])]
            for student in students
        ) <= capacity

    # Solve the problem
    problem.solve()
    solver_status = LpStatus[problem.status]
    if solver_status != "Optimal":
        print(f"\nWarning: solver finished with status '{solver_status}'; results may be incomplete.")

    # Extract results with detailed matching information
    assignments = []
    for student in students:
        student_id = student['student_id']
        for supervisor in supervisors:
            supervisor_id = supervisor['supervisor_id']
            chosen = decision_vars[(student_id, supervisor_id)].value()
            # Solver values are floats (and None when unset), so use a threshold
            # rather than testing for exact equality with 1.
            if chosen is None or chosen <= 0.5:
                continue
            details = pair_details[(student_id, supervisor_id)]
            assignments.append({
                'student_id': student_id,
                'supervisor_id': supervisor_id,
                'supervisor_name': supervisor['Name'],
                'programme_match': details['programme_match'],
                'matching_topics': details['matching_topics'],
                'conflicting_topics': details['conflicting_topics'],
                'match_score': details['match_score'],
            })

    if not assignments:
        # E.g. infeasible because total capacity is below the student count.
        # The statistics below need the assignment columns, so stop here.
        print(f"\nNo assignments were produced (solver status: {solver_status}). Nothing to report or save.")
        return

    # Convert assignments to DataFrame for better display
    assignments_df = pd.DataFrame(assignments)
    print("\nOptimal Assignments:")
    print(assignments_df)

    # Calculate and display statistics
    print("\nAssignment Statistics:")
    print(f"Total assignments: {len(assignments)}")
    print("\nProgramme Matching Distribution:")
    print(assignments_df['programme_match'].value_counts())
    print(f"\nAverage matching topics: {assignments_df['matching_topics'].apply(len).mean():.2f}")
    print(f"Average conflicting topics: {assignments_df['conflicting_topics'].apply(len).mean():.2f}")
    print(f"Average match score: {assignments_df['match_score'].mean():.2f}")
    print(f"Standard Deviation of match scores: {assignments_df['match_score'].std():.2f}")

    # Save the results to a CSV file. Forward slashes work on every platform
    # and match the statistics path below; create the folder if it is missing.
    output_dir = "results/OMA"
    os.makedirs(output_dir, exist_ok=True)
    assignments_output_path = f"{output_dir}/optimal_student_supervisor_assignments_{balancing_penalty_weight}.csv"
    assignments_df.to_csv(assignments_output_path, index=False)
    print(f"\nOptimal assignments saved to: {assignments_output_path}")

    # Analyse how many students were assigned to each supervisor
    # (supervisors with no students do not appear in this table).
    supervisor_assignments = assignments_df.groupby('supervisor_id').size().reset_index(name='assigned_students_count')
    supervisor_assignments = supervisor_assignments.merge(supervisors_df[['supervisor_id', 'Name']], on='supervisor_id', how='left')
    # Average number of students assigned per supervisor
    average_students_per_supervisor = supervisor_assignments['assigned_students_count'].mean()
    print(f"\nAverage number of students assigned per supervisor: {average_students_per_supervisor:.2f}")
    print(f"Standard Deviation of students assigned per supervisor: {supervisor_assignments['assigned_students_count'].std():.2f}")
    print("\nSupervisor Assignments Count:")
    print(supervisor_assignments)

    match_counts = assignments_df['programme_match'].value_counts()

    # Save statistics to a CSV file
    stats_file = f"{output_dir}/supervisor_assignments_statistics.csv"
    # Use the average match score, standard deviation, and average students per supervisor
    new_row = pd.DataFrame({
        'Balancing Penalty Weight': [balancing_penalty_weight],
        'Total Assignments': [len(assignments)],
        'Total Match Score': [assignments_df['match_score'].sum()],
        'Average Match Score': [assignments_df['match_score'].mean()],
        'Standard Deviation of Match Scores': [assignments_df['match_score'].std()],
        'Average Students per Supervisor': [average_students_per_supervisor],
        'Standard Deviation of Students per Supervisor': [supervisor_assignments['assigned_students_count'].std()],
        'First Choice': [match_counts.get("First Choice", 0)],
        'Second Choice': [match_counts.get("Second Choice", 0)],
        'No Match': [match_counts.get("No Match", 0)],
    })
    # Append this run to the statistics file, creating it on the first run
    try:
        stats_df = pd.read_csv(stats_file)
        stats_df = pd.concat([stats_df, new_row], ignore_index=True)
    except FileNotFoundError:
        stats_df = new_row

    stats_df.to_csv(stats_file, index=False)
    print(f"\nSupervisor assignments statistics saved to: {stats_file}")
    
# Sweep the balancing penalty over five orders of magnitude: 10, 100, ..., 100000
balancing_penalty_weight = [10 ** power for power in range(1, 6)]
for weight in balancing_penalty_weight:
    print(f"\nRunning optimal matching with balancing penalty weight: {weight}")
    optimal_matching(students_df, supervisors_df, weight)



Running optimal matching with balancing penalty weight: 10
Target load per supervisor (for soft balancing): 3.40

Optimal Assignments:
      student_id  supervisor_id                            supervisor_name  \
0      student_1             27                   Dr Brandon Khoo Boo Guan   
1      student_2             16                    Nurul Aiman Abdul Rahim   
2      student_3             39                         Prof. Lau Sian Lun   
3      student_4              8                     Dr Faris Syahmi Samidi   
4      student_5             27                   Dr Brandon Khoo Boo Guan   
..           ...            ...                                        ...   
175  student_176             21  Assoc. Prof. Dr Sami Salama Hussen Hajjaj   
176  student_177             40                      Prof. Serge Demidenko   
177  student_178             24                         Dr Aaliya Sarfaraz   
178  student_179             25                 Dr Ahmad Sahban Rafsanjani   
179  s