# USE GEMINI TO LABEL STUDENT PREFERENCES ACCORDING TO TOPICS

## Import supervisor list and remove duplicate topics

In [None]:
import csv
import pandas as pd

def import_supervisors(file_path):
    """Read a supervisors CSV file into a list of row dictionaries.

    Args:
        file_path: Path of the CSV file; its first row is used as the header.

    Returns:
        list[dict]: One dict per data row, keyed by column name. The list is
        empty when the file is missing and holds only the rows read so far
        when another error interrupts reading; in both cases a message is
        printed instead of an exception being raised.
    """
    supervisors = []
    try:
        # newline='' lets the csv module do its own newline handling, so line
        # breaks embedded in quoted fields are preserved exactly as written.
        with open(file_path, mode='r', encoding='utf-8', newline='') as file:
            supervisors.extend(csv.DictReader(file))
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
    return supervisors


def supervisors_to_dataframe(supervisors_csv):
    """Wrap the imported supervisor rows in a pandas DataFrame.

    Args:
        supervisors_csv: Data accepted by the ``pd.DataFrame`` constructor,
            e.g. the list of row dicts produced when importing the CSV.

    Returns:
        The resulting DataFrame, or None (after printing the error) when the
        constructor raises.
    """
    try:
        frame = pd.DataFrame(supervisors_csv)
    except Exception as e:
        print(f"An error occurred while converting to DataFrame: {e}")
        return None
    return frame
    

def combine_expertise_topics(row, expertise_columns):
    """Merge the topics of several expertise columns into one string.

    Each cell may be a list of topics, the string form of a Python list
    (e.g. "['AI', 'IoT']"), or a plain topic string. Missing cells (None,
    NaN, pd.NA, empty text) are skipped.

    Args:
        row: Mapping-like object (dict or DataFrame row) indexed by column name.
        expertise_columns: Names of the columns to read from ``row``.

    Returns:
        str: The unique, stripped topics in first-seen order, joined by ', '.
    """
    from ast import literal_eval

    all_topics = []
    for col in expertise_columns:
        value = row[col]
        if isinstance(value, (list, tuple)):
            topics = value
        elif value is None or (not isinstance(value, str) and pd.isna(value)):
            continue
        else:
            text = str(value).strip()
            if not text:
                continue
            # literal_eval only accepts Python literals, so cell text can
            # never execute code the way eval() would.
            try:
                parsed = literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                parsed = text
            # Anything that is not a list literal is kept as one raw topic.
            topics = parsed if isinstance(parsed, (list, tuple)) else [text]
        all_topics.extend(
            str(t).strip() for t in topics if t is not None and str(t).strip()
        )
    # dict preserves insertion order, so this de-duplicates without reordering.
    unique_topics = list(dict.fromkeys(all_topics))
    return ', '.join(unique_topics)

# Generate supervisor ID and a randomised capacity
# Add a 'topics' column that is based on the 'Expertise Area 1', 'Expertise Area 2', and 'Expertise Area 3' columns
def generate_supervisor_data(supervisors_df):
    """Add id, capacity and combined-topic columns to the supervisors table.

    The DataFrame is modified in place: 'supervisor_id' numbers the rows from
    1, 'capacity' is a random integer between 3 and 10 per row, and 'topics'
    combines the three 'Expertise Area' columns.

    Args:
        supervisors_df: DataFrame of supervisors, or None.

    Returns:
        The same DataFrame with the new columns, or None when there is no
        data to process.
    """
    import random

    if supervisors_df is None or supervisors_df.empty:
        print("No data to process.")
        return None

    row_count = len(supervisors_df)
    supervisors_df['supervisor_id'] = range(1, row_count + 1)
    supervisors_df['capacity'] = [random.randint(3, 10) for _ in range(row_count)]

    expertise_columns = ['Expertise Area 1', 'Expertise Area 2', 'Expertise Area 3']

    def _topics_for(row):
        # Row-wise helper so apply() receives a named callable.
        return combine_expertise_topics(row, expertise_columns)

    supervisors_df['topics'] = supervisors_df.apply(_topics_for, axis=1)
    return supervisors_df


## Use Gemini to standardise supervisor topics for easier labeling

In [None]:
import google.generativeai as genai
import json
import os
from IPython.display import display, Markdown # For better display in notebooks
from ast import literal_eval

# SECURITY: the API key must not be hard-coded in the notebook. Export
# GOOGLE_API_KEY in the environment that launches the kernel instead. Any key
# that was previously committed to source should be treated as leaked and
# revoked.

# --- Configuration ---
try:
    # Configure from the environment variable; an empty value counts as unset.
    if not os.environ.get("GOOGLE_API_KEY"):
        print("Warning: GOOGLE_API_KEY environment variable not set.")
    genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
except Exception as e:
    print(f"Error configuring Gemini API: {e}")
    print("Please ensure your GOOGLE_API_KEY is correctly set.")
    exit(1)

# `model` stays None when initialisation fails; code that uses it checks for
# that before calling the API.
try:
    model = genai.GenerativeModel('gemini-2.0-flash')
except Exception as e:
    print(f"Error initializing Gemini model: {e}")
    model = None


# --- Helper Functions ---
def extract_unique_expertise_terms(df, expertise_cols):
    """Collect every unique, non-empty expertise term from the given columns.

    A cell may be a list of terms, the string form of a Python list, or a
    single term. Missing cells are ignored, and a column that does not exist
    in ``df`` only produces a warning.

    Args:
        df: DataFrame holding the expertise columns.
        expertise_cols: Names of the columns to scan.

    Returns:
        list[str]: The stripped unique terms, sorted alphabetically.
    """
    def _clean(values):
        # str() first so non-string list items cannot break .strip().
        return [str(v).strip() for v in values if v and str(v).strip()]

    all_terms = set()
    for col in expertise_cols:
        if col not in df.columns:
            print(f"Warning: Column '{col}' not found in DataFrame.")
            continue
        for item in df[col].dropna():
            parsed = item
            if isinstance(item, str) and item.strip().startswith("["):
                # literal_eval parses list literals without executing code;
                # text that is not a valid literal is kept as a single term.
                try:
                    parsed = literal_eval(item.strip())
                except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                    parsed = item
            if isinstance(parsed, (list, tuple)):
                all_terms.update(_clean(parsed))
            elif str(parsed).strip():
                all_terms.add(str(parsed).strip())
    return sorted(all_terms)

def get_standardisation_map_from_gemini(unique_terms_list):
    """Ask Gemini for a mapping from raw expertise terms to umbrella terms.

    Uses the module-level ``model``. Terms that Gemini leaves out of its
    answer are added to the result mapped to themselves.

    Args:
        unique_terms_list: The unique expertise terms to standardise.

    Returns:
        dict mapping each original term to its standardised term; an empty
        dict when no terms were given; None when the model is unavailable,
        the API call fails, or the reply is not a JSON object.
    """
    if not model:
        print("Gemini model not initialized. Cannot proceed.")
        return None
    if not unique_terms_list:
        print("No unique terms provided to standardise.")
        return {}

    prompt = f"""
        You are an expert academic research field categorizer and data normalizer.
        I have a list of expertise areas extracted from a dataset of supervisors.
        Many of these terms are variations of the same concept (e.g., "IoT", "Internet of Things", "Industrial IoT")
        or very closely related.

        Your task is to analyze the following list of unique expertise terms and create a JSON object
        that maps each original term to a single, consistent, standardised "umbrella" term.
        Your aim is to reduce redundancy and ensure that similar or synonymous terms are grouped under 
        a single standardised term to be used for labeling and categorization of student's preferences in a university database.

        Guidelines:
        1. The standardised term should be a concise and commonly understood representation of the concept.
        2. If an original term is already a good standard, it can map to itself.
        3. Group synonymous or similar terms under ONE standardised term. For example, if "Machine Learning", "ML", and "Deep Learning" are present, they might all map to "Machine Learning" or you might decide "Deep Learning" should map to "Deep Learning" if it's distinct enough, while "ML" maps to "Machine Learning". Use your best judgment to create meaningful umbrella terms.
        4. The output MUST be a single JSON object where keys are the *original* expertise terms from the input list, and values are their corresponding *standardised* umbrella terms. Every term from the input list must be a key in the output JSON.
        5. Do not include any explanatory text outside the JSON object. Just the JSON.

        List of unique expertise terms:
        {json.dumps(unique_terms_list)}

        Please provide the JSON mapping:
        """

    print("Sending request to Gemini API...")
    try:
        response = model.generate_content(prompt)
        # The reply may arrive wrapped in a markdown code fence; peel it off.
        payload = response.text.strip()
        payload = payload.removeprefix("```json").removeprefix("```")
        payload = payload.removesuffix("```").strip()

        try:
            mapping = json.loads(payload)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from Gemini: {e}")
            print("Raw response text from Gemini:")
            print(response.text) # print the raw response for debugging
            return None

        # Only a JSON object can serve as the term -> term mapping.
        if not isinstance(mapping, dict):
            print("Error: Gemini did not return a valid JSON dictionary.")
            print("Raw response:", response.text)
            return None

        # Back-fill any term Gemini skipped so every input term has an entry.
        absent = [term for term in unique_terms_list if term not in mapping]
        if absent:
            print(f"Warning: Gemini's map is missing keys for: {absent}")
            for term in absent:
                mapping[term] = term
        return mapping
    except Exception as e:
        print(f"Error calling Gemini API: {e}")
        if hasattr(e, 'response') and e.response: # More detailed error if available
            print(f"Gemini API Error Details: {e.response}")
        return None

# --- Main Processing ---

def _parse_expertise_cell(cell):
    """Convert one raw expertise cell into a list of topics.

    Lists pass through unchanged, missing or blank cells become [], text that
    is a Python list literal is parsed, and any other text becomes a
    one-element list.
    """
    if isinstance(cell, list):
        return cell
    if pd.isna(cell):
        return []
    text = str(cell).strip()
    if not text:
        return []
    try:
        parsed = literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return [text]
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return [parsed] if isinstance(parsed, str) else [text]


def _standardise_cell(topics, mapping):
    """Map every topic of one cell through ``mapping``; unknown topics are kept."""
    if isinstance(topics, list):
        return [mapping.get(str(t).strip(), str(t).strip()) for t in topics]
    if pd.notna(topics) and str(topics).strip():
        # Keys of the map are stripped terms, so look up the stripped text.
        key = str(topics).strip()
        return [mapping.get(key, key)]
    return []


# 1. Load CSV
csv_file_path = 'data\\supervisors_list.csv' # <--- CHANGE FILENAME
expertise_columns = ['Expertise Area 1', 'Expertise Area 2', 'Expertise Area 3']

try:
    supervisors_df = pd.read_csv(csv_file_path)
    # Build a new list rather than removing entries from the list being
    # iterated, which would silently skip the column after each removal.
    present_columns = []
    for col in expertise_columns:
        if col not in supervisors_df.columns:
            print(f"Warning: Column '{col}' not found in CSV. Skipping standardisation for this column.")
        else:
            # Ensure expertise columns are treated as lists (empty cells -> []).
            supervisors_df[col] = supervisors_df[col].apply(_parse_expertise_cell)
            present_columns.append(col)
    expertise_columns = present_columns
except FileNotFoundError:
    print(f"Error: '{csv_file_path}' not found. Using dummy data for demonstration.")
    data = {
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Department': ['CS', 'CS', 'AI', 'CS', 'EE'],
        'Preferred Programme for Supervision (1st Choice)': ['PhD CS', 'MSc AI', 'PhD AI', 'MSc DS', 'PhD EE'],
        'Preferred Programme for Supervision (2nd Choice)': ['MSc AI', 'PhD CS', 'MSc DS', 'PhD CS', 'MSc CS'],
        'Expertise Area 1': ['Machine Learning', 'Software Architecture', 'Natural Language Processing', 'Data Mining', 'IoT'],
        'Expertise Area 2': ['Deep Learning', 'Agile Development', pd.NA, 'Big Data Analytics', 'Internet of Things'],
        'Expertise Area 3': ['Computer Vision', pd.NA, 'Ethics in AI', 'Cloud Computing', 'Industrial IoT']
    }
    supervisors_df = pd.DataFrame(data)

print("Original DataFrame sample:")
display(supervisors_df.head())

# 2. Extract All Unique Expertise Terms
unique_terms = extract_unique_expertise_terms(supervisors_df, expertise_columns)
if not unique_terms:
    print("No expertise terms found to process. Exiting.")
    exit()
else:
    print(f"\nFound {len(unique_terms)} unique expertise terms to standardise:")
    print(unique_terms)

    # 3. Get standardisation Map from Gemini (only if model initialized and terms exist)
    standardisation_dictionary = None
    if model and unique_terms:
        standardisation_dictionary = get_standardisation_map_from_gemini(unique_terms)

    if standardisation_dictionary:
        print("\n--- standardisation Map from Gemini (Review this carefully!) ---")
        # Pretty print the dictionary for review
        display(Markdown("```json\n" + json.dumps(standardisation_dictionary, indent=2) + "\n```"))

        # --- Maybe include manual review here? ---

        # 4. Apply Mapping to Create standardised Expertise Columns
        print("\nApplying standardisation map to DataFrame...")
        for i, col_name in enumerate(expertise_columns):
            if col_name in supervisors_df.columns:
                standardised_col_name = f'standardised Expertise {i+1}'
                supervisors_df[standardised_col_name] = supervisors_df[col_name].apply(
                    lambda topics: _standardise_cell(topics, standardisation_dictionary)
                )
                print(supervisors_df[standardised_col_name])
            else:
                print(f"Skipping standardisation for non-existent column: {col_name}")


        # 5. Combine standardised Expertise into a single column
        standardised_expertise_cols = [f'standardised Expertise {i+1}' for i in range(len(expertise_columns)) if f'standardised Expertise {i+1}' in supervisors_df.columns]

        if standardised_expertise_cols: # only proceed if standardised columns were created
            supervisors_df['standardised Topics'] = supervisors_df.apply(
                lambda x: combine_expertise_topics(x, standardised_expertise_cols),
                axis=1
            )

            print("\nDataFrame with standardised Expertise:")
            display(supervisors_df[['Name'] + expertise_columns + standardised_expertise_cols + ['standardised Topics']].head())

            # 6. Save Outputs
            # Save the standardisation map to a JSON file
            map_output_path = 'data\\gemini_standardisation_map.json'
            with open(map_output_path, 'w') as f:
                json.dump(standardisation_dictionary, f, indent=4)
            print(f"\nstandardisation map saved to: {map_output_path}")

            # Save the augmented DataFrame to CSV
            csv_output_path = 'data\\supervisors_standardised_gemini.csv'
            supervisors_df.to_csv(csv_output_path, index=False)
            print(f"Augmented DataFrame saved to CSV: {csv_output_path}")

            # Example: Further manipulation - unique standardised topics
            if 'standardised Topics' in supervisors_df.columns:
                unique_standardised_topics_list = supervisors_df['standardised Topics'].str.split(', ').explode().str.strip()
                unique_standardised_topics_list = unique_standardised_topics_list[unique_standardised_topics_list != ''].unique()
                print("\nUnique individual standardised topic terms found across all supervisors:")
                print(sorted(list(unique_standardised_topics_list)))

                # Save unique standardised topics to CSV
                unique_topics_df = pd.DataFrame({'standardised Topic': sorted(list(unique_standardised_topics_list))})
                unique_topics_df.to_csv('data\\unique_standardised_topics.csv', index=False)
                print("Unique standardised topics saved to: data\\unique_standardised_topics.csv")
        else:
            print("\nNo standardised expertise columns were created. Skipping combination and saving of DataFrame.")

    else:
        print("\nFailed to get standardisation map from Gemini. No changes applied to DataFrame.")

## Test labeling accuracy

In [13]:
import google.generativeai as genai
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
import os
import io
import json
import time
import math

# --- Configuration ---
GEMINI_MODEL_NAME = "gemini-2.0-flash"
STUDENT_PREFERENCES_CSV = "data\\cgpt_sentences.csv"
STANDARDIZED_TOPICS_CSV = "data\\unique_standardised_topics.csv"
OUTPUT_CSV_WITH_GEMINI_LABELS = "data\\gemini_labeled_preferences.csv"
API_RETRY_LIMIT = 3            # attempts per batch before the batch is skipped
API_RETRY_DELAY_SECONDS = 5    # pause between attempts for the same batch
BATCH_SIZE = 50                # sentences sent to Gemini per request
DELAY_BETWEEN_BATCHES_SECONDS = 2
# SECURITY: the API key must not be hard-coded here. Export GOOGLE_API_KEY in
# the environment that launches the kernel. Any key that was previously
# committed to source should be treated as leaked and revoked.

# --- 1. Configure Gemini API ---
try:
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY environment variable not set.")
    genai.configure(api_key=api_key)
except Exception as e:
    print(f"Error configuring Gemini API: {e}")
    exit()

# --- 2. Load Data ---
# Both CSVs are required; a missing or empty file stops the run.
try:
    student_df_full = pd.read_csv(STUDENT_PREFERENCES_CSV)
    topics_df = pd.read_csv(STANDARDIZED_TOPICS_CSV)
except FileNotFoundError as e:
    print(f"Error: {e}. Make sure your CSV files are in the correct path.")
    exit()
except pd.errors.EmptyDataError as e:
    print(f"Error: {e}. One of your CSV files might be empty.")
    exit()


# The student file must provide the sentence text and both human label columns.
required_cols = ['Entry', 'Positive_Topics', 'Negative_Topics']
if any(col not in student_df_full.columns for col in required_cols):
    print(f"Error: {STUDENT_PREFERENCES_CSV} must contain columns: {', '.join(required_cols)}")
    exit()

# Rename to the column names used by the rest of this cell.
student_df_full = student_df_full.rename(columns={
    'Entry': 'SentenceText',
    'Positive_Topics': 'Human_Positive_Topics',
    'Negative_Topics': 'Human_Negative_Topics',
})

# Fall back to a 1-based row number when the file carries no sentence ID.
if 'SentenceID' not in student_df_full.columns:
    student_df_full['SentenceID'] = student_df_full.index + 1

# The topic vocabulary is taken from the first column of the topics file.
has_topics = not topics_df.empty and not topics_df.columns.empty
if not has_topics:
    print(f"Error: {STANDARDIZED_TOPICS_CSV} is empty or has no columns. It should have one column with topics.")
    exit()
first_topic_column = topics_df.iloc[:, 0].astype(str).str.strip()
standardized_topic_list = first_topic_column.unique().tolist()
print(f"Loaded {len(standardized_topic_list)} standardized topics: {standardized_topic_list[:5]}...") # Print first 5

# --- 3. Helper function to create prompt for a batch ---
def create_prompt_for_batch(batch_sentences_list, all_standardized_topics):
    """Build the Gemini labelling prompt for one batch of sentences.

    Args:
        batch_sentences_list: JSON-serialisable list of sentence objects; it
            is embedded in the prompt as indented JSON.
        all_standardized_topics: Topic names; embedded as a comma-separated
            list that the model is told to choose from.

    Returns:
        str: The complete prompt text.
    """
    topics_line = ', '.join(all_standardized_topics)
    sentences_block = json.dumps(batch_sentences_list, indent=2)
    return f"""
You are an expert AI assistant specialized in classifying student project preferences.
Your task is to label a list of student preference sentences with relevant project topics, both positive and negative.
You MUST use ONLY the topics from the provided standardized list.

Standardized Topics List:
{topics_line}

Input Sentences for this batch (as a JSON array of objects):
{sentences_block}

Instructions:
1.  For each sentence object in the input JSON array, analyze the "SentenceText".
2.  Identify topics the student expresses a POSITIVE preference for.
3.  Identify topics the student expresses a NEGATIVE preference for.
4.  Topics MUST be chosen EXACTLY from the 'Standardized Topics List' above. Do not invent new topics or use variations. IF you are not completely certain, label it as 'No Match'.
5.  Your output MUST be a valid JSON array of objects.
6.  Each object in your output array should correspond to an input sentence and have the following keys:
    *   "SentenceID": (string) The ID from the input sentence object.
    *   "Gemini_Positive_Topics": (array of strings) A list of positive topics. If no positive topics, label it as 'No Match'.
    *   "Gemini_Negative_Topics": (array of strings) A list of negative topics. If no negative topics, label it as 'No Match'.
7.  Ensure every SentenceID from the input batch is present in your output JSON array.
8.  Do NOT include the original 'SentenceText' in your output JSON, only the specified keys.

Example of expected output JSON format:
[
  {{
    "SentenceID": "S001",
    "Gemini_Positive_Topics": ["Machine Learning", "Artificial Intelligence"],
    "Gemini_Negative_Topics": ["Web Development"]
  }},
  {{
    "SentenceID": "S002",
    "Gemini_Positive_Topics": ["Data Science"],
    "Gemini_Negative_Topics": ["No Match"]
  }}
]

Begin your JSON output now (ensure it's a single, valid JSON array for this batch):
"""

# --- 4. Process Sentences in Batches ---
# Each batch is sent to Gemini up to API_RETRY_LIMIT times. A batch that
# still fails is skipped; its sentences simply have no Gemini labels.
model = genai.GenerativeModel(GEMINI_MODEL_NAME)
all_gemini_results = [] # To store results from all batches

num_batches = math.ceil(len(student_df_full) / BATCH_SIZE)
print(f"Processing in {num_batches} batches of size up to {BATCH_SIZE}.")

for i in range(num_batches):
    start_index = i * BATCH_SIZE
    end_index = start_index + BATCH_SIZE
    batch_df = student_df_full.iloc[start_index:end_index]

    print(f"\n--- Processing Batch {i+1}/{num_batches} ({len(batch_df)} sentences) ---")

    if batch_df.empty:
        print("Batch is empty, skipping.")
        continue

    # Prepare list of sentences for the current batch's prompt
    batch_sentences_to_label_list = []
    for _, row in batch_df.iterrows():
        batch_sentences_to_label_list.append({
            "SentenceID": str(row['SentenceID']),
            "SentenceText": row['SentenceText']
        })

    batch_prompt = create_prompt_for_batch(batch_sentences_to_label_list, standardized_topic_list)

    gemini_output_json_str = None
    current_batch_results = None

    for attempt in range(API_RETRY_LIMIT):
        # Reset per attempt so the generic handler below never reads an
        # unbound name (first call raised) or a response left over from an
        # earlier attempt or batch.
        response = None
        try:
            print(f"Attempt {attempt + 1}/{API_RETRY_LIMIT} for batch {i+1}...")
            response = model.generate_content(
                batch_prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=0.1
                )
            )
            if not response.parts:
                if response.prompt_feedback and response.prompt_feedback.block_reason:
                    print(f"Warning: Prompt for batch {i+1} was blocked. Reason: {response.prompt_feedback.block_reason}")
                else:
                    print(f"Warning: Gemini response for batch {i+1} has no parts.")
                if attempt < API_RETRY_LIMIT - 1:
                    print(f"Retrying batch {i+1} in {API_RETRY_DELAY_SECONDS} seconds...")
                    time.sleep(API_RETRY_DELAY_SECONDS)
                    continue
                else:
                    print(f"Max retries reached for problematic response for batch {i+1}. Skipping this batch.")
                    break # Break from retry loop for this batch

            gemini_output_json_str = response.text.strip()

            # Strip a surrounding markdown code fence if present.
            if gemini_output_json_str.startswith("```json"):
                gemini_output_json_str = gemini_output_json_str[len("```json"):].strip()
            if gemini_output_json_str.endswith("```"):
                gemini_output_json_str = gemini_output_json_str[:-len("```")].strip()

            # If extra text surrounds the array, keep only the outermost [...].
            first_char = gemini_output_json_str[0] if gemini_output_json_str else ''
            last_char = gemini_output_json_str[-1] if gemini_output_json_str else ''
            if not ((first_char == '[' and last_char == ']')):
                json_start_index = gemini_output_json_str.find('[')
                json_end_index = gemini_output_json_str.rfind(']')
                if json_start_index != -1 and json_end_index > json_start_index :
                    gemini_output_json_str = gemini_output_json_str[json_start_index : json_end_index+1]
                else:
                    raise ValueError("Could not reliably extract JSON array from Gemini response for this batch.")

            current_batch_results = json.loads(gemini_output_json_str)
            if not isinstance(current_batch_results, list):
                raise ValueError("Gemini's output for batch was not a JSON list as expected.")

            print(f"Successfully processed batch {i+1}. Received {len(current_batch_results)} results.")
            all_gemini_results.extend(current_batch_results)
            break # Successful processing of this batch

        except json.JSONDecodeError as e:
            print(f"Error parsing Gemini's JSON output for batch {i+1} (attempt {attempt+1}): {e}")
            print("Raw output snippet:", gemini_output_json_str[:200] if gemini_output_json_str else "None")
            if attempt < API_RETRY_LIMIT - 1:
                time.sleep(API_RETRY_DELAY_SECONDS)
            else:
                print(f"Failed to parse JSON for batch {i+1} after {API_RETRY_LIMIT} attempts. Skipping this batch.")
        except Exception as e:
            print(f"Error during Gemini API call or processing for batch {i+1} (attempt {attempt+1}): {e}")
            # Prompt feedback only exists once this attempt got a response.
            feedback = getattr(response, 'prompt_feedback', None) if response is not None else None
            if feedback:
                 print(f"Prompt Feedback: {feedback}")
            if attempt < API_RETRY_LIMIT - 1:
                time.sleep(API_RETRY_DELAY_SECONDS)
            else:
                print(f"Failed to process batch {i+1} after {API_RETRY_LIMIT} attempts. Skipping this batch.")

    # Optional: Add a small delay between batch calls to be polite to the API
    if i < num_batches - 1: # Don't sleep after the last batch
        print(f"Waiting {DELAY_BETWEEN_BATCHES_SECONDS}s before next batch...")
        time.sleep(DELAY_BETWEEN_BATCHES_SECONDS)


# Nothing to evaluate when every batch failed.
if not all_gemini_results:
    print("\nNo results were successfully processed from Gemini. Exiting.")
    exit()

# --- 5. Convert All Gemini Results to DataFrame ---
gemini_df = pd.DataFrame(all_gemini_results)
if gemini_df.empty:
    print("\nGemini DataFrame is empty after processing all batches. Exiting.")
    exit()

expected_gemini_cols = ["SentenceID", "Gemini_Positive_Topics", "Gemini_Negative_Topics"]
missing_cols = [col for col in expected_gemini_cols if col not in gemini_df.columns]
if missing_cols:
    print(f"Warning: Gemini's combined JSON output is missing columns: {', '.join(missing_cols)}. Will try to proceed.")
    # Later steps need these columns, so create each with one empty list per row.
    for mc in missing_cols:
        gemini_df[mc] = [[] for _ in range(len(gemini_df))]


print(f"\nSuccessfully parsed all Gemini batch outputs. Total results: {len(gemini_df)}")
print(gemini_df.head())


# --- 6. Merge Gemini Labels with Ground Truth ---
# Compare IDs as strings on both sides so the join keys have the same type.
student_df_full['SentenceID'] = student_df_full['SentenceID'].astype(str)
gemini_df['SentenceID'] = gemini_df['SentenceID'].astype(str)

if gemini_df['SentenceID'].duplicated().any():
    print("Warning: Gemini's combined output contains duplicate SentenceIDs. Keeping first occurrence.")
    gemini_df = gemini_df.drop_duplicates(subset=['SentenceID'], keep='first')

# Left join: every student sentence is kept even if Gemini returned nothing.
merged_df = student_df_full.merge(gemini_df, on="SentenceID", how="left")


def _as_topic_list(value):
    # Sentences without a Gemini answer hold NaN after the join; use [].
    return value if isinstance(value, list) else []


for col in ('Gemini_Positive_Topics', 'Gemini_Negative_Topics'):
    merged_df[col] = merged_df[col].apply(_as_topic_list)

# --- 7. Prepare for Classification Report ---
def preprocess_topics_from_list(topic_list_series, all_known_topics):
    """Keep only the known topic strings from each entry of a series.

    Args:
        topic_list_series: Iterable whose entries are expected to be lists of
            topics; an entry that is not a list yields an empty list.
        all_known_topics: Collection of accepted topic names.

    Returns:
        list[list[str]]: One list per entry with the stripped string topics
        that appear in ``all_known_topics``.
    """
    def _keep_known(entry):
        if not isinstance(entry, list):
            return []
        kept = []
        for topic in entry:
            # Non-string items are dropped rather than converted.
            if isinstance(topic, str) and topic.strip() in all_known_topics:
                kept.append(topic.strip())
        return kept

    return [_keep_known(entry) for entry in topic_list_series]

def human_str_to_list(topic_series, all_known_topics):
    """Parse human-written topic labels into lists of known topics.

    Each entry may be a list, the string form of a Python list, a
    ';'-separated string, or missing / blank / "none".

    Args:
        topic_series: pandas Series of raw label cells.
        all_known_topics: Collection of accepted topic names; anything else
            is dropped from the result.

    Returns:
        list[list[str]]: One list of known topics per entry of the series.
    """
    from ast import literal_eval

    processed_list = []
    for item in topic_series.fillna(""):
        # If already a list, use it directly
        if isinstance(item, list):
            topics = [str(t).strip() for t in item if str(t).strip()]
        # A string that looks like a list is parsed with literal_eval, which
        # accepts only literals and therefore cannot run code from the CSV.
        elif isinstance(item, str) and item.strip().startswith("[") and item.strip().endswith("]"):
            try:
                parsed = literal_eval(item.strip())
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a valid literal: keep the raw text as a single topic.
                topics = [item.strip()] if item.strip() else []
            else:
                if isinstance(parsed, list):
                    topics = [str(t).strip() for t in parsed if str(t).strip()]
                else:
                    topics = [str(parsed).strip()] if str(parsed).strip() else []
        elif pd.isna(item) or str(item).strip().lower() == "none" or str(item).strip() == "":
            topics = []
        else:
            topics = [t.strip() for t in str(item).split(';') if t.strip()]
        valid_topics = [t for t in topics if t in all_known_topics]
        processed_list.append(valid_topics)
    return processed_list

# One indicator column per standardized topic, in the order of the topic list.
mlb = MultiLabelBinarizer(classes=standardized_topic_list)

# Human (ground-truth) labels.
human_student_topic_satisfactions_list = human_str_to_list(
    merged_df['Human_Positive_Topics'], standardized_topic_list
)
human_neg_topics_list = human_str_to_list(
    merged_df['Human_Negative_Topics'], standardized_topic_list
)
mlb.fit(human_student_topic_satisfactions_list)
y_human_pos = mlb.transform(human_student_topic_satisfactions_list)
y_human_neg = mlb.transform(human_neg_topics_list)

# Gemini (predicted) labels, encoded with the same binarizer.
gemini_student_topic_satisfactions_list = preprocess_topics_from_list(
    merged_df['Gemini_Positive_Topics'], standardized_topic_list
)
gemini_neg_topics_list = preprocess_topics_from_list(
    merged_df['Gemini_Negative_Topics'], standardized_topic_list
)
y_gemini_pos = mlb.transform(gemini_student_topic_satisfactions_list)
y_gemini_neg = mlb.transform(gemini_neg_topics_list)

# --- 8. Generate Classification Reports ---
output_filename = 'DELETE_THIS_reports.txt'

def generate_report_string(y_true, y_pred, mlb, title):
    """Build a titled classification report restricted to active classes.

    A class is active when it occurs at least once in ``y_true`` or
    ``y_pred``; classes absent from both are left out of the report.

    Args:
        y_true: Binary indicator matrix of the ground-truth labels.
        y_pred: Binary indicator matrix of the predicted labels.
        mlb: Fitted MultiLabelBinarizer; its ``classes_`` supply the names.
        title (str): Heading placed above the report.

    Returns:
        str: The heading followed by the report text, or by a short notice
        when no class is active.
    """
    heading = f"--- {title} ---\n"

    # Column-wise count of occurrences across both label matrices.
    occurrences = y_true.sum(axis=0) + y_pred.sum(axis=0)
    active_class_indices = []
    for column, is_active in enumerate(occurrences > 0):
        if is_active:
            active_class_indices.append(column)

    if not active_class_indices:
        return heading + "No active topics found in either human or Gemini labels. Skipping report.\n"

    active_target_names = [mlb.classes_[column] for column in active_class_indices]

    # output_dict=False makes classification_report return plain text.
    body = classification_report(
        y_true,
        y_pred,
        labels=active_class_indices,
        target_names=active_target_names,
        zero_division=0,
        output_dict=False,
    )
    return heading + body

# Build both reports first, then write them to one file and echo them.
report_pos_str = generate_report_string(
    y_human_pos, y_gemini_pos, mlb, "Classification Report for POSITIVE Topics"
)
report_neg_str = generate_report_string(
    y_human_neg, y_gemini_neg, mlb, "Classification Report for NEGATIVE Topics"
)

try:
    with open(output_filename, 'w') as f:
        print(f"Writing reports to '{output_filename}'...")
        # Positive report, an 80-character separator line, then the negative report.
        report_separator = "\n\n" + "="*80 + "\n\n"
        f.write(report_pos_str + report_separator + report_neg_str)

    print("Successfully saved classification reports.")

except IOError as e:
    print(f"Error: Could not write to file '{output_filename}'. Reason: {e}")

for report_text in (report_pos_str, report_neg_str):
    print("\n" + report_text)

# --- 9. Save Output ---
def list_to_str(lst):
    """Join a list's unique items, sorted, with ';'; non-lists give ''."""
    if not isinstance(lst, list):
        return ""
    return ";".join(sorted(set(lst)))

# NOTE(review): list_to_str is not applied below, so the *_Str columns hold
# the raw lists and the CSV stores their Python string form. Confirm that
# format is what the readers of this file expect before changing it.
output_columns = [
    'SentenceID', 'SentenceText',
    'Human_Positive_Topics', 'Human_Negative_Topics',
    'Gemini_Positive_Topics_Str', 'Gemini_Negative_Topics_Str'
]
df_to_save = merged_df.copy()
for source_col in ('Gemini_Positive_Topics', 'Gemini_Negative_Topics'):
    df_to_save[source_col + '_Str'] = df_to_save[source_col]
df_to_save = df_to_save[output_columns]

try:
    df_to_save.to_csv(OUTPUT_CSV_WITH_GEMINI_LABELS, index=False)
    print(f"\nSuccessfully saved Gemini's labels and comparison to '{OUTPUT_CSV_WITH_GEMINI_LABELS}'")
except Exception as e:
    print(f"Error saving output CSV: {e}")

print("\n--- Script Finished ---")

Loaded 68 standardized topics: ['Antenna Design', 'Application Development', 'Artificial Intelligence', 'Augmented Reality', 'Automated Test and Measurement Systems']...
Processing in 3 batches of size up to 50.

--- Processing Batch 1/3 (50 sentences) ---
Attempt 1/3 for batch 1...
Successfully processed batch 1. Received 50 results.
Waiting 2s before next batch...

--- Processing Batch 2/3 (50 sentences) ---
Attempt 1/3 for batch 2...
Successfully processed batch 2. Received 50 results.
Waiting 2s before next batch...

--- Processing Batch 3/3 (50 sentences) ---
Attempt 1/3 for batch 3...
Successfully processed batch 3. Received 50 results.

Successfully parsed all Gemini batch outputs. Total results: 150
  SentenceID                         Gemini_Positive_Topics  \
0          1        [Embedded Systems, Commercial Projects]   
1          2          [Nanomaterials, Time Series Analysis]   
2          3            [Distributed Systems, Smart Cities]   
3          4  [Commercial Proje

# OPTIMAL MATCHING BEGINS HERE

## ALTERNATIVE

In [14]:
# Python Notebook - Updated to Match API Logic

import random
import pandas as pd
import re
from pulp import LpProblem, LpVariable, LpMaximize, lpSum, LpBinary
import itertools
from datetime import datetime

# 1. DATA LOADING AND PREPROCESSING

# Load the standardised supervisor list and the Gemini-labelled student
# preferences. Forward slashes are used because they resolve on Windows as
# well as POSIX systems, whereas a literal backslash is only a separator on
# Windows.
supervisors_df = pd.read_csv('data/supervisors_standardised_gemini.csv')
students_df = pd.read_csv('data/gemini_labeled_preferences.csv')

# --- Helper function to parse stringified lists ---
def safe_list(val):
    """Normalise a raw cell value into a list of strings.

    Accepted inputs:
      * a ``list`` - returned unchanged, or ``[]`` if it contains 'No Match';
      * a stringified list such as ``"['Topic A', 'Topic B']"`` - parsed as a
        Python literal, items stringified and stripped, empty items dropped,
        and ``[]`` returned if any item equals 'No Match';
      * any other non-blank string - split on ',' or ';' and stripped;
      * anything else (NaN, None, numbers, blank strings) - ``[]``.

    A bracketed string that is not a valid Python literal is returned as a
    single-item list holding the stripped text.
    """
    import ast  # function-scope import keeps this helper self-contained

    if isinstance(val, list):
        return [] if 'No Match' in val else val

    # NaN/None and every other non-string value carry no items.
    if not isinstance(val, str):
        return []

    text = val.strip()
    if not text:
        return []

    if text.startswith("[") and text.endswith("]"):
        try:
            # literal_eval only accepts literals, so cell contents can never
            # execute code (unlike eval).
            parsed = ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return [text]
        if isinstance(parsed, list):
            topics = [str(t).strip() for t in parsed if str(t).strip()]
            # Test the parsed items, not the raw text, so a topic that merely
            # contains the words "No Match" does not discard the whole list.
            return [] if 'No Match' in topics else topics

    return [t.strip() for t in re.split(r'[,;]', text) if t.strip()]

# --- Supervisor frame: synthesise ids and capacities, normalise list columns ---
print("Preprocessing Supervisor Data...")

# "First Last" -> "first.last@university.com"
supervisors_df['supervisor_id'] = [
    full_name.replace(" ", ".").lower() + "@university.com"
    for full_name in supervisors_df['Name']
]
supervisors_df['student_count'] = 0
# One random draw per supervisor row, in row order.
supervisors_df['total_capacity'] = [random.randint(5, 10) for _ in supervisors_df.index]
# Remaining capacity can never go below zero.
supervisors_df['capacity'] = (
    supervisors_df['total_capacity'] - supervisors_df['student_count']
).clip(lower=0)

supervisors_df = supervisors_df.rename(columns={
    'Name': 'name',
    'standardised Topics': 'expertise',
    'Preferred Programme for Supervision (1st Choice)': 'programme_first_choice',
    'Preferred Programme for Supervision (2nd Choice)': 'programme_second_choice',
})
for list_column in ('expertise', 'programme_first_choice', 'programme_second_choice'):
    supervisors_df[list_column] = supervisors_df[list_column].apply(safe_list)


# --- Student frame: synthesise ids and programmes, normalise preference lists ---
print("Preprocessing Student Data...")
students_df['student_id'] = [f'student_{sentence_id}' for sentence_id in students_df['SentenceID']]
# One random programme per student row, in row order.
students_df['programme'] = [
    random.choice(['BCS', 'BSE', 'BIT', 'BSDA', 'BCNS']) for _ in students_df.index
]
students_df = students_df.rename(columns={
    'Gemini_Positive_Topics_Str': 'positive_preferences',
    'Gemini_Negative_Topics_Str': 'negative_preferences',
})
for list_column in ('positive_preferences', 'negative_preferences'):
    students_df[list_column] = students_df[list_column].apply(safe_list)

# Keep only the columns used by the matching function.
students_df = students_df[['student_id', 'programme', 'positive_preferences', 'negative_preferences']]
supervisors_df = supervisors_df[['supervisor_id', 'name', 'programme_first_choice', 'programme_second_choice', 'capacity', 'student_count', 'expertise', 'total_capacity']]

# Show a preview of both processed frames.
print("\n--- Processed Students Dataset (Matches API Structure) ---")
print(students_df.head())
print("\n--- Processed Supervisors Dataset (Matches API Structure) ---")
print(supervisors_df.head())

# Baseline copies of both processed frames.
students_df_base = students_df.copy()
supervisors_df_base = supervisors_df.copy()

print(f"Base data loaded: {len(students_df_base)} students, {len(supervisors_df_base)} supervisors.")
print(f"Total available base capacity: {supervisors_df_base['capacity'].sum()}")

# 2. OPTIMAL MATCHING FUNCTION

def _programme_match_type(student, supervisor):
    """Classify how a student's programme fits a supervisor's programme choices.

    Returns 1 if the supervisor's first-choice list contains the student's
    programme or "No Preference", 2 if only the second-choice list does,
    and 0 otherwise.
    """
    first_choices = supervisor.get('programme_first_choice', [])
    if "No Preference" in first_choices or student['programme'] in first_choices:
        return 1
    second_choices = supervisor.get('programme_second_choice', [])
    if "No Preference" in second_choices or student['programme'] in second_choices:
        return 2
    return 0


def _shared_topics(preferences, expertise):
    """Return the preference topics that also appear in ``expertise``, in preference order."""
    return [topic for topic in preferences if topic in expertise]


def _topic_satisfaction(student, supervisor):
    """Return M_Sij: the mean of the positive match ratio and the negative avoidance rate."""
    expertise = supervisor.get('expertise', [])

    # Positive Match Ratio = |P_i ∩ E_j| / |P_i|  (1.0 when no positive preferences)
    pos_prefs = student.get('positive_preferences', [])
    num_pos_prefs = len(pos_prefs)
    positive_match_ratio = (
        (len(_shared_topics(pos_prefs, expertise)) / num_pos_prefs) if num_pos_prefs > 0 else 1.0
    )

    # Negative Avoidance Rate = 1 - |N_i ∩ E_j| / |N_i|  (1.0 when no negative preferences)
    neg_prefs = student.get('negative_preferences', [])
    num_neg_prefs = len(neg_prefs)
    violation_rate = (
        (len(_shared_topics(neg_prefs, expertise)) / num_neg_prefs) if num_neg_prefs > 0 else 0.0
    )
    negative_avoidance_rate = 1.0 - violation_rate

    return (positive_match_ratio + negative_avoidance_rate) / 2.0


def optimal_matching(students_df, supervisors_df, balancing_penalty_weight=0.5, score_weights=None):
    """Assign every student to exactly one supervisor by solving a binary LP.

    Each (student, supervisor) pair gets a score made of the supervisor's
    programme preference (M_r) plus the weighted student topic satisfaction
    (M_Sij). The objective maximises the summed scores of the chosen pairs
    minus ``balancing_penalty_weight`` times the total absolute deviation of
    supervisor loads from the mean load.

    Args:
        students_df: DataFrame with 'student_id' and 'programme' columns;
            'positive_preferences' and 'negative_preferences' (lists of
            topics) are read when present.
        supervisors_df: DataFrame with 'supervisor_id' and 'student_count'
            columns; 'capacity', 'name', 'expertise',
            'programme_first_choice' and 'programme_second_choice' are read
            when present ('capacity' defaults to 0).
        balancing_penalty_weight: Penalty per unit of load deviation.
        score_weights: Optional dict with keys 'prog_first_choice',
            'prog_second_choice' and 'student_topic_satisfaction'. Missing
            keys fall back to 10.0, 5.0 and 8.0; ``None`` uses those defaults.

    Returns:
        A list with one dict per assignment (keys: 'student_id',
        'supervisor_id', 'supervisor_name', 'programme_match',
        'matching_topics', 'conflicting_topics', 'match_score'). The list is
        empty when the solver does not report an optimal solution.

    Side effects:
        Prints the target load per supervisor.
    """
    # A None sentinel avoids sharing one mutable default dict across calls.
    if score_weights is None:
        score_weights = {
            'prog_first_choice': 10.0,
            'prog_second_choice': 5.0,
            'student_topic_satisfaction': 8.0,
        }

    # Materialise the rows once; nested iterrows() would rebuild a Series for
    # every (student, supervisor) pair.
    student_rows = students_df.to_dict('records')
    supervisor_rows = supervisors_df.to_dict('records')
    student_ids = [row['student_id'] for row in student_rows]
    supervisor_ids = [row['supervisor_id'] for row in supervisor_rows]

    # Programme score by match type (0 = no match, 1 = first choice, 2 = second choice).
    programme_scores = {
        0: 0,
        1: score_weights.get('prog_first_choice', 10.0),
        2: score_weights.get('prog_second_choice', 5.0),
    }
    topic_weight = score_weights.get('student_topic_satisfaction', 8.0)

    # --- 1. Pre-calculate a Unified Score for Each (Student, Supervisor) Pair ---
    all_pair_scores = {}
    for student in student_rows:
        s_id = student['student_id']
        for supervisor in supervisor_rows:
            v_id = supervisor['supervisor_id']
            prog_score = programme_scores[_programme_match_type(student, supervisor)]
            student_topic_score = topic_weight * _topic_satisfaction(student, supervisor)
            all_pair_scores[(s_id, v_id)] = prog_score + student_topic_score

    # --- 2. Define the Optimization Problem ---
    problem = LpProblem("Optimal_Student_Supervisor_Matching", LpMaximize)

    # Decision Variables: x_ij = 1 if student i is assigned to supervisor j
    decision_vars = LpVariable.dicts("x", all_pair_scores.keys(), 0, 1, LpBinary)

    # --- 3. Workload Balancing (Soft Constraint) ---
    num_new_students = len(students_df)
    num_existing_students = supervisors_df['student_count'].sum()
    num_supervisors = len(supervisors_df)
    target_load = (num_existing_students + num_new_students) / num_supervisors if num_supervisors > 0 else 0
    print(f"\nTarget total load per supervisor (existing + new): {target_load:.2f}")

    # Non-negative over/under variables linearise |load - target_load|.
    dev_over = LpVariable.dicts("DeviationOver", supervisor_ids, lowBound=0)
    dev_under = LpVariable.dicts("DeviationUnder", supervisor_ids, lowBound=0)

    for supervisor in supervisor_rows:
        v_id = supervisor['supervisor_id']
        existing_load = supervisor.get('student_count', 0)
        newly_assigned_load = lpSum(decision_vars[(s_id, v_id)] for s_id in student_ids)
        problem += (existing_load + newly_assigned_load) - target_load == dev_over[v_id] - dev_under[v_id], f"Define_Deviation_{v_id}"

    # --- 4. Objective Function ---
    # Maximize the sum of scores for all assignments, minus a penalty for workload imbalance.
    satisfaction_score = lpSum(decision_vars[key] * all_pair_scores[key] for key in all_pair_scores)
    workload_penalty = balancing_penalty_weight * lpSum(dev_over[v_id] + dev_under[v_id] for v_id in supervisor_ids)

    problem += satisfaction_score - workload_penalty, "Maximize_Satisfaction_and_Balance"

    # --- 5. Hard Constraints ---
    # Constraint 1: Each student must be assigned to exactly ONE supervisor.
    for s_id in student_ids:
        problem += lpSum(decision_vars[(s_id, v_id)] for v_id in supervisor_ids) == 1, f"Assign_Student_{s_id}"

    # Constraint 2: Newly assigned students cannot exceed each supervisor's remaining capacity.
    for supervisor in supervisor_rows:
        v_id = supervisor['supervisor_id']
        remaining_capacity = supervisor.get('capacity', 0)
        problem += lpSum(decision_vars[(s_id, v_id)] for s_id in student_ids) <= remaining_capacity, f"Capacity_Supervisor_{v_id}"

    # --- 6. Solve the Problem ---
    problem.solve()

    # --- 7. Result Extraction ---
    assignments = []
    if problem.status == 1:  # 1 is PuLP's "Optimal" status code
        # First record per id, for O(1) lookups while reporting.
        student_by_id = {}
        for row in student_rows:
            student_by_id.setdefault(row['student_id'], row)
        supervisor_by_id = {}
        for row in supervisor_rows:
            supervisor_by_id.setdefault(row['supervisor_id'], row)

        for key, var in decision_vars.items():
            if var.value() > 0.5:  # binary variable set to 1 => assignment made
                student_id, supervisor_id = key
                student = student_by_id[student_id]
                supervisor = supervisor_by_id[supervisor_id]
                expertise = supervisor.get('expertise', [])

                matching_topics = _shared_topics(student.get('positive_preferences', []), expertise)
                conflicting_topics = _shared_topics(student.get('negative_preferences', []), expertise)

                assignments.append({
                    'student_id': student_id,
                    'supervisor_id': supervisor_id,
                    'supervisor_name': supervisor.get('name', 'N/A'),
                    # 1 for first choice, 2 for second, 0 for other
                    'programme_match': _programme_match_type(student, supervisor),
                    'matching_topics': matching_topics if matching_topics else ["No Matches"],
                    'conflicting_topics': conflicting_topics if conflicting_topics else ["No Conflicts"],
                    'match_score': all_pair_scores[key],  # the exact score used by the optimizer
                })
    return assignments


# 3. EXECUTION AND ANALYSIS

def run_analysis_and_get_stats(students_df, supervisors_df, balancing_penalty_weight, score_weights):
    """
    Solve one matching configuration and summarise it as three KPIs.

    Args:
        students_df (pd.DataFrame): The students data.
        supervisors_df (pd.DataFrame): The supervisors data.
        balancing_penalty_weight (float): The penalty factor for workload deviation.
        score_weights (dict): Weights for the individual score components.

    Returns:
        dict: 'first_choice_pct', 'avg_student_satisfaction_rate' and
        'workload_std_dev', or None when the matcher returned no assignments.
    """
    print(f"\n--- Running: balance_weight={balancing_penalty_weight}, scores={score_weights} ---")

    assignments = optimal_matching(students_df, supervisors_df, balancing_penalty_weight, score_weights)
    if not assignments:
        print("No assignments could be made.")
        return None

    assignments_df = pd.DataFrame(assignments)

    # --- KPI 1: share of assignments that hit the supervisor's first programme choice ---
    total_assignments = len(assignments_df)
    first_choice_count = (assignments_df['programme_match'] == 1).sum()
    supervisor_first_choice_pct = (first_choice_count / total_assignments) * 100 if total_assignments > 0 else 0

    # --- KPI 2: mean M_Sij = (positive match ratio + negative avoidance rate) / 2 ---
    def hit_fraction(preferences, expertise, when_empty):
        """Fraction of preferences found in expertise; `when_empty` if there are none."""
        hits = sum(1 for topic in preferences if topic in expertise)
        return (hits / len(preferences)) if len(preferences) > 0 else when_empty

    def m_sij(positive, negative, expertise):
        pos_ratio = hit_fraction(positive, expertise, 1.0)  # no wishes => fully satisfied
        neg_ratio = 1.0 - hit_fraction(negative, expertise, 0.0)  # no dislikes => nothing violated
        return (pos_ratio + neg_ratio) / 2.0

    # Bring each assignment's preference lists and supervisor expertise side by side.
    with_students = assignments_df.merge(
        students_df[['student_id', 'positive_preferences', 'negative_preferences']], on='student_id'
    )
    analysis_df = with_students.merge(supervisors_df[['supervisor_id', 'expertise']], on='supervisor_id')

    m_sij_scores = pd.Series(
        [
            m_sij(positive, negative, expertise)
            for positive, negative, expertise in zip(
                analysis_df['positive_preferences'],
                analysis_df['negative_preferences'],
                analysis_df['expertise'],
            )
        ],
        dtype=float,
    )
    avg_student_satisfaction_rate = m_sij_scores.mean() * 100

    # --- KPI 3: standard deviation of final supervisor loads (lower = more balanced) ---
    new_counts = assignments_df.groupby('supervisor_id').size()
    # Supervisors without new students are absent from new_counts -> count 0.
    newly_assigned = supervisors_df['supervisor_id'].map(new_counts).fillna(0)
    final_student_count = supervisors_df['student_count'] + newly_assigned
    workload_std_dev = final_student_count.std()

    return {
        "first_choice_pct": supervisor_first_choice_pct,
        "avg_student_satisfaction_rate": avg_student_satisfaction_rate,
        "workload_std_dev": workload_std_dev,
    }
    
# 4. SYSTEMATIC CONFIGURATION GENERATION

# Step 1: Four levels (low / medium / high / very_high) for every tunable parameter.
parameter_levels = {
    'balancing_penalty_weight': {'low': 1.0, 'medium': 2.0, 'high': 5.0, 'very_high': 10.0},
    'prog_first_choice': {'low': 10, 'medium': 20, 'high': 50, 'very_high': 100},
    'prog_second_choice': {'low': 5, 'medium': 10, 'high': 25, 'very_high': 50},
    'student_topic_satisfaction': {'low': 10, 'medium': 20, 'high': 50, 'very_high': 100},
}

# Step 2: Flatten each parameter's levels into a plain list of values.
(
    balance_levels,
    prog_first_choice_levels,
    prog_second_choice_levels,
    student_topic_satisfaction_levels,
) = (
    list(parameter_levels[parameter].values())
    for parameter in (
        'balancing_penalty_weight',
        'prog_first_choice',
        'prog_second_choice',
        'student_topic_satisfaction',
    )
)

# Step 3: One named configuration per element of the Cartesian product.
# Each entry maps "Config_<n>" -> (balancing weight, score_weights dict).
score_weight_keys = ('prog_first_choice', 'prog_second_choice', 'student_topic_satisfaction')
test_configs = {}
config_counter = 1
for bal_val, *weight_values in itertools.product(
    balance_levels,
    prog_first_choice_levels,
    prog_second_choice_levels,
    student_topic_satisfaction_levels,
):
    test_configs[f"Config_{config_counter}"] = (bal_val, dict(zip(score_weight_keys, weight_values)))
    config_counter += 1

print(f"Generated {len(test_configs)} unique test configurations for the grid search.")
# 4 parameters x 4 levels each -> 4^4 = 256 configurations.


# 5. PARETO OPTIMALITY ANALYSIS (This section is the same as before)

def find_pareto_optimal_configs(results_df, kpi_goals):
    """Return the rows of ``results_df`` that no other row dominates.

    ``kpi_goals`` maps a KPI column name to 'maximize' or 'minimize'; KPIs
    with any other goal value are ignored. A row dominates another when it is
    not worse on any considered KPI and strictly better on at least one.
    """

    def dominates(challenger, incumbent):
        strictly_better_somewhere = False
        for kpi, goal in kpi_goals.items():
            # Orient the pair so that "larger is better" in both cases.
            if goal == 'maximize':
                mine, theirs = challenger[kpi], incumbent[kpi]
            elif goal == 'minimize':
                mine, theirs = incumbent[kpi], challenger[kpi]
            else:
                continue
            if mine < theirs:
                return False  # worse on one KPI => cannot dominate
            if mine > theirs:
                strictly_better_somewhere = True
        return strictly_better_somewhere

    labelled_rows = list(results_df.iterrows())
    pareto_front_indices = [
        label
        for label, row in labelled_rows
        if not any(
            dominates(other_row, row)
            for other_label, other_row in labelled_rows
            if other_label != label
        )
    ]
    return results_df.loc[pareto_front_indices]


# --- Evaluate every configuration and keep the ones that produced KPIs ---
print("\n--- Section 5: Starting exhaustive grid search. This may take some time... ---")
results = []
for name, (balance_weight, score_weights) in test_configs.items():
    stats = run_analysis_and_get_stats(
        students_df_base.copy(), supervisors_df_base.copy(),
        balance_weight, score_weights
    )
    if not stats:
        # No KPIs came back for this configuration; leave it out of the table.
        continue
    results.append({**stats, 'config_name': name})

results_df = pd.DataFrame(results).set_index('config_name')


# --- Identify the Pareto Front ---
target_kpi_goals = {
    'first_choice_pct': 'maximize',
    'workload_std_dev': 'minimize',
    'avg_student_satisfaction_rate': 'maximize'
}
pareto_df = find_pareto_optimal_configs(results_df, target_kpi_goals)

print("\n--- Analysis Complete: Displaying Results ---")
print(f"Found {len(pareto_df)} configurations on the Pareto Front.")
print(pareto_df[list(target_kpi_goals)].round(2))


# --- Final Recommendation ---
# Rank by highest first-choice percentage, breaking ties on lowest workload
# standard deviation. Fall back to the full result table if the front is empty.
if pareto_df.empty:
    print("Warning: No distinct Pareto Front found. Defaulting to best on primary metric.")
    ranking_pool = results_df
else:
    ranking_pool = pareto_df

final_choice = ranking_pool.sort_values(by=['first_choice_pct', 'workload_std_dev'], ascending=[False, True])
best_config_name = final_choice.index[0]
best_config_params = test_configs[best_config_name]


# 6. SAVE RESULTS TO FILES
print("\n--- Section 6: Saving Results to Files ---")

# --- A. Save the COMPLETE analysis file ---
# Every evaluated configuration's KPIs, joined on config name with its parameters.
complete_filename = 'grid_search_complete_results.csv'
try:
    params_df = pd.DataFrame(
        list(test_configs.values()),
        index=list(test_configs),
        columns=['balance_weight', 'score_weights'],
    )
    full_results_with_params = results_df.join(params_df)
    full_results_with_params.to_csv(complete_filename)
    print(f"Successfully saved all {len(results_df)} configuration results to '{complete_filename}'")
except Exception as e:
    # Best effort: report the failure and continue with the remaining outputs.
    print(f"Error saving complete results file: {e}")


# --- B. Save the formatted SUMMARY report file ---
summary_filename = 'DELETE_THIS_summary_report.txt'
pareto_csv_filename = 'DELETE_THIS_data.csv'  # target of the Pareto CSV export

# Attach each Pareto configuration's parameters (inner merge on the config name index).
configs_df = pd.DataFrame.from_dict(
    test_configs,
    orient='index',
    columns=['balancing_penalty_weight', 'score_weights'],
)
pareto_with_params_df = pareto_df.merge(configs_df, left_index=True, right_index=True)

try:
    with open(summary_filename, 'w') as f:
        heavy_rule = "=" * 80

        # Report header
        f.writelines([
            "--- SUMMARY REPORT: OPTIMAL CONFIGURATION ANALYSIS ---\n",
            f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
            f"Total Configurations Tested: {len(test_configs)}\n",
            heavy_rule + "\n",
        ])

        # Pareto front: KPI table followed by per-configuration score weights
        f.writelines([
            "\n--- Pareto Front: The Best Set of Trade-Offs ---\n",
            "These are non-dominated configurations, representing optimal trade-offs.\n\n",
        ])

        kpi_cols = list(target_kpi_goals)
        param_cols = ['balancing_penalty_weight']

        f.write("KPIs and Balancing Weight for each Pareto Configuration:\n")
        f.write(pareto_with_params_df[kpi_cols + param_cols].round(2).to_string())
        f.write("\n\n")

        f.write("Detailed Score Weights for each Pareto Configuration:\n")
        for config_label, weights in pareto_with_params_df['score_weights'].items():
            weight_lines = "\n".join(f"  - {key}: {value}" for key, value in weights.items())
            f.write(f"\nConfig Name: {config_label}\n")
            f.write(f"Score Weights:\n{weight_lines}\n")
            f.write("-" * 40)

        f.write("\n\n" + heavy_rule + "\n")

        # Final recommendation: the top-ranked row of final_choice
        f.writelines([
            "\n--- Final Recommendation ---\n",
            "Decision Process: From the Pareto Front, the configuration with the best 'First Choice Percentage' (highest value up to 100.0) and 'Workload Standard Deviation' (lower is better) was selected.\n\n",
            "Recommended Configuration Details:\n",
        ])
        f.write(final_choice.iloc[0:1][kpi_cols].round(2).to_string())

        best_balance_weight, best_score_weights = best_config_params
        best_weight_lines = "\n".join(f"  - {key}: {value}" for key, value in best_score_weights.items())
        f.writelines([
            "\n\nCorresponding Parameters for this Configuration:\n",
            f"Name: {best_config_name}\n",
            f"Balancing Penalty Weight: {best_balance_weight}\n",
            f"Score Weights:\n{best_weight_lines}\n",
        ])

    print(f"Successfully saved human-readable summary report to '{summary_filename}'")
except Exception as e:
    # Best effort: report the failure and continue with the remaining outputs.
    print(f"Error saving summary report file: {e}")


# --- C. Save the Pareto front data to a MACHINE-READABLE file (.csv) ---
try:
    # Expand the 'score_weights' dict column into one column per weight so the
    # CSV holds plain scalar values only.
    weights_df = pareto_with_params_df['score_weights'].apply(pd.Series)

    # Replace the dict column with the expanded weight columns.
    df_for_csv = pareto_with_params_df.drop(columns=['score_weights']).join(weights_df)

    df_for_csv.to_csv(pareto_csv_filename, index_label='config_name')

    print(f"Successfully saved Pareto front data to '{pareto_csv_filename}'")
except Exception as e:
    # Best effort: report the failure instead of aborting the notebook cell.
    print(f"Error saving Pareto front CSV file: {e}")

Preprocessing Supervisor Data...
Preprocessing Student Data...

--- Processed Students Dataset (Matches API Structure) ---
  student_id programme                           positive_preferences  \
0  student_1       BSE        [Embedded Systems, Commercial Projects]   
1  student_2      BCNS          [Nanomaterials, Time Series Analysis]   
2  student_3       BIT            [Distributed Systems, Smart Cities]   
3  student_4      BSDA  [Commercial Projects, Information Technology]   
4  student_5      BSDA            [Electronics, Information Security]   

                                negative_preferences  
0                      [Smart Cities, Nanomaterials]  
1  [Application Development, Natural Language Pro...  
2        [Distributed haptics, Software Engineering]  
3  [Software Engineering, Natural Language Proces...  
4                          [Neuroscience, Databases]  

--- Processed Supervisors Dataset (Matches API Structure) ---
                                       superv