<a href="https://colab.research.google.com/github/Shubawa/sc1003_mini_project/blob/main/SC1003_Mini_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Main Code**

In [121]:
# Mount Google Drive at /content/drive/ so that files stored in Drive can be
# opened with ordinary file paths from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

def read_and_parse_file_content(filepath):
  """Read the student CSV at `filepath` and group its rows by tutorial group.

  The first line of the file is treated as a header and ignored, and empty
  lines are skipped. Every other line is split on "," and its first six
  fields are read, in order, as tutorial group, student id, school, name,
  gender and CGPA. All values are kept as the strings found in the file.

  Returns a dict mapping each tutorial group to the list of its student
  dicts, in file order.
  """
  field_names = ('tutorial_group', 'student_id', 'school', 'name', 'gender', 'cgpa')
  grouped = {}

  with open(filepath, "r") as handle:
    rows = handle.read().split("\n")

  # rows[0] is the header row
  for row in rows[1:]:
    if row == "":
      continue

    fields = row.split(",")
    entry = {key: fields[position] for position, key in enumerate(field_names)}

    # setdefault creates the group's list the first time the group is seen
    grouped.setdefault(entry['tutorial_group'], []).append(entry)

  return grouped

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [122]:
# Calculate dynamic CGPA categories based on percentiles
def calculate_cgpa_categories(students, team_size=5):
  """Map each CGPA value to a rank-based category index.

  The students' CGPAs are converted to float and sorted ascending, then the
  sorted positions are cut into `team_size` equally wide bands. A CGPA's
  category is the band its position falls in, from 0 (lowest) up to
  `team_size - 1`. When the same CGPA occurs more than once, the category
  of its last (highest) position is the one kept.

  Returns a dict {cgpa as float: category index}; empty if there are no
  students.
  """
  ranked = [float(student['cgpa']) for student in students]
  ranked.sort()

  if len(ranked) == 0:
    return {}

  band_width = len(ranked) / team_size
  top_band = team_size - 1

  # Later positions overwrite earlier ones for repeated CGPA values.
  return {
    value: min(int(position / band_width), top_band)
    for position, value in enumerate(ranked)
  }

In [123]:
def categorise_gpa(gpa, cgpa_mapping=None):
  """Look up the category of `gpa` in `cgpa_mapping`.

  `gpa` is converted to float before the lookup; a value that is not in the
  mapping falls back to category 0. When no mapping is supplied the function
  returns None without inspecting `gpa`.
  """
  if cgpa_mapping is None:
    return None

  return cgpa_mapping.get(float(gpa), 0)

In [124]:
def calculate_diversity_score(team, cgpa_mapping=None):
  """Score a team by how many distinct values its members cover.

  The score is the number of distinct schools, plus the number of distinct
  genders, plus the number of distinct CGPA categories (as reported by
  `categorise_gpa` with the given mapping) among the team's members.
  """
  distinct_schools = {member['school'] for member in team}
  distinct_genders = {member['gender'] for member in team}
  distinct_bands = {categorise_gpa(member['cgpa'], cgpa_mapping) for member in team}

  return len(distinct_schools) + len(distinct_bands) + len(distinct_genders)

In [125]:
# Currently only for gender, possible to split by schools as well
def distribute_students_proportions(student_by_categories, category, size):
  """Split `size` team seats across categories in proportion to their sizes.

  Args:
    student_by_categories: list of student lists, one list per category
      (for example [males, females]). Empty lists are ignored.
    category: the student dict key that names the category (for example
      'gender'); the label is read from the first student of each list.
    size: number of seats to hand out.

  Returns:
    A dict {category label: seat count}. Every populated category except
    the last receives its rounded proportional share, capped at the seats
    still available; the last populated category receives whatever is left.
    The counts therefore always add up to `size` and are never negative
    (for a non-negative `size`). Returns {} when every category is empty.
  """
  # Only populated categories can receive seats. Filtering first means the
  # leftover seats always go to the last *populated* category instead of
  # being dropped when the final list happens to be empty.
  populated = [students for students in student_by_categories if students]
  total_count = sum(len(students) for students in populated)

  result = {}
  remaining_allocation = size

  for index, students in enumerate(populated):
    cat = students[0][category]

    if index == len(populated) - 1:
      result[cat] = remaining_allocation
      break

    # round() uses round-half-to-even, so a share of 2.5 becomes 2.
    proportion = round((len(students) / total_count) * size)
    # With several categories the rounded shares can add up to more than
    # `size`; never hand out more seats than are left.
    proportion = min(proportion, remaining_allocation)

    remaining_allocation -= proportion
    result[cat] = proportion

  return result

In [126]:
import random
def find_best_match_student(students, team, cgpa_mapping=None):
  """Pick the candidate that gives `team` the highest diversity score.

  Each candidate in `students` is scored by the diversity score the team
  would have if that candidate joined it. Neither `students` nor `team` is
  modified.

  Args:
    students: non-empty list of candidate student dicts.
    team: list of student dicts already in the team.
    cgpa_mapping: optional CGPA-to-category mapping passed through to
      `calculate_diversity_score`.

  Returns:
    The best-scoring candidate. On a tie the candidate that appears first
    in `students` wins, so the choice is deterministic.

  Raises:
    ValueError: if `students` is empty.
  """
  if not students:
    raise ValueError("students must contain at least one candidate")

  def score_if_added(candidate):
    # Build a fresh list so the real team is left untouched.
    return calculate_diversity_score(team + [candidate], cgpa_mapping)

  # max() returns the first maximal item, matching a strict ">" scan.
  return max(students, key=score_if_added)

In [127]:
import math
def split_project_teams(students, team_offset=1, team_size=5):
  """Split one group of students into diversity-balanced project teams.

  Teams are numbered consecutively starting at `team_offset`. Each team is
  filled gender by gender, using seat counts proportional to the genders
  still unassigned, and within a gender the student that maximises the
  team's diversity score is picked each time.

  If fewer than `team_size - 1` students are left over once at least one
  team exists, no further team is created; the leftovers are dealt out
  one by one to the existing teams, starting from the first team.

  Args:
    students: list of student dicts with at least 'gender' and 'cgpa'.
      Students whose gender is not 'Male' are pooled together.
    team_offset: number given to the first team created.
    team_size: target number of students per team (must be >= 1).

  Returns:
    A tuple (teams, group_number) where `teams` maps team number to the
    list of its students and `group_number` is the number of the last team
    created (`team_offset - 1` when no team was created).

  Raises:
    ValueError: if `team_size` is smaller than 1.
  """
  if team_size < 1:
    raise ValueError("team_size must be a positive integer")

  num_teams = math.ceil(len(students) / team_size)
  teams = {}

  # Number of the most recently created team. Initialised up front so the
  # return statement is valid even when `students` is empty.
  group_number = team_offset - 1

  # Calculate CGPA categories based on team size
  cgpa_mapping = calculate_cgpa_categories(students, team_size)

  males = []
  females = []

  for student in students:
    if student['gender'] == 'Male':
      males.append(student)
    else:
      females.append(student)

  for team_index in range(num_teams):
    remaining_students = males + females

    if not remaining_students:
      break

    # Handle the uneven split, e.g. 15 students by 6 => 6, 6, 3: the
    # leftover 3 are spread over the teams that already exist. This is only
    # possible once a team exists; a group that is small from the start
    # simply forms one short team below.
    if teams and len(remaining_students) < team_size - 1:
      keys = list(teams.keys())

      for position in range(len(remaining_students)):
        team = teams[keys[position % len(keys)]]
        student = find_best_match_student(remaining_students, team, cgpa_mapping)
        team.append(student)
        remaining_students.remove(student)

      # Every student has now been placed.
      break

    group_number = team_index + team_offset
    teams[group_number] = []

    # We assign students based on the gender proportion so that it's evenly distributed
    proportions = distribute_students_proportions([males, females], 'gender', team_size)

    for gender, size in proportions.items():
      selected_students = males if gender == 'Male' else females

      for _ in range(size):
        # The pool can run dry before its share is used up.
        if not selected_students:
          break

        student = find_best_match_student(selected_students, teams[group_number], cgpa_mapping)
        teams[group_number].append(student)
        selected_students.remove(student)

  return teams, group_number

In [128]:
def output_to_csv(teams):
  """Write every team's students to diversified_teams.csv on Google Drive.

  `teams` maps a team number to a list of student dicts. The file gets one
  header row followed by one row per student, with the team number appended
  as the last column. An existing file is overwritten.
  """
  filepath = "/content/drive/My Drive/Colab Notebooks/diversified_teams.csv"
  columns = ('tutorial_group', 'student_id', 'school', 'name', 'gender', 'cgpa')

  with open(filepath, "w") as file:
    # Header row
    file.write("Tutorial Group,Student ID,School,Name,Gender,CGPA,Team Assigned\n")

    # One row per student, team by team
    for group_number, team in teams.items():
      for student in team:
        cells = [f"{student[column]}" for column in columns]
        cells.append(f"{group_number}")
        file.write(",".join(cells) + "\n")

In [129]:
def read_file_and_split_project_teams(team_size=5):
  """Read records.csv, split every tutorial group into teams, write the CSV.

  Each tutorial group is split on its own with `split_project_teams`, and
  the team numbers continue across tutorial groups so that every team in
  the combined result has a unique number. The combined teams are written
  out with `output_to_csv`.

  Args:
    team_size: target number of students per team.
  """
  filepath = '/content/drive/My Drive/Colab Notebooks/records.csv'
  records = read_and_parse_file_content(filepath)

  teams = {}

  next_group_number = 1

  for tutorial_group in records:
    students = records[tutorial_group]
    current_teams, _ = split_project_teams(students, next_group_number, team_size)

    teams.update(current_teams)

    # Start the next tutorial group one past the highest number used so
    # far. Reusing the last number as the next offset would make the next
    # group's first team overwrite this group's last team in `teams`.
    if current_teams:
      next_group_number = max(current_teams) + 1

  output_to_csv(teams)
  # print_and_get_diversity_info(teams)

In [130]:
import ipywidgets as widgets

def on_button_click(_):
  """Button handler: read the team size from the text box and run the split.

  All messages are printed inside the `output` widget, which is cleared at
  the start of every click. A team size that is not an integer, or is
  smaller than 1, is reported as wrong input. Any failure while reading,
  splitting or writing is reported with the actual error instead of being
  mislabelled as wrong input.
  """
  with output:
    output.clear_output()

    # Only the conversion of the text box value can mean "wrong input".
    try:
      team_size = int(text_input.value)
    except ValueError:
      print("Wrong input, please enter team size in integer.")
      return

    if team_size < 1:
      print("Wrong input, team size must be at least 1.")
      return

    print("Splitting project teams with size: ", team_size)

    try:
      read_file_and_split_project_teams(team_size)
    except Exception as error:
      # Show what actually went wrong (missing file, malformed row, ...).
      print("Failed to split project teams: ", repr(error))
      return

    print("Done!")

# Text box holding the requested team size; on_button_click reads its value.
text_input = widgets.Text(value= "5", description='Team Size:')

# Button that triggers the split; on_button_click runs on every click.
button = widgets.Button(description="Split Project Teams")
button.on_click(on_button_click)

# Output area that on_button_click clears and prints its messages into.
output = widgets.Output()

# Render the three widgets below this cell.
display(text_input, button, output)

Text(value='5', description='Team Size:')

Button(description='Split Project Teams', style=ButtonStyle())

Output()

**Helper functions to check the distribution**

In [131]:
def get_max_count(items):
  """Return how many times the most frequent item occurs in `items`.

  `items` may be any iterable of hashable values and is read only once.
  Returns 0 when `items` is empty.
  """
  # Single pass tally instead of rescanning the whole input per unique item.
  counts = {}
  for item in items:
    counts[item] = counts.get(item, 0) + 1

  return max(counts.values(), default=0)

def print_and_get_diversity_info(teams):
  """Print one summary line per team and return the mean diversity score.

  For every non-empty team the line shows the team number, the tutorial
  group of its first member, the head count, the size of the largest
  single-school cluster, the male and female counts, the average CGPA and
  the diversity score. An empty team gets a short line and is left out of
  the mean.

  NOTE: the diversity score is computed without a CGPA mapping, so the CGPA
  part of the score is the same constant for every team here.

  Args:
    teams: dict mapping team number to a list of student dicts.

  Returns:
    The mean diversity score over all non-empty teams, or 0 if there are
    none.
  """
  total_count = 0
  total_diversity_score = 0

  for group_number, team in teams.items():
    if not team:
      # Nothing to average and no first member to read the group from.
      print("Group ", group_number, end=" | ")
      print("Number of students: ", 0)
      continue

    number_of_students = len(team)
    average_gpa = sum(float(s['cgpa']) for s in team) / number_of_students
    number_of_males = len([s for s in team if s['gender'] == 'Male'])
    number_of_females = len([s for s in team if s['gender'] == 'Female'])
    diversity_score = calculate_diversity_score(team)
    schools = [s['school'] for s in team]

    total_count += 1
    total_diversity_score += diversity_score

    print("Group ", group_number, end=" | ")
    print("Tutorial Group", team[0]['tutorial_group'], end=" | ")
    print("Number of students: ", number_of_students, end=" | ")
    print("Schools max count: ", get_max_count(schools), end=" | ")
    print("Male: ", number_of_males, end=" | ")
    print("Female: ", number_of_females, end=" | ")
    print("Average GPA: ", average_gpa, end=" | ")
    print("Diversity Score: ", diversity_score)

  if total_count == 0:
    return 0

  return total_diversity_score / total_count