COSINE SIMILARITY AND HUNGARIAN ALGORITHM

In [88]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.optimize import linear_sum_assignment
import numpy as np
from sklearn.preprocessing import StandardScaler

data = pd.read_excel("Housing_Data.xlsx")

print(data.head())

# Check for missing values in the dataset
print("Missing values before cleaning:")
print(data.isnull().sum())

# List of columns to drop
columns_to_drop = [
    "ID", "Email", "Name", "Full Name (First, Middle, Last Names)",
    "Year Group", "Student ID Number", "Phone Number",
    "Are you interested in staying in Ashesi's halls of residence next academic year?",
    "What is the specific need?", "Please state any special accomodations you will need."
]

# Drops the specified columns
data_cleaned = data.drop(columns=columns_to_drop)

# Replaces NaN values with 0
data_cleaned = data_cleaned.fillna(0)

# Checks if there are any missing values after cleaning
print("\nMissing values after cleaning:")
print(data_cleaned.isnull().sum())

# Displays the cleaned dataset
print("\nCleaned Data Info:")
print(data_cleaned.head())


   ID                            Email                      Name  \
0   1      ababasa.roger@ashesi.edu.gh             Ababasa Roger   
1   2  abdoul.kandegomni@ashesi.edu.gh  Abdoul Nasser Kandégomni   
2   3    abdoulaye.garba@ashesi.edu.gh  Abdoulaye Moumouni Garba   
3   4    rahman.abubakar@ashesi.edu.gh     Abdul Rahman Abubakar   
4   5        abdul.salia@ashesi.edu.gh         Abdul-Mumin Salia   

  Are you interested in staying in Ashesi's halls of residence next academic year?  \
0                                                Yes                                 
1                                                Yes                                 
2                                                Yes                                 
3                                                Yes                                 
4                                                Yes                                 

  Full Name (First, Middle, Last Names) Gender     Year Group  \
0        

In [89]:
# Mapping for each column
gender_mapping = {'Male': 0, 'Female': 1}
financial_aid_mapping = {'Yes': 1, 'No': 0}
personality_mapping = {'I am introverted': 0, 'Somewhere in between': 1, 'I am extroverted': 2}
bedtime_mapping = {'10PM or earlier': 0, 'Between 10PM and Midnight (12AM)': 1, 'After Midnight (12AM)': 2}
study_location_mapping = {'In the room': 0, 'Away from the room': 1}
fun_activity_mapping = {'Relaxing/Sleeping': 0, 'Hanging out with friends': 1, 'Going for events on/off-campus': 2}
friends_over_mapping = {'Nope!': 0, 'Not really, but I don\'t mind': 1, 'Yup, all the time': 2, 'Yup, but not always': 3}
friends_description_mapping = {'Friendly': 0, 'Fun': 1, 'Responsible': 2, 'Chill': 3, 'Organized': 4}

# Applies the transformations
data_cleaned['Gender'] = data_cleaned['Gender'].replace(gender_mapping)
data_cleaned['Are you on any form of financial aid?'] = data_cleaned['Are you on any form of financial aid?'].replace(financial_aid_mapping)
data_cleaned['How would you describe your personality?'] = data_cleaned['How would you describe your personality?'].replace(personality_mapping)
data_cleaned['What time do you typically go to bed on weeknights?'] = data_cleaned['What time do you typically go to bed on weeknights?'].replace(bedtime_mapping)
data_cleaned['Where do you prefer to do homework/study?'] = data_cleaned['Where do you prefer to do homework/study?'].replace(study_location_mapping)
data_cleaned['What do you enjoy doing for fun?'] = data_cleaned['What do you enjoy doing for fun?'].replace(fun_activity_mapping)
data_cleaned['Do you enjoy having friends over in your room?'] = data_cleaned['Do you enjoy having friends over in your room?'].replace(friends_over_mapping)
data_cleaned['How would your friends describe you in one word?'] = data_cleaned['How would your friends describe you in one word?'].replace(friends_description_mapping)

# Handling any other unlisted options
data_cleaned['How would your friends describe you in one word?'] = data_cleaned['How would your friends describe you in one word?'].apply(
    lambda x: friends_description_mapping.get(x, 5)
)

# Displays the transformed data
print(data_cleaned.head())


   Gender  Are you on any form of financial aid?  \
0       0                                      1   
1       0                                      1   
2       0                                      1   
3       0                                      1   
4       0                                      1   

     What's your financial aid status?  \
0  Mastercard Foundation Scholar (MCF)   
1  Mastercard Foundation Scholar (MCF)   
2  Mastercard Foundation Scholar (MCF)   
3  Mastercard Foundation Scholar (MCF)   
4  Mastercard Foundation Scholar (MCF)   

  Do you have any special accessibility needs?  \
0                                           No   
1                                           No   
2                                           No   
3                                           No   
4                                           No   

   How would you describe your personality?  \
0                                         1   
1                                      

  data_cleaned['Gender'] = data_cleaned['Gender'].replace(gender_mapping)
  data_cleaned['Are you on any form of financial aid?'] = data_cleaned['Are you on any form of financial aid?'].replace(financial_aid_mapping)
  data_cleaned['How would you describe your personality?'] = data_cleaned['How would you describe your personality?'].replace(personality_mapping)
  data_cleaned['What time do you typically go to bed on weeknights?'] = data_cleaned['What time do you typically go to bed on weeknights?'].replace(bedtime_mapping)
  data_cleaned['Where do you prefer to do homework/study?'] = data_cleaned['Where do you prefer to do homework/study?'].replace(study_location_mapping)
  data_cleaned['What do you enjoy doing for fun?'] = data_cleaned['What do you enjoy doing for fun?'].replace(fun_activity_mapping)
  data_cleaned['Do you enjoy having friends over in your room?'] = data_cleaned['Do you enjoy having friends over in your room?'].replace(friends_over_mapping)


In [90]:
# Separates the data into males and females
males = data_cleaned[data_cleaned['Gender'] == 0]
females = data_cleaned[data_cleaned['Gender'] == 1]

# Displays the size of each
print("Number of males:", len(males))
print("Number of females:", len(females))

# Displays the first few rows of each
print("\nMales Group:")
print(males.head())

print("\nFemales Group:")
print(females.head())


Number of males: 222
Number of females: 305

Males Group:
   Gender  Are you on any form of financial aid?  \
0       0                                      1   
1       0                                      1   
2       0                                      1   
3       0                                      1   
4       0                                      1   

     What's your financial aid status?  \
0  Mastercard Foundation Scholar (MCF)   
1  Mastercard Foundation Scholar (MCF)   
2  Mastercard Foundation Scholar (MCF)   
3  Mastercard Foundation Scholar (MCF)   
4  Mastercard Foundation Scholar (MCF)   

  Do you have any special accessibility needs?  \
0                                           No   
1                                           No   
2                                           No   
3                                           No   
4                                           No   

   How would you describe your personality?  \
0                           

In [91]:
males_features = males[['Are you on any form of financial aid?', 'How would you describe your personality?',
                         'What time do you typically go to bed on weeknights?', 'Where do you prefer to do homework/study?',
                         'What do you enjoy doing for fun?', 'Do you enjoy having friends over in your room?',
                         'How would your friends describe you in one word?']]

# Normalizes the data to prevent high similarity
scaler = StandardScaler()
males_features_normalized = scaler.fit_transform(males_features)

# Calculates the cosine similarity matrix for males
similarity_matrix_males = cosine_similarity(males_features_normalized)

# Ensures no student is paired with themselves
np.fill_diagonal(similarity_matrix_males, 0)

groups = []

students = list(range(len(males)))

# Loops until all students are grouped
while students:
    group_size = np.random.randint(2, 5)

    # Makes sure the remaining students are enough for the group
    group_size = min(group_size, len(students))

    # Create a group from the most similar students
    group = []
    for _ in range(group_size):
        if not students:
            break

        # Gets the student with the highest similarity to any in the current group
        if not group:
            student = students.pop(np.random.randint(len(students)))
        else:
            # Calculates similarity to already chosen students and selects the highest
            similarities = np.sum(similarity_matrix_males[students][:, group], axis=1)
            student = students.pop(np.argmax(similarities))

        group.append(student)

    groups.append(group)

# Display the grouping results
print("\nMale Roommate Groups:")
for idx, group in enumerate(groups, start=1):
    room_name = f"Room M{idx}"
    students_in_room = [f"Student {males.index[student_id]}" for student_id in group]
    print(f"{room_name}: {' and '.join(students_in_room)}")



Male Roommate Groups:
Room M1: Student 451 and Student 55
Room M2: Student 318 and Student 1
Room M3: Student 146 and Student 287 and Student 388
Room M4: Student 373 and Student 0 and Student 103 and Student 106
Room M5: Student 74 and Student 126 and Student 145 and Student 154
Room M6: Student 241 and Student 234 and Student 235 and Student 262
Room M7: Student 239 and Student 162 and Student 174 and Student 185
Room M8: Student 504 and Student 85 and Student 363
Room M9: Student 438 and Student 271
Room M10: Student 333 and Student 66 and Student 227
Room M11: Student 5 and Student 508
Room M12: Student 144 and Student 110 and Student 237
Room M13: Student 132 and Student 114 and Student 184 and Student 383
Room M14: Student 82 and Student 112 and Student 130 and Student 242
Room M15: Student 490 and Student 190
Room M16: Student 403 and Student 143 and Student 220 and Student 382
Room M17: Student 351 and Student 113 and Student 344
Room M18: Student 270 and Student 142
Room M19:

In [92]:
females_features = females[['Are you on any form of financial aid?', 'How would you describe your personality?',
                             'What time do you typically go to bed on weeknights?', 'Where do you prefer to do homework/study?',
                             'What do you enjoy doing for fun?', 'Do you enjoy having friends over in your room?',
                             'How would your friends describe you in one word?']]

# Normalizes the data to prevent high similarity due to feature scale
scaler = StandardScaler()
females_features_normalized = scaler.fit_transform(females_features)

# Calculates the cosine similarity matrix for females
similarity_matrix_females = cosine_similarity(females_features_normalized)

# Ensures no student is paired with themselves
np.fill_diagonal(similarity_matrix_females, 0)

groups_females = []

students_females = list(range(len(females)))

while students_females:
    group_size = np.random.randint(2, 5)

    group_size = min(group_size, len(students_females))

    # Creates a group from the most similar students
    group = []
    for _ in range(group_size):
        if not students_females:
            break

        if not group:
            student = students_females.pop(np.random.randint(len(students_females)))
        else:
            # Calculates similarity to already chosen students and select the highest
            similarities = np.sum(similarity_matrix_females[students_females][:, group], axis=1)
            student = students_females.pop(np.argmax(similarities))

        group.append(student)

    groups_females.append(group)

# Display the grouping results
print("\nFemale Roommate Groups:")
for idx, group in enumerate(groups_females, start=1):
    room_name = f"Room F{idx}"
    students_in_room = [f"Student {females.index[student_id]}" for student_id in group]
    print(f"{room_name}: {' and '.join(students_in_room)}")



Female Roommate Groups:
Room F1: Student 317 and Student 9
Room F2: Student 218 and Student 104 and Student 118
Room F3: Student 40 and Student 180 and Student 503 and Student 221
Room F4: Student 406 and Student 39 and Student 172
Room F5: Student 347 and Student 128 and Student 159 and Student 179
Room F6: Student 350 and Student 60
Room F7: Student 88 and Student 116 and Student 119
Room F8: Student 188 and Student 62 and Student 224
Room F9: Student 105 and Student 268 and Student 295
Room F10: Student 360 and Student 421 and Student 63
Room F11: Student 294 and Student 32 and Student 95
Room F12: Student 187 and Student 408 and Student 69 and Student 392
Room F13: Student 440 and Student 248 and Student 419 and Student 431
Room F14: Student 522 and Student 517 and Student 518
Room F15: Student 181 and Student 213 and Student 258
Room F16: Student 31 and Student 18 and Student 96
Room F17: Student 45 and Student 412
Room F18: Student 6 and Student 19
Room F19: Student 264 and Stud