# Import Required Libraries

In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

# Load Data Function

In [3]:
def load_data(filepath):
    return pd.read_csv(filepath)

# Prepare Similarity Matrix

In [4]:
def prepare_similarity_matrix(df):
    # Convert categorical preferences to numerical values
    df['Smoking_Num'] = df['Okay with Smoking'].map({'Yes': 1, 'Maybe': 0.5, 'No': 0})
    df['Visitors_Num'] = df['Okay with Visitors'].map({'Yes': 1, 'Maybe': 0.5, 'No': 0})
    
    # Define features to use for compatibility matching
    features = ['Sleep Type', 'Cleanliness Importance', 'Social Type', 
                'Smoking_Num', 'Visitors_Num', 'Noise Tolerance', 'Annoying Habit']
    
    # Set up preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), 
                ['Sleep Type', 'Social Type', 'Annoying Habit']),
            ('num', MinMaxScaler(), 
                ['Cleanliness Importance', 'Smoking_Num', 'Visitors_Num', 'Noise Tolerance'])
        ])
    
    # Transform the data
    X = preprocessor.fit_transform(df[features])
    
    # Store preprocessor for new user input
    global user_preprocessor 
    user_preprocessor = preprocessor
    
    return cosine_similarity(X)

# Clean User Input

In [5]:
def clean_user_input(value, field_name):
    """Clean and standardize user input"""
    value = str(value).strip()
    
    # Handle case variations for specific fields
    if field_name in ['Gender', 'Sleep Type', 'Social Type', 'Okay with Smoking', 'Okay with Visitors']:
        value = value.lower().capitalize()
        if field_name == 'Okay with Smoking' and value == 'Yes':
            value = 'Yes'  # Handle 'YEs' -> 'Yes'
    
    # Handle numeric fields
    if field_name in ['Cleanliness Importance', 'Noise Tolerance']:
        try:
            value = int(value)
            if value < 1 or value > 5:
                print(f"Warning: {field_name} should be between 1-5. Using 3 as default.")
                value = 3
        except ValueError:
            print(f"Warning: Invalid number for {field_name}. Using 3 as default.")
            value = 3
    
    return value

# Collect User Preferences

In [6]:
def get_user_input():
    print("\nPlease enter your roommate preferences:")
    user_data = {
        'Name': ['User_Input'],
        'Gender': clean_user_input(input("Gender (Male/Female/Prefer not to say): "), 'Gender'),
        'Program of Study': input("Program of Study (e.g., B.Tech CSE): ").upper(),
        'Sleep Type': clean_user_input(input("Sleep Type (Early Bird/Night Owl): "), 'Sleep Type'),
        'Cleanliness Importance': clean_user_input(input("Cleanliness Importance (1-5, where 5 is most important): "), 'Cleanliness Importance'),
        'Social Type': clean_user_input(input("Social Type (Introverted/Extroverted/Ambiverted): "), 'Social Type'),
        'Okay with Smoking': clean_user_input(input("Okay with Smoking? (Yes/No/Maybe): "), 'Okay with Smoking'),
        'Okay with Visitors': clean_user_input(input("Okay with Visitors? (Yes/No/Maybe): "), 'Okay with Visitors'),
        'Noise Tolerance': clean_user_input(input("Noise Tolerance (1-5, where 1 is least tolerant): "), 'Noise Tolerance'),
        'Annoying Habit': input("Your annoying habit (e.g., Messy room, Loud music, etc.): ").title()
    }
    return pd.DataFrame(user_data)

# Find Compatible Roommates

In [7]:
def find_compatible_roommates(df, similarity_matrix, user_df=None, student_name=None, n=5, min_score=0.5):
    if user_df is not None:
        # Process user input along with existing data
        user_df['Smoking_Num'] = user_df['Okay with Smoking'].map({'Yes': 1, 'Maybe': 0.5, 'No': 0})
        user_df['Visitors_Num'] = user_df['Okay with Visitors'].map({'Yes': 1, 'Maybe': 0.5, 'No': 0})
        
        # Transform user input using the preprocessor
        features = ['Sleep Type', 'Cleanliness Importance', 'Social Type', 
                   'Smoking_Num', 'Visitors_Num', 'Noise Tolerance', 'Annoying Habit']
        user_X = user_preprocessor.transform(user_df[features])
        
        # Transform original data
        df['Smoking_Num'] = df['Okay with Smoking'].map({'Yes': 1, 'Maybe': 0.5, 'No': 0})
        df['Visitors_Num'] = df['Okay with Visitors'].map({'Yes': 1, 'Maybe': 0.5, 'No': 0})
        X = user_preprocessor.transform(df[features])
        
        # Calculate similarity between user and all students
        user_similarity = cosine_similarity(user_X, X)
        sim_scores = list(enumerate(user_similarity[0]))  # Get first (and only) row
    else:
        # Original functionality for student-student matching
        try:
            student_idx = df[df['Name'] == student_name].index[0]
            sim_scores = list(enumerate(similarity_matrix[student_idx]))
        except IndexError:
            print(f"Student {student_name} not found in dataset")
            return pd.DataFrame()
    
    # Filter by minimum score
    filtered_scores = [(i, score) for i, score in sim_scores if score >= min_score]
    
    # Sort by similarity score (descending)
    filtered_scores = sorted(filtered_scores, key=lambda x: x[1], reverse=True)
    
    # Get top n matches
    top_indices = [i[0] for i in filtered_scores[:n]]
    top_scores = [i[1] for i in filtered_scores[:n]]
    
    # Get student details for matches
    matches = df.iloc[top_indices].copy()
    matches['Compatibility_Score'] = top_scores
    matches['Match_Percentage'] = matches['Compatibility_Score'] * 100
    
    # Select relevant columns to display
    display_columns = ['Name', 'Gender', 'Program of Study', 'Sleep Type', 
                      'Cleanliness Importance', 'Social Type', 'Match_Percentage']
    
    return matches[display_columns]

# Main Program

In [8]:
def main():
    # Define your filename here
    data_filename = "roommate_compatibility_dataset.csv"
    
    try:
        # Load data
        df = load_data(data_filename)
        similarity_matrix = prepare_similarity_matrix(df)
        
        print("=== Roommate Compatibility Predictor ===")
        print("Choose an option:")
        print("1. Find matches for an existing student")
        print("2. Enter your own profile to find matches")
        
        choice = input("Enter your choice (1 or 2): ").strip()
        
        if choice == '1':
            student_name = input("Enter student name (e.g., Student_1): ").strip()
            print(f"\nTop 5 compatible roommates for {student_name}:")
            matches = find_compatible_roommates(df, similarity_matrix, student_name=student_name)
            print(matches.to_string(index=False))
        elif choice == '2':
            user_df = get_user_input()
            print("\nFinding compatible roommates for you...")
            matches = find_compatible_roommates(df, similarity_matrix, user_df=user_df)
            if not matches.empty:
                print("\nYour Top Compatible Roommates:")
                print(matches.to_string(index=False))
            else:
                print("No compatible roommates found with the current criteria.")
        else:
            print("Invalid choice. Please enter 1 or 2.")
    except FileNotFoundError:
        print(f"Error: Could not find data file '{data_filename}'. Please make sure it exists.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

    print("\nSimilarity Matrix (First 5x5):\n", similarity_matrix[:5, :5])  # Preview top 5x5
if __name__ == "__main__":
    main()

=== Roommate Compatibility Predictor ===
Choose an option:
1. Find matches for an existing student
2. Enter your own profile to find matches

Please enter your roommate preferences:

Finding compatible roommates for you...

Your Top Compatible Roommates:
       Name            Gender Program of Study Sleep Type  Cleanliness Importance Social Type  Match_Percentage
 Student_45 Prefer not to say        B.Tech ME  Night Owl                       5 Introverted         76.121893
 Student_88            Female       B.Tech ECE Early Bird                       5 Introverted         76.121893
Student_109              Male       B.Tech CSE Early Bird                       5 Introverted         75.592895
 Student_66              Male       B.Tech ECE Early Bird                       4 Introverted         74.466871
 Student_92            Female       B.Tech CSE Early Bird                       5 Introverted         74.466871

Similarity Matrix (First 5x5):
 [[1.         0.32732684 0.38888889 0.227