In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import ast

In [5]:
df = pd.read_csv("preprocessed_50user_data.csv")

In [6]:
def preprocess_user_vectors_with_weights(df):
    # Activities
    activity_cols = ['Basketball', 'Yoga', 'Hiking', 'Cycling', 'Gym', 
                     'Swimming', 'Dancing', 'Running', 'Music', 'Photography']
    activity_matrix = df[activity_cols].values.astype(float)

    # Availability (convert to numeric)
    df['Availability_Vector'] = df['Availability_Vector'].apply(ast.literal_eval)
    availability_matrix = np.array(df['Availability_Vector'].tolist()).astype(float)    

    # Languages
    all_languages = sorted({lang for sublist in df['Language_IDs'] for lang in sublist})
    language_matrix = np.zeros((len(df), len(all_languages)))
    for i, langs in enumerate(df['Language_IDs']):
        for lang in langs:
            language_matrix[i, all_languages.index(lang)] = 1
    language_matrix = language_matrix.astype(float)

    # One-hot encode Location_ID
    location_encoder = OneHotEncoder(sparse_output=False)
    location_matrix = location_encoder.fit_transform(df[['Location_ID']]).astype(float)

    # User Score
    user_score_scaled = MinMaxScaler().fit_transform(df[['User Score']]).astype(float)

    # One-hot encode Nationality_ID
    nationality_encoder = OneHotEncoder(sparse_output=False)
    nationality_matrix = nationality_encoder.fit_transform(df[['Nationality_ID']]).astype(float)

    # Gender
    gender_matrix = df[['Gender']].values.astype(float)

    # Combine everything with weights
    combined_features = np.hstack([
        activity_matrix * 3.0,
        availability_matrix * 2.0,
        language_matrix * 2.0,
        location_matrix * 1.5,
        user_score_scaled * 1.5,
        nationality_matrix * 0.5,
        gender_matrix * 0.5
    ])

    return combined_features, df['User_ID']


In [7]:
# Generate vectors and similarity matrix
user_vectors, user_ids = preprocess_user_vectors_with_weights(df)
similarity_matrix = cosine_similarity(user_vectors)

# Find Top 5 Matches per user
top_matches = {}
for idx, user_id in enumerate(user_ids):
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_users = [(int(user_ids[i]), score) for i, score in sim_scores[1:6]]  # skip self
    top_matches[int(user_id)] = top_users

In [8]:
recommendations_df = pd.DataFrame([
    {"User_ID": user, "Top_Match_ID": match[0], "Similarity_Score": match[1]}
    for user, matches in top_matches.items() for match in matches
])

def label_match(score):
    if score >= 0.85:
        return "Excellent"
    elif score >= 0.70:
        return "Good"
    elif score >= 0.50:
        return "Fair"
    else:
        return "Weak"

recommendations_df["Match_Strength"] = recommendations_df["Similarity_Score"].apply(label_match)


recommendations_df.head(10)  # Show first 10 rows

Unnamed: 0,User_ID,Top_Match_ID,Similarity_Score,Match_Strength
0,1,6,0.834976,Good
1,1,18,0.76404,Good
2,1,22,0.733683,Good
3,1,27,0.731827,Good
4,1,15,0.720962,Good
5,2,12,0.848481,Good
6,2,49,0.813737,Good
7,2,21,0.812575,Good
8,2,38,0.758286,Good
9,2,9,0.755964,Good


In [9]:
recommendations_df.to_csv("top5_user_recommendations.csv", index=False)