In [6]:
# Colab-only step: upload the input file into the runtime's working directory.
# The load cell that follows opens it by the name "mock_profiles.json".
from google.colab import files
uploaded = files.upload()

Saving mock_profiles.json to mock_profiles.json


In [7]:
import json
import pandas as pd

# Read the uploaded profiles file and turn the parsed records into a DataFrame.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding.
# NOTE(review): assumes the JSON top level is a list of profile records — confirm.
with open("mock_profiles.json", "r", encoding="utf-8") as f:
  data = json.load(f)

df = pd.DataFrame(data)

In [8]:
# Expand the nested "personality" dicts into their own columns and append them
# to the profile table in place of the original column. The "profilePicture",
# "completedAt" and "timestamp" columns are dropped at the same time.
# NOTE: `df` is rebound here, so re-running this cell on its own raises KeyError
# ("personality" is already gone) — re-run the load cell first.
personality_df = pd.json_normalize(df["personality"])
df = pd.concat(
  [
    df.drop(["personality", "profilePicture", "completedAt", "timestamp"], axis="columns"),
    personality_df,
  ],
  axis="columns",
)

In [9]:
# Force the age column to an integer dtype, then preview the first rows.
df = df.astype({"age": int})

df.head()

Unnamed: 0,age,cleanliness,currentCity,drinking,floorPreference,foodHabits,guestPolicy,lightPreference,onboarded,preferredCity,...,roomType,sleepSchedule,smoking,verifiedOnly,windowPreference,agreeableness,conscientiousness,extroversion,neuroticism,openness
0,22,very-clean,delhi,occasionally,low,non-vegetarian,occasional,bright,True,bangalore,...,twin-sharing,flexible,regularly,True,lots-of-windows,3,3,3,3,3
1,25,moderately-clean,mumbai,never,high,vegetarian,frequent,bright,True,pune,...,twin-sharing,night-owl,never,True,lots-of-windows,3,3,3,3,3
2,23,very-clean,bangalore,regularly,mid,non-vegetarian,occasional,bright,True,delhi,...,twin-sharing,flexible,occasionally,True,lots-of-windows,3,3,3,3,3
3,21,moderately-clean,chennai,occasionally,low,vegan,rare,bright,True,mumbai,...,twin-sharing,early-riser,never,True,lots-of-windows,3,3,3,3,3
4,24,very-clean,pune,occasionally,high,non-vegetarian,frequent,bright,True,delhi,...,twin-sharing,night-owl,regularly,True,lots-of-windows,3,3,3,3,3


In [10]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [11]:
# Columns holding string labels; the preprocessor one-hot encodes these.
categorical_features = [
  'cleanliness',
  'currentCity',
  'drinking',
  'floorPreference',
  'foodHabits',
  'guestPolicy',
  'lightPreference',
  'preferredCity',
  'profession',
  'roomType',
  'sleepSchedule',
  'smoking',
  'windowPreference',
]

# Columns holding numbers (age plus the five personality traits); the
# preprocessor rescales these.
numeric_features = [
  'age',
  'agreeableness',
  'conscientiousness',
  'extroversion',
  'neuroticism',
  'openness',
]

In [12]:
# Feature encoder: categorical columns become dense one-hot indicator columns
# (handle_unknown="ignore" keeps transform from raising on a category that was
# not seen during fit), numeric columns go through MinMaxScaler. Columns not
# listed in either group are not passed to any transformer.
preprocessor = ColumnTransformer(
  transformers=[
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features),
    ("num", MinMaxScaler(), numeric_features),
  ]
)

In [13]:
# Fit the encoder/scaler on the full profile table and encode it in one step;
# X has one row per profile and one column per encoded feature.
X = preprocessor.fit_transform(df)

In [14]:
# Names of the encoded output columns, in the same order as the columns of X;
# used below to decide which columns get a non-default weight.
encoded_feature_names = preprocessor.get_feature_names_out()

In [16]:
import numpy as np
# One weight per encoded column, all starting at the neutral value 1.0.
feature_weights = np.ones(X.shape[1])

In [17]:
# Importance multipliers keyed by the source feature an encoded column comes
# from. Rules are checked in order and the first one whose keyword appears in
# the encoded column name wins; columns matching no rule keep their weight.
weight_rules = [
  (("guestPolicy", "sleepSchedule"), 1.5),
  (("roomType", "smoking"), 1.2),
  (("profession",), 0.7),
]

for column_index, column_name in enumerate(encoded_feature_names):
  for keywords, weight in weight_rules:
    if any(keyword in column_name for keyword in keywords):
      feature_weights[column_index] = weight
      break


In [18]:
# Diagonal matrix of the per-column weights: right-multiplying an encoded
# matrix by it scales each column by its weight.
weight_matrix = np.diag(feature_weights)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
def match_user(user_dict, top_k=3):
  """Return the stored profiles most similar to ``user_dict``.

  Candidates come from the module-level ``df`` after two hard filters:

  * a user whose ``smoking`` is ``"never"`` is not offered profiles whose
    ``smoking`` is ``"regularly"``;
  * when the user's ``roomType`` is truthy, only profiles with the same
    ``roomType`` are kept.

  The surviving candidates and the user are encoded with the fitted
  ``preprocessor``, scaled column-wise by ``weight_matrix`` and ranked by
  cosine similarity.

  Args:
    user_dict: Profile with the same fields as the stored profiles, including
      a nested ``"personality"`` dict of trait scores.
    top_k: Maximum number of matches to return.

  Returns:
    A ``(matches, scores)`` tuple. ``matches`` is a DataFrame of the matched
    profiles, best first, with a fresh 0..n-1 index; ``scores`` is a NumPy
    array of the corresponding cosine similarities. When no candidate survives
    the filters, ``matches`` is an empty DataFrame (so ``matches.empty`` is
    safe to test) and ``scores`` is an empty array.

  Raises:
    KeyError: If ``user_dict`` lacks ``"personality"``, ``"age"``,
      ``"smoking"`` or ``"roomType"``.
  """
  # Flatten the nested personality traits into top-level columns so the user
  # row has the same layout as the table the preprocessor was fitted on.
  user_personality = pd.json_normalize([user_dict["personality"]])
  user_df = pd.concat(
      [pd.DataFrame([user_dict]).drop(columns=["personality"]), user_personality],
      axis=1,
  )
  user_df["age"] = user_df["age"].astype(int)

  # Boolean indexing below returns new frames and nothing here mutates `df`,
  # so no defensive copy is needed.
  candidates = df
  # NOTE(review): this filter is one-directional — a user who smokes regularly
  # can still be matched with profiles whose smoking is "never".
  if user_dict["smoking"] == "never":
    candidates = candidates[candidates["smoking"] != "regularly"]
  if user_dict["roomType"]:
    candidates = candidates[candidates["roomType"] == user_dict["roomType"]]

  if candidates.empty:
    # Keep the return types identical to the non-empty path; returning plain
    # lists here made callers that check `matches.empty` raise AttributeError.
    return candidates.reset_index(drop=True), np.array([])

  weighted_candidates = preprocessor.transform(candidates) @ weight_matrix
  weighted_user = preprocessor.transform(user_df) @ weight_matrix
  sims = cosine_similarity(weighted_user, weighted_candidates)[0]
  # Positions of the highest similarities, best first.
  top_indices = np.argsort(sims)[::-1][:top_k]

  return candidates.iloc[top_indices].reset_index(drop=True), sims[top_indices]



In [23]:
# Example profile used to exercise match_user: flat lifestyle preferences plus
# a nested "personality" dict of trait scores.
sample_user = {
  "age": 23,
  "cleanliness": "moderately-clean",
  "currentCity": "delhi",
  "drinking": "occasionally",
  "floorPreference": "low",
  "foodHabits": "non-vegetarian",
  "guestPolicy": "occasional",
  "lightPreference": "bright",
  "preferredCity": "bangalore",
  "profession": "developer",
  "roomType": "twin-sharing",
  "sleepSchedule": "flexible",
  "smoking": "occasionally",
  "windowPreference": "lots-of-windows",
  "personality": {
    "agreeableness": 3,
    "conscientiousness": 3,
    "extroversion": 3,
    "neuroticism": 3,
    "openness": 3,
  },
}

# Run the matcher for the sample profile and print a short summary of each hit.
matches, scores = match_user(sample_user)

if matches.empty:
  print("No compatible matches found after filtering.")
else:
  print("Top Matches:\n")
  summary_columns = ["currentCity", "profession", "guestPolicy", "sleepSchedule"]
  for position, score in zip(range(len(matches)), scores):
    print(f"Match {position+1}:")
    print(matches.iloc[position][summary_columns])
    # Similarity is reported as a percentage rounded to two decimals.
    print(f"Match Score: {round(score * 100, 2)}%\n")

Top Matches:

Match 1:
currentCity           delhi
profession        developer
guestPolicy      occasional
sleepSchedule      flexible
Name: 0, dtype: object
Match Score: 84.61%

Match 2:
currentCity       ahmedabad
profession          teacher
guestPolicy      occasional
sleepSchedule      flexible
Name: 1, dtype: object
Match Score: 69.09%

Match 3:
currentCity       bangalore
profession           artist
guestPolicy      occasional
sleepSchedule      flexible
Name: 2, dtype: object
Match Score: 65.58%



In [24]:
import joblib
# Persist the fitted preprocessor to disk.
# NOTE(review): only the preprocessor is saved. match_user also reads the
# profile table `df` and `weight_matrix`, so this file alone is not enough to
# reproduce the matching elsewhere — confirm how those are shipped.
joblib.dump(preprocessor, "match_pipeline.pkl")

['match_pipeline.pkl']