Request

- User ID
- User searching country
- expecting shops count as result


Response

- listed shops id arrays

User   

userID  
Q1={'Beauty & Wellness', 'Events & Spaces'}
Q2={'Salons', 'Spa'}
Q3={'Hair cut', 'Nail Care'}
Q4=18-24
Q5=male
Q6=2000-5000
Q7=colombo


User Behavior  

ShopID  
userID  
Booking_count  
Average_Booking_cost  


Shop  

Shop ID  
ShopName  
country 
Location_name  
Location_long  
Location_lat  
Description 
Shop_categories (beauty & wellness, ...) 
Shop_categories (spa, saloon)   
Available_package_types (hair care, nail care)
Total_Paid_Ads_Count_Current_Month  
Total_Reviews_Count_Current_Month  
Total_Bookings_Count_Current_Month  
  

Shop Behaviors  

Shop_ID  
Total_Paid_Ads_Count_Current_Month  
Total_Reviews_Count_Current_Month  
Total_Bookings_Count_Current_Month

In [3]:
import pandas as pd


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors

In [4]:
user_data = pd.read_excel('Users_Data.xlsx')
user_data.head()

Unnamed: 0,UserID,Q1,Q2,Q3,Q4,Q6,Q7,Q8,Q10
0,1,{'Education & Experts'},{'Therapy'},25-34,Male,"{'Skin Care', 'Nail Care', 'Hair cut'}","LKR 5,000 - LKR 10,000",Galle,{'Other'}
1,2,{'In-Demand Ser'},{'Therapy'},45-54,Male,"{'Nail Care', 'Hair cut'}","LKR 2,000 - LKR 5,000",Negombo,{'Keratin or protein treatments'}
2,3,{'Fashion & Style'},{'Cosmetic Services'},55+,Female,"{'Nail Care', 'Hair cut'}","LKR 5,000 - LKR 10,000",Jaffna,{'Scalp treatments and massages'}
3,4,{'Fashion & Style'},{'Cosmetic Services'},35-44,Female,"{'Skin Care', 'Nail Care', 'Hair cut'}","LKR 10,000 - LKR 15,000",Nugegoda,{'Other'}
4,5,{'Education & Experts'},{'Therapy'},25-34,Male,{'Skin Care'},"LKR 20,000+",Kandy,"{'Hair colouring (e.g., balayage, highlights)'..."


### Handle Array-like Columns:

In [5]:
mlb = MultiLabelBinarizer()

Q1_binarized = pd.DataFrame(mlb.fit_transform(user_data['Q1']), columns=mlb.classes_, index=user_data.index)
Q2_binarized = pd.DataFrame(mlb.fit_transform(user_data['Q2']), columns=mlb.classes_, index=user_data.index)
Q6_binarized = pd.DataFrame(mlb.fit_transform(user_data['Q6']), columns=mlb.classes_, index=user_data.index)
Q10_binarized = pd.DataFrame(mlb.fit_transform(user_data['Q10']), columns=mlb.classes_, index=user_data.index)
# Add binarized data to user_data
user_data = pd.concat([user_data, Q1_binarized, Q2_binarized,Q6_binarized,Q10_binarized], axis=1)


In [6]:
user_data['Q3'] = user_data['Q3'].astype('category').cat.codes
user_data['Q4'] = user_data['Q4'].astype('category').cat.codes
user_data['Q7'] = user_data['Q7'].astype('category').cat.codes
user_data['Q8'] = user_data['Q8'].astype('category').cat.codes

In [7]:
user_data = user_data.drop(['Q1', 'Q2', 'Q6', 'Q10'], axis=1)

In [8]:
user_data.head()

Unnamed: 0,UserID,Q3,Q4,Q7,Q8,Unnamed: 6,&,',",",-,...,o,p,r,s,t,u,w,y,{,}
0,1,1,1,4,2,1,1,1,0,0,...,0,0,1,0,1,0,0,0,1,1
1,2,3,1,2,6,1,0,1,0,1,...,1,1,1,1,1,0,0,0,1,1
2,3,4,0,4,3,1,1,1,0,0,...,0,1,1,1,1,0,0,0,1,1
3,4,2,0,0,7,1,1,1,0,0,...,0,0,1,0,1,0,0,0,1,1
4,5,1,1,3,4,1,1,1,0,0,...,1,0,1,1,1,1,1,1,1,1


In [9]:
user_data.shape


(100, 119)

### Define feature columns

In [10]:
X = user_data.drop(['UserID'], axis=1)  # Features
user_ids = user_data['UserID']  # User IDs


In [11]:
X.shape

(100, 118)

In [12]:
print("Training Features (X):", X.columns.tolist())

Training Features (X): ['Q3', 'Q4', 'Q7', 'Q8', ' ', '&', "'", ',', '-', 'B', 'D', 'E', 'F', 'H', 'I', 'M', 'S', 'T', 'W', 'a', 'c', 'd', 'e', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'x', 'y', '{', '}', ' ', "'", ',', 'B', 'C', 'F', 'S', 'T', 'a', 'c', 'd', 'e', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'v', 'y', '{', '}', ' ', "'", ',', 'C', 'H', 'N', 'S', 'a', 'c', 'e', 'i', 'k', 'l', 'n', 'r', 't', 'u', '{', '}', ' ', "'", '(', ')', ',', '.', 'B', 'C', 'H', 'K', 'O', 'S', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'y', '{', '}']


In [13]:
print("Duplicate columns in X:", X.columns[X.columns.duplicated()].tolist())

Duplicate columns in X: [' ', "'", ',', 'B', 'F', 'S', 'T', 'a', 'c', 'd', 'e', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'v', 'y', '{', '}', ' ', "'", ',', 'C', 'H', 'S', 'a', 'c', 'e', 'i', 'l', 'n', 'r', 't', 'u', '{', '}', ' ', "'", ',', 'B', 'C', 'H', 'S', 'a', 'c', 'd', 'e', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'y', '{', '}']


In [14]:
X = X.loc[:, ~X.columns.duplicated()]

In [15]:
print("Duplicate columns in X (after deduplication):", X.columns[X.columns.duplicated()].tolist())

Duplicate columns in X (after deduplication): []


In [16]:
X.shape

(100, 50)

In [17]:
knn_model = NearestNeighbors(n_neighbors=10, metric='cosine')
knn_model.fit(X)


In [34]:
new_user_data = {
    'Q1': {'Beauty & Wellness', 'Travel & Stay'},
    'Q2': {'Spa', 'Salon'},
    'Q3': '18-24',
    'Q4': 'Female',
    'Q6' : {'Skin Care', 'Nail Care', 'Hair cut'},
    'Q7': 'LKR 10,000 - LKR 15,000',
    'Q8': 'Galle',
    'Q10': {'Classic manicures and pedicures'}
}

new_user = pd.DataFrame([new_user_data])

Q1_binarized = pd.DataFrame(mlb.transform(new_user['Q1']), columns=mlb.classes_)
Q2_binarized = pd.DataFrame(mlb.transform(new_user['Q2']), columns=mlb.classes_)
Q6_binarized = pd.DataFrame(mlb.transform(new_user['Q6']), columns=mlb.classes_)
Q10_binarized = pd.DataFrame(mlb.transform(new_user['Q10']), columns=mlb.classes_)

new_user = pd.concat([new_user, Q1_binarized, Q2_binarized,Q6_binarized,Q10_binarized], axis=1)

# Convert to the same format as training data

new_user['Q3'] = new_user['Q3'].astype('category').cat.codes
new_user['Q4'] = new_user['Q4'].astype('category').cat.codes
new_user['Q7'] = new_user['Q7'].astype('category').cat.codes
new_user['Q8'] = new_user['Q8'].astype('category').cat.codes



# Drop original columns after encoding
new_user = new_user.drop(['Q1', 'Q2', 'Q6', 'Q10'], axis=1)





In [35]:
print("New User Features:", new_user.columns.tolist())

New User Features: ['Q3', 'Q4', 'Q7', 'Q8', ' ', "'", '(', ')', ',', '.', 'B', 'C', 'H', 'K', 'O', 'S', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'y', '{', '}', ' ', "'", '(', ')', ',', '.', 'B', 'C', 'H', 'K', 'O', 'S', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'y', '{', '}', ' ', "'", '(', ')', ',', '.', 'B', 'C', 'H', 'K', 'O', 'S', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'y', '{', '}', ' ', "'", '(', ')', ',', '.', 'B', 'C', 'H', 'K', 'O', 'S', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'y', '{', '}']


In [36]:
# Check for duplicates in new user data columns
print("Duplicate columns in new_user:", new_user.columns[new_user.columns.duplicated()].tolist())


Duplicate columns in new_user: [' ', "'", '(', ')', ',', '.', 'B', 'C', 'H', 'K', 'O', 'S', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'y', '{', '}', ' ', "'", '(', ')', ',', '.', 'B', 'C', 'H', 'K', 'O', 'S', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'y', '{', '}', ' ', "'", '(', ')', ',', '.', 'B', 'C', 'H', 'K', 'O', 'S', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'y', '{', '}']


In [37]:

new_user = new_user.loc[:, ~new_user.columns.duplicated()]


In [38]:
print("Duplicate columns in new_user (after deduplication):", new_user.columns[new_user.columns.duplicated()].tolist())

Duplicate columns in new_user (after deduplication): []


In [39]:
new_user.shape

(1, 38)

In [40]:
print("Columns in X:", X.columns.tolist())
print("Columns in new_user:", new_user.columns.tolist())


Columns in X: ['Q3', 'Q4', 'Q7', 'Q8', ' ', '&', "'", ',', '-', 'B', 'D', 'E', 'F', 'H', 'I', 'M', 'S', 'T', 'W', 'a', 'c', 'd', 'e', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'x', 'y', '{', '}', 'C', 'N', 'k', '(', ')', '.', 'K', 'O', 'b', 'g', 'w']
Columns in new_user: ['Q3', 'Q4', 'Q7', 'Q8', ' ', "'", '(', ')', ',', '.', 'B', 'C', 'H', 'K', 'O', 'S', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'y', '{', '}']


In [41]:
print("Number of features in X:", X.shape[1])
print("Columns in X:", X.columns.tolist())


Number of features in X: 50
Columns in X: ['Q3', 'Q4', 'Q7', 'Q8', ' ', '&', "'", ',', '-', 'B', 'D', 'E', 'F', 'H', 'I', 'M', 'S', 'T', 'W', 'a', 'c', 'd', 'e', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'x', 'y', '{', '}', 'C', 'N', 'k', '(', ')', '.', 'K', 'O', 'b', 'g', 'w']


In [42]:
print("Number of features in new_user:", new_user.shape[1])
print("Columns in new_user:", new_user.columns.tolist())


Number of features in new_user: 38
Columns in new_user: ['Q3', 'Q4', 'Q7', 'Q8', ' ', "'", '(', ')', ',', '.', 'B', 'C', 'H', 'K', 'O', 'S', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'y', '{', '}']


In [43]:
new_user = new_user.reindex(columns=X.columns, fill_value=0)


In [44]:
print("Number of features in new_user (after reindex):", new_user.shape[1])
print("Columns in new_user (after reindex):", new_user.columns.tolist())


Number of features in new_user (after reindex): 50
Columns in new_user (after reindex): ['Q3', 'Q4', 'Q7', 'Q8', ' ', '&', "'", ',', '-', 'B', 'D', 'E', 'F', 'H', 'I', 'M', 'S', 'T', 'W', 'a', 'c', 'd', 'e', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'x', 'y', '{', '}', 'C', 'N', 'k', '(', ')', '.', 'K', 'O', 'b', 'g', 'w']


In [45]:
print("Number of features used to fit knn_model:", knn_model.n_features_in_)


Number of features used to fit knn_model: 50


In [46]:
new_user_array = new_user.values.reshape(1, -1)


In [47]:
new_user_array.shape

(1, 50)

In [49]:
distances, indices = knn_model.kneighbors(new_user_array)

# Retrieve similar user IDs
similar_users = user_ids.iloc[indices[0]].values
print("Similar Users:", similar_users)


Similar Users: [70 71 67 68 69 65 74 73 63 72]


