In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the raw product table from the CSV file next to the notebook
df = pd.read_csv(filepath_or_buffer='data.csv')

In [3]:
# Keep only the columns that feed the similarity model:
# two categorical ones followed by two numeric ones.
features = df.loc[:, ['college', 'Sub Category', 'rating', 'final_price']]

In [4]:
# Expand the two categorical columns into a dense indicator matrix.
# The fitted encoder is kept so the category order can be inspected later.
encoder = OneHotEncoder()
encoder.fit(features[['college', 'Sub Category']])
encoded = encoder.transform(features[['college', 'Sub Category']]).toarray()

In [5]:
# Build the feature matrix: one-hot columns first, then the two numeric
# columns on the right. Rows keep the same index as df.
numerical = features[['rating', 'final_price']].to_numpy()
X = pd.DataFrame(np.concatenate([encoded, numerical], axis=1), index=df.index)

In [6]:
# Standardise the last two columns of X (the numeric features appended
# after the one-hot block); the one-hot columns are left as they are.
# Optional, but recommended.
scaler = StandardScaler().fit(X.iloc[:, -2:])
X.iloc[:, -2:] = scaler.transform(X.iloc[:, -2:])

In [7]:
# Cosine similarity of every row of X against every other row of X
# (Y=None means "compare X with itself"); the result is square.
similarity_matrix = cosine_similarity(X, Y=None)

def recommend(product_id, top_n=5):
    """Return the products most similar to ``product_id``.

    Reads two module-level objects: ``df`` (the product table) and
    ``similarity_matrix`` (a square matrix of pairwise scores). Row ``i`` of
    ``similarity_matrix`` is assumed to describe the ``i``-th row of ``df``
    by position.

    Parameters
    ----------
    product_id
        Value to look up in ``df['id']``. If several rows share the id, the
        first one is used.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of dict
        One record per recommended product with the keys ``id``, ``product``,
        ``Sub Category``, ``college``, ``rating`` and ``final_price``, ordered
        from most to least similar. Empty when ``product_id`` is not present
        or ``top_n`` is not positive.
    """
    # Locate the product by *position*, not by index label: both
    # similarity_matrix and df.iloc below are positional, so a label would be
    # wrong whenever df does not carry the default 0..n-1 index.
    matches = np.flatnonzero((df['id'] == product_id).to_numpy())
    if matches.size == 0 or top_n <= 0:
        return []
    product_pos = int(matches[0])

    scores = np.asarray(similarity_matrix[product_pos], dtype=float)

    # Stable descending order, so ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried product explicitly. Skipping "the first entry" is not
    # enough: a row with identical features ties at the top score and may sort
    # ahead of the product itself.
    ranked = ranked[ranked != product_pos][:top_n]

    columns = ['id', 'product', 'Sub Category', 'college', 'rating', 'final_price']
    return df.iloc[ranked][columns].to_dict(orient='records')

In [8]:
# Example: five most similar products to the one whose id is 0
recommend(product_id=0, top_n=5)

[{'id': 958,
  'product': 'Glasses',
  'Sub Category': 'cleaning supplies',
  'college': 'laboratory  tools',
  'rating': 3.9,
  'final_price': 3.815},
 {'id': 5517,
  'product': 'Glasses',
  'Sub Category': 'cleaning supplies',
  'college': 'laboratory  tools',
  'rating': 4.0,
  'final_price': 3.815},
 {'id': 9122,
  'product': 'Glasses',
  'Sub Category': 'cleaning supplies',
  'college': 'laboratory  tools',
  'rating': 4.0,
  'final_price': 3.815},
 {'id': 1282,
  'product': 'towel',
  'Sub Category': 'cleaning supplies',
  'college': 'laboratory  tools',
  'rating': 4.0,
  'final_price': 19.2},
 {'id': 4608,
  'product': 'towel',
  'Sub Category': 'cleaning supplies',
  'college': 'laboratory  tools',
  'rating': 4.0,
  'final_price': 19.2}]

In [9]:
import joblib

# Persist the product table and the similarity matrix so they can be
# reloaded later without recomputing them.
joblib.dump(value=df, filename='products.pkl')
joblib.dump(value=similarity_matrix, filename='similarity.pkl')

['similarity.pkl']