<a href="https://colab.research.google.com/github/ShaikMeheqKausar/Y22CD158/blob/main/pd3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load dataset (the file is read without a header row, so columns are positional)
df = pd.read_csv('jewelry.csv', header=None)

# Rename key columns.
# NOTE(review): the positions below are assumed from the dataset layout; confirm
# them against jewelry.csv, since a wrong position silently mislabels a column.
df = df.rename(columns={
    1: 'user_id',
    2: 'product_id',
    5: 'category',
    7: 'price',
    10: 'color',
    11: 'material',
    12: 'gem'
})

# Keep only relevant columns, then drop rows lacking a user or product and
# exact duplicate rows. Every step returns a new frame: calling dropna /
# drop_duplicates with inplace=True on the column slice is what triggers
# pandas' SettingWithCopyWarning.
df = (
    df[['user_id', 'product_id', 'category', 'price', 'color', 'material', 'gem']]
    .dropna(subset=['user_id', 'product_id'])
    .drop_duplicates()
)

# Create user-item interaction matrix (rows: users, columns: products).
# pivot_table aggregates with the mean by default, so repeated (user, product)
# pairs still produce 1; pairs that never occur are filled with 0.
interaction_matrix = df.assign(interaction=1).pivot_table(
    index='user_id', columns='product_id', values='interaction', fill_value=0
)

# Compute user-user similarity matrix, labelled by user_id on both axes
user_similarity = cosine_similarity(interaction_matrix)
user_sim_df = pd.DataFrame(user_similarity, index=interaction_matrix.index, columns=interaction_matrix.index)

# Create content metadata for each product: the four descriptive fields joined
# by spaces, with missing values treated as empty strings
df['content'] = df[['category', 'color', 'material', 'gem']].fillna('').agg(' '.join, axis=1)
# One text profile per product: the content of its first row in df
product_profiles = df.groupby('product_id')['content'].first()

# TF-IDF vectorization of product content (row i describes product_profiles.index[i])
tfidf = TfidfVectorizer()
product_tfidf = tfidf.fit_transform(product_profiles)

# Function: get collaborative filtering scores
def get_cf_scores(user_id, top_k=5):
    """Score every product for a user from the interactions of similar users.

    Args:
        user_id: Label looked up in ``interaction_matrix.index``.
        top_k: Maximum number of most-similar *other* users used as neighbours.

    Returns:
        pd.Series indexed by the columns of ``interaction_matrix`` holding the
        similarity-weighted average of the neighbours' interaction values.
        An empty float Series is returned for an unknown user; an all-zero
        Series is returned when no neighbour has positive total similarity.
    """
    if user_id not in interaction_matrix.index:
        return pd.Series(dtype=float)

    # Remove the user by label instead of slicing off the first sorted entry:
    # another user with similarity 1.0 can sort ahead of the user themselves,
    # in which case positional slicing keeps the user as their own neighbour.
    neighbours = user_sim_df[user_id].drop(labels=user_id).nlargest(top_k)

    total_similarity = neighbours.sum()
    # Without this guard a user who shares nothing with anyone divides by
    # zero and every score becomes NaN.
    if not total_similarity > 0:
        return pd.Series(0.0, index=interaction_matrix.columns)

    weighted_scores = interaction_matrix.loc[neighbours.index].T.dot(neighbours)
    return weighted_scores / total_similarity

# Function: get content-based scores
def get_cb_scores(user_id):
    """Score every product by content similarity to what the user interacted with.

    The user's profile is the mean TF-IDF vector of the products with a
    positive entry in their ``interaction_matrix`` row; each product's score is
    the cosine similarity between that profile and the product's TF-IDF row.

    Args:
        user_id: Label looked up in ``interaction_matrix.index``.

    Returns:
        pd.Series of scores indexed like ``product_profiles``. An empty float
        Series is returned when the user is unknown or none of their products
        appear in ``product_profiles``.
    """
    if user_id not in interaction_matrix.index:
        return pd.Series(dtype=float)

    user_row = interaction_matrix.loc[user_id]
    liked_products = user_row[user_row > 0].index

    # Row positions of the liked products inside product_tfidf; get_indexer
    # returns -1 for labels that are not in product_profiles, so drop those.
    liked_positions = product_profiles.index.get_indexer(liked_products)
    liked_positions = liked_positions[liked_positions >= 0]
    if len(liked_positions) == 0:
        return pd.Series(dtype=float)

    # The mean of a scipy sparse matrix is an np.matrix, which scikit-learn
    # (1.2 and later) rejects with a TypeError; hand it a plain 2-D ndarray.
    user_profile = np.asarray(product_tfidf[liked_positions].mean(axis=0))
    scores = cosine_similarity(user_profile, product_tfidf).ravel()
    return pd.Series(scores, index=product_profiles.index)

# Function: hybrid recommender
def hybrid_recommend(user_id, alpha=0.6, top_n=10):
    """Recommend unseen products by blending collaborative and content scores.

    Args:
        user_id: Label looked up in ``interaction_matrix.index``.
        alpha: Weight of the collaborative score; the content score is
            weighted by ``1 - alpha``.
        top_n: Maximum number of products to return.

    Returns:
        pd.DataFrame with columns ``product_id``, ``category``, ``color``,
        ``material``, ``gem`` and ``score``: one row per recommended product,
        ordered from highest to lowest score. The frame is empty (same
        columns) when ``user_id`` is not in ``interaction_matrix``.
    """
    meta_cols = ['category', 'color', 'material', 'gem']

    # Both score helpers return nothing for an unknown user, so there is
    # nothing to rank; return an empty result instead of failing on .loc.
    if user_id not in interaction_matrix.index:
        return pd.DataFrame(columns=['product_id', *meta_cols, 'score'])

    # Align both score vectors on the full product list. Missing and NaN
    # scores count as 0 so one undefined component cannot blank out the other.
    all_products = product_profiles.index
    cf_scores = get_cf_scores(user_id).reindex(all_products, fill_value=0).fillna(0)
    cb_scores = get_cb_scores(user_id).reindex(all_products, fill_value=0).fillna(0)

    final_scores = alpha * cf_scores + (1 - alpha) * cb_scores

    # Remove products already interacted with
    interacted = interaction_matrix.loc[user_id].reindex(all_products, fill_value=0)
    top_scores = (
        final_scores[interacted == 0]
        .sort_values(ascending=False, kind='stable')
        .head(top_n)
    )

    # Look the metadata up by product_id (first row per product) and attach
    # the scores by label. Assigning the score values positionally to rows in
    # df order pairs scores with the wrong products.
    product_meta = df.drop_duplicates(subset='product_id').set_index('product_id')[meta_cols]
    return (
        product_meta.reindex(top_scores.index)
        .assign(score=top_scores)
        .rename_axis('product_id')
        .reset_index()
    )

# 🔍 Example: Recommend products for a sample user
# Demo run: take the first user label in the interaction matrix, print a
# header, then print that user's recommendation table.
sample_user_id = interaction_matrix.index[0]
print(f"\n🔮 Recommendations for user {sample_user_id}:\n")
sample_recommendations = hybrid_recommend(sample_user_id)
print(sample_recommendations)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['user_id', 'product_id'], inplace=True)


In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Read the CSV without a header row (columns are positional), name the
# columns used below, keep only those, and drop rows that lack a user or
# product as well as exact duplicate rows.
# NOTE(review): the column positions are assumed from the dataset layout;
# confirm them against jewelry.csv.
df = (
    pd.read_csv('jewelry.csv', header=None)
    .rename(columns={
        1: 'user_id',
        2: 'product_id',
        5: 'category',
        7: 'price',
        10: 'color',
        11: 'material',
        12: 'gem',
    })
    .loc[:, ['user_id', 'product_id', 'category', 'price', 'color', 'material', 'gem']]
    .dropna(subset=['user_id', 'product_id'])
    .drop_duplicates()
)

# User x product matrix: every row of df contributes an interaction of 1;
# (user, product) pairs that never occur are filled with 0.
interaction_matrix = pd.pivot_table(
    df.assign(interaction=1),
    index='user_id',
    columns='product_id',
    values='interaction',
    fill_value=0,
)

# Pairwise cosine similarity between user rows, labelled by user_id on both axes.
user_similarity = cosine_similarity(interaction_matrix)
user_sim_df = pd.DataFrame(
    user_similarity,
    index=interaction_matrix.index,
    columns=interaction_matrix.index,
)

# Text description per row: the four descriptive fields joined by spaces,
# with missing values treated as empty strings.
df = df.assign(
    content=df[['category', 'color', 'material', 'gem']].fillna('').agg(' '.join, axis=1)
)
# One description per product: the content of its first row in df.
product_profiles = df.groupby('product_id')['content'].first()

# TF-IDF matrix of the descriptions; row i describes product_profiles.index[i].
tfidf = TfidfVectorizer()
product_tfidf = tfidf.fit_transform(product_profiles)

# Function: get collaborative filtering scores
def get_cf_scores(user_id, top_k=5):
    """Score every product for a user from the interactions of similar users.

    Args:
        user_id: Label looked up in ``interaction_matrix.index``.
        top_k: Maximum number of most-similar *other* users used as neighbours.

    Returns:
        pd.Series indexed by the columns of ``interaction_matrix`` holding the
        similarity-weighted average of the neighbours' interaction values.
        An empty float Series is returned for an unknown user; an all-zero
        Series is returned when no neighbour has positive total similarity.
    """
    if user_id not in interaction_matrix.index:
        return pd.Series(dtype=float)

    # Remove the user by label instead of slicing off the first sorted entry:
    # another user with similarity 1.0 can sort ahead of the user themselves,
    # in which case positional slicing keeps the user as their own neighbour.
    neighbours = user_sim_df[user_id].drop(labels=user_id).nlargest(top_k)

    total_similarity = neighbours.sum()
    # Without this guard a user who shares nothing with anyone divides by
    # zero and every score becomes NaN.
    if not total_similarity > 0:
        return pd.Series(0.0, index=interaction_matrix.columns)

    weighted_scores = interaction_matrix.loc[neighbours.index].T.dot(neighbours)
    return weighted_scores / total_similarity

# Function: get content-based scores
def get_cb_scores(user_id):
    """Score every product by content similarity to what the user interacted with.

    The user's profile is the mean TF-IDF vector of the products with a
    positive entry in their ``interaction_matrix`` row; each product's score is
    the cosine similarity between that profile and the product's TF-IDF row.

    Args:
        user_id: Label looked up in ``interaction_matrix.index``.

    Returns:
        pd.Series of scores indexed like ``product_profiles``. An empty float
        Series is returned when the user is unknown or none of their products
        appear in ``product_profiles``.
    """
    if user_id not in interaction_matrix.index:
        return pd.Series(dtype=float)

    user_row = interaction_matrix.loc[user_id]
    liked_products = user_row[user_row > 0].index

    # Row positions of the liked products inside product_tfidf; get_indexer
    # returns -1 for labels that are not in product_profiles, so drop those.
    liked_positions = product_profiles.index.get_indexer(liked_products)
    liked_positions = liked_positions[liked_positions >= 0]
    if len(liked_positions) == 0:
        return pd.Series(dtype=float)

    # The mean of a scipy sparse matrix is an np.matrix, which scikit-learn
    # (1.2 and later) rejects with a TypeError; hand it a plain 2-D ndarray.
    user_profile = np.asarray(product_tfidf[liked_positions].mean(axis=0))
    scores = cosine_similarity(user_profile, product_tfidf).ravel()
    return pd.Series(scores, index=product_profiles.index)

# Function: hybrid recommender
def hybrid_recommend(user_id, alpha=0.6, top_n=10):
    """Recommend unseen products by blending collaborative and content scores.

    Args:
        user_id: Label looked up in ``interaction_matrix.index``.
        alpha: Weight of the collaborative score; the content score is
            weighted by ``1 - alpha``.
        top_n: Maximum number of products to return.

    Returns:
        pd.DataFrame with columns ``product_id``, ``category``, ``color``,
        ``material``, ``gem`` and ``score``: one row per recommended product,
        ordered from highest to lowest score. The frame is empty (same
        columns) when ``user_id`` is not in ``interaction_matrix``.
    """
    meta_cols = ['category', 'color', 'material', 'gem']

    # Both score helpers return nothing for an unknown user, so there is
    # nothing to rank; return an empty result instead of failing on .loc.
    if user_id not in interaction_matrix.index:
        return pd.DataFrame(columns=['product_id', *meta_cols, 'score'])

    # Align both score vectors on the full product list. Missing and NaN
    # scores count as 0 so one undefined component cannot blank out the other.
    all_products = product_profiles.index
    cf_scores = get_cf_scores(user_id).reindex(all_products, fill_value=0).fillna(0)
    cb_scores = get_cb_scores(user_id).reindex(all_products, fill_value=0).fillna(0)

    final_scores = alpha * cf_scores + (1 - alpha) * cb_scores

    # Remove products already interacted with
    interacted = interaction_matrix.loc[user_id].reindex(all_products, fill_value=0)
    top_scores = (
        final_scores[interacted == 0]
        .sort_values(ascending=False, kind='stable')
        .head(top_n)
    )

    # Look the metadata up by product_id (first row per product) and attach
    # the scores by label. Assigning the score values positionally to rows in
    # df order pairs scores with the wrong products.
    product_meta = df.drop_duplicates(subset='product_id').set_index('product_id')[meta_cols]
    return (
        product_meta.reindex(top_scores.index)
        .assign(score=top_scores)
        .rename_axis('product_id')
        .reset_index()
    )

# 🔍 Example: Recommend products for a sample user
# Demo run: take the first user label in the interaction matrix, print a
# header, then print that user's recommendation table.
sample_user_id = interaction_matrix.index[0]
print(f"\n🔮 Recommendations for user {sample_user_id}:\n")
sample_recommendations = hybrid_recommend(sample_user_id)
print(sample_recommendations)
