In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Load the three CSV inputs into module-level DataFrames used by later cells.
# The print() calls only emit section labels and separator lines; no data is shown here.
# NOTE(review): the ratings frame is bound to `rating_df` in this cell, while a later cell
# reloads the same file as `ratings_df` — the two names refer to separate objects.
print("User_Data")
users_df = pd.read_csv("Updated_Users_Dataset_with_Demographics.csv")
print("################################################################")
print("Product_Data")
products_df = pd.read_csv("products_large.csv")
print("################################################################")
rating_df = pd.read_csv("ratings_large.csv")



User_Data
################################################################
Product_Data
################################################################


In [None]:
# Preview the first rows of each frame: print() supplies a text label / separator and
# display() renders the table.
# NOTE(review): users_df.head() renders the name and email columns — confirm the data is
# synthetic (or clear the output) before sharing this notebook.
print("User_Data")
display(users_df.head())
print("################################################################")
print("Product_Data")
display(products_df.head())
print("################################################################")
print("Rating_Data")
display(rating_df.head())


User_Data


Unnamed: 0,user_id,name,email,location,age,gender,interests
0,U0001,Melissa Reynolds,fmyers@lopez.net,New Toddview,38,Other,Books
1,U0002,Kelly Dalton,caleb40@gmail.com,Williamsshire,21,Other,Home Decor
2,U0003,Katie Garza,perezchristopher@kelly-williams.com,Christopherfort,18,Male,Gadgets
3,U0004,Alex Lyons,owensjames@hotmail.com,North Jessehaven,41,Other,Beauty
4,U0005,Peter Jones,kblankenship@yahoo.com,Pollardport,26,Other,Home Decor


################################################################
Product_Data


Unnamed: 0,product_id,name,category,price
0,P00001,Mean Thing,Books,198.7
1,P00002,End Item,Electronics,603.38
2,P00003,According Thing,Fashion,748.83
3,P00004,Government Gadget,Fashion,145.14
4,P00005,Work Gear,Fashion,79.11


################################################################
Rating_Data


Unnamed: 0,user_id,product_id,rating,timestamp
0,U0085,P00406,2,2025-01-29T11:29:21
1,U0019,P00055,4,2025-01-05T09:41:31
2,U0045,P00065,4,2025-01-09T18:28:30
3,U0040,P00250,2,2025-04-14T10:11:50
4,U0084,P00120,5,2025-04-02T19:41:33


In [None]:
# Print a label followed by the (rows, columns) tuple of each loaded frame.
for shape_label, frame in (
    ("Users_Data_Shape", users_df),
    ("Rating_Data_Shape", rating_df),
    ("Product_Data_Shape", products_df),
):
    print(shape_label)
    print(frame.shape)


Users_Data_Shape
(100, 7)
Rating_Data_Shape
(5000, 4)
Product_Data_Shape
(500, 4)


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# 📥 Load datasets
# These module-level names (users_df, products_df, ratings_df) are read by
# recommend_products_for_new_user() defined later in this cell.
users_df = pd.read_csv("Updated_Users_Dataset_with_Demographics.csv")
products_df = pd.read_csv("products_large.csv")
ratings_df = pd.read_csv("ratings_large.csv")

# 1️⃣ --- PREPROCESSING EXISTING USERS ---

# ✅ Gender One-Hot Encoding (one column per distinct gender, prefixed "gender_")
gender_encoded = pd.get_dummies(users_df['gender'], prefix='gender')

# ✅ Location Manual Mapping (Label Encoding Alternative)
# Each distinct location gets an integer index in order of first appearance.
# NOTE(review): this index is later used as a raw numeric feature inside a cosine
# similarity, so its magnitude (0 .. n_locations-1) carries no real meaning — confirm
# this is intended.
unique_locations = users_df['location'].unique()
location_mapping = {loc: idx for idx, loc in enumerate(unique_locations)}
users_df['location_encoded'] = users_df['location'].map(location_mapping)

# ✅ Age Scaling (the fitted scaler is reused to transform new users)
scaler = MinMaxScaler()
users_df['age_scaled'] = scaler.fit_transform(users_df[['age']])

# ✅ Interests One-Hot (assumes comma-separated interests)
# The split is on ',' only; surrounding whitespace is not stripped.
interests_split = users_df['interests'].str.get_dummies(sep=',')

# ✅ Combine All Features into Final Vector
# Column order: user_id, location_encoded, age_scaled, gender_* columns, interest columns.
user_features = pd.concat([
    users_df[['user_id', 'location_encoded', 'age_scaled']],
    gender_encoded,
    interests_split
], axis=1)

# 2️⃣ --- FUNCTION TO RECOMMEND PRODUCTS FOR A NEW USER ---
def recommend_products_for_new_user(new_user_dict, top_n=5):
    """Recommend products for a user who has no ratings yet (cold start).

    The new user is encoded with the same feature layout as ``user_features``,
    compared to every existing user by cosine similarity, and the ratings of the
    10 most similar users are summed per product, weighted by similarity.

    Parameters
    ----------
    new_user_dict : dict
        Must contain the keys ``age``, ``gender``, ``location`` and ``interests``.
    top_n : int, default 5
        Number of highest-scoring products to keep.

    Returns
    -------
    pandas.DataFrame
        Rows of ``products_df`` whose ``product_id`` is among the ``top_n`` best
        scores. Rows keep ``products_df`` order; they are not sorted by score.

    Notes
    -----
    The function no longer writes a ``similarity`` column into the shared
    ``user_features`` frame. Previously that column was picked up as an extra
    feature on the next call, making the vector widths differ and the call fail.
    """
    new_user_df = pd.DataFrame([new_user_dict])

    # 🧠 Location Encoding (unseen locations fall back to -1)
    new_user_df['location_encoded'] = new_user_df['location'].apply(
        lambda x: location_mapping.get(x, -1)
    )

    # 🧠 Age Scaling using the scaler fitted on existing users
    new_user_df['age_scaled'] = scaler.transform(new_user_df[['age']])

    # 🧠 Gender Encoding: align to the training columns, filling absent ones with 0
    gender_enc = pd.get_dummies(new_user_df['gender'], prefix='gender')
    gender_enc = gender_enc.reindex(columns=gender_encoded.columns, fill_value=0)

    # 🧠 Interests Encoding: same alignment against the training interest columns
    interests_enc = new_user_df['interests'].str.get_dummies(sep=',')
    interests_enc = interests_enc.reindex(columns=interests_split.columns, fill_value=0)

    # ✅ Final New User Vector (cast to float so mixed bool/int/float columns are numeric)
    new_user_vector = pd.concat([
        new_user_df[['location_encoded', 'age_scaled']],
        gender_enc,
        interests_enc
    ], axis=1).values.astype(float)

    # 3️⃣ --- COSINE SIMILARITY with existing users ---
    # Drop the id and any leftover 'similarity' column so only features are compared.
    feature_frame = user_features.drop(columns=['user_id', 'similarity'], errors='ignore')
    existing_vectors = feature_frame.values.astype(float)
    similarities = cosine_similarity(new_user_vector, existing_vectors)[0]

    # 4️⃣ --- Top-K similar users (built on a local frame; globals stay untouched) ---
    top_users = (
        user_features[['user_id']]
        .assign(similarity=similarities)
        .sort_values(by='similarity', ascending=False)
        .head(10)
    )
    similar_user_ids = top_users['user_id'].values
    rated_by_similar = ratings_df[ratings_df['user_id'].isin(similar_user_ids)]

    # 5️⃣ --- Weighted product scores ---
    merged = rated_by_similar.merge(top_users[['user_id', 'similarity']], on='user_id')
    merged['weighted_rating'] = merged['rating'] * merged['similarity']

    # 6️⃣ --- Aggregate & Recommend ---
    recommendations = merged.groupby('product_id').agg({
        'weighted_rating': 'sum'
    }).reset_index().sort_values(by='weighted_rating', ascending=False)

    return products_df[products_df['product_id'].isin(recommendations.head(top_n)['product_id'])]

# 🎯 --- EXAMPLE NEW USER INPUT ---
# Keys match what recommend_products_for_new_user reads: age, gender, location, interests.
new_user = {
    "age": 25,
    "gender": "Male",
    "location": "Delhi",  # If absent from location_mapping, the function encodes it as -1
    "interests": "Fashion"
}

# 🔥 --- GET RECOMMENDATIONS ---
# Uses the default top_n=5 and renders the resulting product rows.
recommended_products = recommend_products_for_new_user(new_user)
display(recommended_products)


Unnamed: 0,product_id,name,category,price
144,P00145,Serve Gear,Electronics,788.65
176,P00177,Pattern Item,Beauty,476.76
306,P00307,Operation Device,Electronics,378.92
435,P00436,Either Gear,Books,631.67
461,P00462,Road Device,Books,390.64


In [None]:
import pickle

class ColdStartRecommender:
    """Cold-start recommender that bundles fitted preprocessing state for pickling.

    Parameters
    ----------
    user_features : pandas.DataFrame
        One row per existing user: ``user_id`` plus the numeric feature columns.
        A ``similarity`` column left over from an earlier run is discarded.
    products_df : pandas.DataFrame
        Product catalogue; must contain ``product_id``.
    ratings_df : pandas.DataFrame
        Ratings with ``user_id``, ``product_id`` and ``rating`` columns.
    scaler : object
        Fitted scaler exposing ``transform``; applied to the new user's age.
    location_mapping : dict
        Location name -> integer code; unseen locations are encoded as -1.
    gender_cols, interest_cols : sequence of str
        Training-time one-hot column names, in training order.
    """

    def __init__(self, user_features, products_df, ratings_df,
                 scaler, location_mapping, gender_cols, interest_cols):
        # drop() returns a new frame, so the caller's DataFrame is never modified and a
        # stale 'similarity' column cannot leak into the feature vectors.
        self.user_features = user_features.drop(columns=['similarity'], errors='ignore')
        self.products_df = products_df
        self.ratings_df = ratings_df
        self.scaler = scaler
        self.location_mapping = location_mapping
        self.gender_cols = list(gender_cols)
        self.interest_cols = list(interest_cols)

    def recommend(self, new_user_dict, top_n=5):
        """Return the catalogue rows for the ``top_n`` best-scoring products.

        ``new_user_dict`` must provide ``age``, ``gender``, ``location`` and
        ``interests``. Scores are the similarity-weighted rating sums over the 10
        most similar existing users. Rows keep ``products_df`` order; they are not
        sorted by score. The instance is not modified, so repeated calls are safe.
        """
        # Imported locally so the method works wherever the class is unpickled.
        import pandas as pd
        from sklearn.metrics.pairwise import cosine_similarity

        new_user_df = pd.DataFrame([new_user_dict])
        new_user_df['location_encoded'] = new_user_df['location'].apply(
            lambda x: self.location_mapping.get(x, -1)
        )
        new_user_df['age_scaled'] = self.scaler.transform(new_user_df[['age']])

        # Align one-hot columns to the training layout, filling absent ones with 0.
        gender_enc = pd.get_dummies(new_user_df['gender'], prefix='gender')
        gender_enc = gender_enc.reindex(columns=self.gender_cols, fill_value=0)

        interests_enc = new_user_df['interests'].str.get_dummies(sep=',')
        interests_enc = interests_enc.reindex(columns=self.interest_cols, fill_value=0)

        new_user_vector = pd.concat([
            new_user_df[['location_encoded', 'age_scaled']],
            gender_enc,
            interests_enc
        ], axis=1).values.astype(float)

        # 'similarity' is dropped here as well so instances pickled by the previous
        # implementation (which stored that column) still produce matching widths.
        feature_frame = self.user_features.drop(
            columns=['user_id', 'similarity'], errors='ignore'
        )
        existing_vectors = feature_frame.values.astype(float)
        similarities = cosine_similarity(new_user_vector, existing_vectors)[0]

        # Rank users on a temporary frame instead of writing into self.user_features.
        top_users = (
            self.user_features[['user_id']]
            .assign(similarity=similarities)
            .sort_values(by='similarity', ascending=False)
            .head(10)
        )
        similar_user_ids = top_users['user_id'].values
        rated_by_similar = self.ratings_df[self.ratings_df['user_id'].isin(similar_user_ids)]

        merged = rated_by_similar.merge(top_users[['user_id', 'similarity']], on='user_id')
        merged['weighted_rating'] = merged['rating'] * merged['similarity']

        recommendations = merged.groupby('product_id').agg({
            'weighted_rating': 'sum'
        }).reset_index().sort_values(by='weighted_rating', ascending=False)

        return self.products_df[self.products_df['product_id'].isin(
            recommendations.head(top_n)['product_id']
        )]


# 🎯 Save all required components
# recommend_products_for_new_user() may already have written a 'similarity' column into
# `user_features`; strip it so the pickled model's feature vectors have the same width
# as the vector built for a new user.
recommender = ColdStartRecommender(
    user_features=user_features.drop(columns=['similarity'], errors='ignore'),
    products_df=products_df,
    ratings_df=ratings_df,
    scaler=scaler,
    location_mapping=location_mapping,
    gender_cols=gender_encoded.columns.tolist(),
    interest_cols=interests_split.columns.tolist()
)

# 💾 Save as pickle
# NOTE(review): pickle stores the class by reference, so the loading process must be able
# to import/define ColdStartRecommender under the same module path. Only unpickle files
# from a trusted source.
with open("cold_start_recommender.pkl", "wb") as f:
    pickle.dump(recommender, f)

print("✅ Recommender model saved as cold_start_recommender.pkl")


✅ Recommender model saved as cold_start_recommender.pkl


In [None]:

# Build one user's feature vector: [scaled age, one-hot gender..., one-hot interest...].
# NOTE(review): `user`, `gender_encoder` and `interest_encoder` are not assigned in any
# cell visible above this one — presumably leftover kernel state; define them or confirm.
# Age is divided by the column maximum here, unlike the MinMaxScaler used in earlier cells.
age_scaled = user['age'] / users_df['age'].max()

# NOTE(review): this rebinds `gender_encoded`, which an earlier cell defined as the
# get_dummies frame whose `.columns` recommend_products_for_new_user() reads.
gender_encoded = gender_encoder.transform([[user['gender']]])[0]
interest_encoded = interest_encoder.transform([[user['interests']]])[0]

user_vector = np.concatenate([[age_scaled], gender_encoded, interest_encoded])




In [None]:
# 💰 Step 6: Build Product Metadata Vector (scaled price + one-hot category)
# NOTE(review): `product` and `category_encoder` are not assigned in any cell visible
# above this one — presumably leftover kernel state; define them or confirm.
# Price is scaled by dividing by the maximum price in products_df.
price_scaled = product['price'] / products_df['price'].max()

category_encoded = category_encoder.transform([[product['category']]])[0]

# Layout: [price_scaled, category one-hot...]
product_vector = np.concatenate([[price_scaled], category_encoded])




In [None]:
# 📏 Step 7: Align Vector Lengths
# Whichever vector is shorter gets trailing zeros appended until both lengths match.
target_len = max(len(user_vector), len(product_vector))
if len(product_vector) < target_len:
    product_vector = np.pad(product_vector, (0, target_len - len(product_vector)))
if len(user_vector) < target_len:
    user_vector = np.pad(user_vector, (0, target_len - len(user_vector)))


In [None]:
# 📊 Step 8: Calculate Cosine Similarity (Content-Based Score)
# Both vectors are wrapped in a list to form 1-row matrices; [0][0] extracts the scalar.
similarity_score = cosine_similarity([user_vector], [product_vector])[0][0]


In [None]:
# 🧾 Step 9: Output Results
# NOTE(review): the value labelled "Predicted Rating" is the raw cosine similarity computed
# in the previous step; it is not rescaled to the rating scale.
print("👤 User Vector:", user_vector)
print("📦 Product Vector:", product_vector)
print("✅ Predicted Rating (Cosine Similarity):", similarity_score)


👤 User Vector: [0.84444444 0.         0.         1.         0.         1.
 0.         0.         0.         0.         0.        ]
📦 Product Vector: [0.19882327 0.         1.         0.         0.         0.
 0.         0.         0.         0.         0.        ]
✅ Predicted Rating (Cosine Similarity): 0.09997417383810635


In [None]:
def predict_rating(user_row, product_row):
    """Score a user/product pair as the cosine similarity of their feature vectors.

    The user vector is [age / max age, gender one-hot, interest one-hot]; the product
    vector is [price / max price, category one-hot]. The shorter vector is padded with
    trailing zeros so both have the same length before the similarity is computed.
    """
    def _encode(encoder, value):
        # Encoders expect a 2-D input; take the single resulting row.
        return encoder.transform([[value]])[0]

    # User side
    user_parts = [
        [user_row['age'] / users_df['age'].max()],
        _encode(gender_encoder, user_row['gender']),
        _encode(interest_encoder, user_row['interests']),
    ]
    user_vec = np.concatenate(user_parts)

    # Product side
    product_parts = [
        [product_row['price'] / products_df['price'].max()],
        _encode(category_encoder, product_row['category']),
    ]
    prod_vec = np.concatenate(product_parts)

    # Zero-pad whichever vector is shorter
    width = max(len(user_vec), len(prod_vec))
    if len(prod_vec) < width:
        prod_vec = np.pad(prod_vec, (0, width - len(prod_vec)))
    if len(user_vec) < width:
        user_vec = np.pad(user_vec, (0, width - len(user_vec)))

    return cosine_similarity([user_vec], [prod_vec])[0][0]


In [None]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import numpy as np

# Age Scaling (min-max to [0, 1])
age_scaler = MinMaxScaler()
users_df['age_scaled'] = age_scaler.fit_transform(users_df[['age']])

# Price Scaling (min-max to [0, 1]).
# Raw prices stacked next to 0/1 one-hot columns would dominate any cosine similarity,
# so price is scaled the same way age is.
price_scaler = MinMaxScaler()
price_scaled = price_scaler.fit_transform(products_df[['price']])

# One-Hot Encoding (dense output so the arrays can be stacked with np.hstack)
gender_enc = OneHotEncoder(sparse_output=False)
interests_enc = OneHotEncoder(sparse_output=False)
category_enc = OneHotEncoder(sparse_output=False)

gender_vec = gender_enc.fit_transform(users_df[['gender']])
interest_vec = interests_enc.fit_transform(users_df[['interests']])
category_vec = category_enc.fit_transform(products_df[['category']])

# Final User Profile Vectors: [age_scaled, gender one-hot, interest one-hot]
user_profile_vectors = np.hstack([users_df[['age_scaled']].values, gender_vec, interest_vec])

# Final Product Feature Vectors: [price_scaled, category one-hot]
product_feature_vectors = np.hstack([price_scaled, category_vec])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def predict_new_item(user_rated_items, new_item_vector, rated_vectors, ratings):
    """Predict a rating for a new item as a similarity-weighted average.

    Parameters
    ----------
    user_rated_items
        Not used by the computation; kept for interface compatibility.
    new_item_vector : array-like, shape (n_features,)
        Feature vector of the item to score.
    rated_vectors : array-like, shape (n_rated, n_features)
        Feature vectors of the items the user already rated.
    ratings : array-like, shape (n_rated,)
        The user's ratings, aligned with ``rated_vectors``.

    Returns
    -------
    float
        ``sum(sim * rating) / sum(|sim|)``, or the plain mean of ``ratings`` when
        every similarity is (numerically) zero.
    """
    sims = cosine_similarity([new_item_vector], rated_vectors)[0]
    # Normalise by the absolute weights: a signed sum can cancel to ~0 when similarities
    # have mixed signs, which would blow the prediction far outside the rating range.
    weight_total = np.sum(np.abs(sims))
    if np.isclose(weight_total, 0.0):
        return np.mean(ratings)  # fallback
    return np.dot(sims, ratings) / weight_total


In [None]:
def recommend_for_new_user(user_vector, product_vectors, top_k=5):
    """Return the ``top_k`` products most cosine-similar to ``user_vector``.

    ``product_vectors`` must have one row per row of ``products_df``, in the same
    order, and the same width as ``user_vector``. The result holds the columns
    product_id, name, category and price, ordered from most to least similar.
    A non-positive ``top_k`` yields an empty frame.
    """
    columns = ['product_id', 'name', 'category', 'price']
    if top_k <= 0:
        # Guard: the slice [-0:] would otherwise select every product.
        return products_df.iloc[[]][columns]
    sims = cosine_similarity([user_vector], product_vectors)[0]
    top_indices = np.argsort(sims)[-top_k:][::-1]
    return products_df.iloc[top_indices][columns]


In [None]:
def fallback_popular_items(top_n=5, random_state=None):
    """Return up to ``top_n`` fallback products from ``products_df``.

    NOTE(review): despite the name, this draws a random sample; no popularity signal
    is used. The most frequent category could be applied as a filter.

    Parameters
    ----------
    top_n : int, default 5
        Requested number of products; clamped to the range [0, len(products_df)] so an
        oversized request no longer raises.
    random_state : int or None, default None
        Passed to ``DataFrame.sample`` to make the draw reproducible.
    """
    n_items = min(max(top_n, 0), len(products_df))
    return products_df.sample(n_items, random_state=random_state)


In [None]:
# User interest vector
user_interest_vec = interests_enc.transform([["Gadgets"]])  # shape (1, N)

# Product category vectors (already encoded)
# → Let's regenerate it to ensure match
# NOTE(review): interests_enc was fitted on users_df['interests'] and category_enc on
# products_df['category'], so the two outputs only have equal width if both columns hold
# the same number of distinct values.

product_category_vec = category_enc.transform(products_df[['category']])




In [None]:
def recommend_by_interest_only(user_interest_vector, product_category_vectors, top_k=5):
    """Return the ``top_k`` products whose category vector best matches the interest.

    ``user_interest_vector`` is a 1-row matrix and ``product_category_vectors`` has one
    row per row of ``products_df``; both must have the same width. The result holds the
    columns product_id, name, category and price. A non-positive ``top_k`` yields an
    empty frame.

    NOTE(review): products with zero similarity can still be returned when fewer than
    ``top_k`` products match; ties are ordered arbitrarily by argsort.
    """
    columns = ['product_id', 'name', 'category', 'price']
    if top_k <= 0:
        # Guard: the slice [-0:] would otherwise select every product.
        return products_df.iloc[[]][columns]
    sims = cosine_similarity(user_interest_vector, product_category_vectors)[0]
    top_indices = np.argsort(sims)[-top_k:][::-1]
    return products_df.iloc[top_indices][columns]


In [None]:
# Gadgets is user interest
# NOTE(review): the recorded run of this cell fails with
# "ValueError: Incompatible dimension ... X.shape[1] == 7 while Y.shape[1] == 6":
# the interest encoder and the category encoder produce vectors of different widths,
# so they cannot be compared directly.
user_interest_vec = interests_enc.transform([["Gadgets"]])
product_category_vec = category_enc.transform(products_df[['category']])

recommend_by_interest_only(user_interest_vec, product_category_vec)




ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 7 while Y.shape[1] == 6

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Refit category encoder; unknown labels encode as an all-zero row instead of raising.
# `sparse_output` replaces the `sparse` keyword, which the installed scikit-learn rejects.
category_enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_vec = category_enc.fit_transform(products_df[['category']])

# Refit interest encoder similarly
# NOTE(review): each encoder still learns its own vocabulary, so the two outputs are not
# guaranteed to have the same width.
interests_enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
interests_enc.fit(users_df[['interests']])
user_interest_vec = interests_enc.transform([['Gadgets']])


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [None]:
# 📦 Imports
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 🔹 Load Datasets
users_df = pd.read_csv("Updated_Users_Dataset_with_Demographics.csv")
products_df = pd.read_csv("products_large.csv")

# 🧹 Optional: View columns for understanding
# print(users_df.columns)
# print(products_df.columns)

# ✅ Step 0: Build one shared label vocabulary.
# Fitting the two encoders independently gives each its own set of columns, so interest
# and category vectors end up with different widths and cannot be compared by cosine
# similarity. Passing the same explicit `categories` to both fixes the layout.
shared_labels = sorted(
    set(users_df['interests'].dropna()) | set(products_df['category'].dropna())
)

# ✅ Step 1: Fit OneHotEncoder on product categories
category_enc = OneHotEncoder(categories=[shared_labels], handle_unknown='ignore')
category_vec = category_enc.fit_transform(products_df[['category']]).toarray()

# ✅ Step 2: Fit OneHotEncoder on user interests
interest_enc = OneHotEncoder(categories=[shared_labels], handle_unknown='ignore')
interest_enc.fit(users_df[['interests']])

# 🚀 Step 3: Simulate a New User with interest = 'Gadgets'
new_user_interest = [['Gadgets']]  # Change as needed
user_interest_vec = interest_enc.transform(new_user_interest).toarray()

# ✅ Step 4: Compute cosine similarity between user interest & product category
def recommend_by_interest_only(user_interest_vector, product_category_vectors, top_k=5):
    """Return the ``top_k`` products whose category vector best matches the interest.

    ``user_interest_vector`` is a 1-row matrix and ``product_category_vectors`` has one
    row per row of ``products_df``; both must have the same width. The result holds the
    columns product_id, name, category and price. A non-positive ``top_k`` yields an
    empty frame.

    NOTE(review): products with zero similarity can still be returned when fewer than
    ``top_k`` products match; ties are ordered arbitrarily by argsort.
    """
    columns = ['product_id', 'name', 'category', 'price']
    if top_k <= 0:
        # Guard: the slice [-0:] would otherwise select every product.
        return products_df.iloc[[]][columns]
    sims = cosine_similarity(user_interest_vector, product_category_vectors)[0]
    top_indices = np.argsort(sims)[-top_k:][::-1]  # Top K most similar products
    return products_df.iloc[top_indices][columns]

# 📊 Step 5: Get top K recommendations
# Re-encodes every product category, then ranks products against the interest vector.
# NOTE(review): the recorded run of this cell fails with a ValueError (7 vs 6 columns)
# because the interest and category encoders were fitted on different label sets.
product_category_vec = category_enc.transform(products_df[['category']]).toarray()
recommendations = recommend_by_interest_only(user_interest_vec, product_category_vec, top_k=5)

# 🖨️ Step 6: Show result
print("🎯 Recommended Products for New User (Interest: 'Gadgets'):\n")
print(recommendations)




ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 7 while Y.shape[1] == 6

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Reload the user and product tables for this self-contained cell.
users_df = pd.read_csv("Updated_Users_Dataset_with_Demographics.csv")
products_df = pd.read_csv("products_large.csv")


# Union of distinct user interests and product categories, as a one-column frame named
# 'label' so a single encoder can be fitted on it.
all_labels = pd.DataFrame({'label': pd.concat([users_df['interests'], products_df['category']]).unique()})


# One encoder for both sides, so interest and category vectors share the same columns.
# handle_unknown='ignore' encodes unseen labels as an all-zero row.
shared_encoder = OneHotEncoder(handle_unknown='ignore')
shared_encoder.fit(all_labels[['label']])


# The column is renamed to 'label' to match the feature name used when fitting.
product_categories = products_df[['category']].rename(columns={'category': 'label'})
product_category_vec = shared_encoder.transform(product_categories).toarray()


# Encode the simulated new user's single interest as a 1-row matrix.
new_user_interest = [['Gadgets']]
user_interest_df = pd.DataFrame(new_user_interest, columns=['label'])
user_interest_vec = shared_encoder.transform(user_interest_df).toarray()

def recommend_by_interest_only(user_interest_vector, product_category_vectors, top_k=5):
    """Return the ``top_k`` products whose category vector best matches the interest.

    ``user_interest_vector`` is a 1-row matrix and ``product_category_vectors`` has one
    row per row of ``products_df``; both must have the same width. The result holds the
    columns product_id, name, category and price. A non-positive ``top_k`` yields an
    empty frame.

    NOTE(review): products with zero similarity can still be returned when fewer than
    ``top_k`` products match; ties are ordered arbitrarily by argsort.
    """
    columns = ['product_id', 'name', 'category', 'price']
    if top_k <= 0:
        # Guard: the slice [-0:] would otherwise select every product.
        return products_df.iloc[[]][columns]
    sims = cosine_similarity(user_interest_vector, product_category_vectors)[0]
    top_indices = np.argsort(sims)[-top_k:][::-1]
    return products_df.iloc[top_indices][columns]


# Rank products against the new user's interest vector (default top_k=5).
# NOTE(review): the recorded output lists Books / Beauty / Fashion products for the
# 'Gadgets' interest — presumably no product category equals 'Gadgets', so every
# similarity ties at 0 and the ordering is arbitrary; confirm against the data.
recommendations = recommend_by_interest_only(user_interest_vec, product_category_vec)


print("🎯 Top Recommendations for New User (interest='Gadgets'):\n")
print(recommendations)


🎯 Top Recommendations for New User (interest='Gadgets'):

    product_id          name category   price
0       P00001    Mean Thing    Books  198.70
499     P00500    Town Thing   Beauty   90.69
498     P00499  Entire Thing  Fashion  158.87
497     P00498     Ago Thing    Books  606.67
16      P00017    Field Item   Beauty  452.08


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# ✅ Load your uploaded files
# These module-level names are read by recommend_products_for_new_user() below.
users_df = pd.read_csv("Updated_Users_Dataset_with_Demographics.csv")
products_df = pd.read_csv("products_large.csv")
ratings_df = pd.read_csv("ratings_large.csv")

# ✅ Gender One-Hot Encoding (one column per distinct gender, prefixed "gender_")
gender_encoded = pd.get_dummies(users_df['gender'], prefix='gender')

# ✅ Location Encoding (manual mapping)
# Each distinct location gets an integer index in order of first appearance.
# NOTE(review): this index is later used as a raw numeric feature inside a cosine
# similarity, so its magnitude carries no real meaning — confirm this is intended.
unique_locations = users_df['location'].unique()
location_mapping = {loc: idx for idx, loc in enumerate(unique_locations)}
users_df['location_encoded'] = users_df['location'].map(location_mapping)

# ✅ Age Scaling (the fitted scaler is reused to transform new users)
scaler = MinMaxScaler()
users_df['age_scaled'] = scaler.fit_transform(users_df[['age']])

# ✅ Interests One-Hot (comma-separated)
# The split is on ',' only; surrounding whitespace is not stripped.
interests_split = users_df['interests'].str.get_dummies(sep=',')

# ✅ Final Feature Vector
# Column order: user_id, location_encoded, age_scaled, gender_* columns, interest columns.
user_features = pd.concat([
    users_df[['user_id', 'location_encoded', 'age_scaled']],
    gender_encoded,
    interests_split
], axis=1)

# 🔍 Recommend products for new user (cold start)
def recommend_products_for_new_user(new_user_dict, top_n=5):
    """Recommend products for a user who has no ratings yet (cold start).

    The new user is encoded with the same feature layout as ``user_features``,
    compared to every existing user by cosine similarity, and the ratings of the
    10 most similar users are summed per product, weighted by similarity.

    Parameters
    ----------
    new_user_dict : dict
        Must contain the keys ``age``, ``gender``, ``location`` and ``interests``.
    top_n : int, default 5
        Number of highest-scoring products to keep.

    Returns
    -------
    pandas.DataFrame
        Rows of ``products_df`` whose ``product_id`` is among the ``top_n`` best
        scores. Rows keep ``products_df`` order; they are not sorted by score.

    Notes
    -----
    The function no longer writes a ``similarity`` column into the shared
    ``user_features`` frame. Previously that column was picked up as an extra
    feature on the next call, making the vector widths differ and the call fail.
    """
    new_user_df = pd.DataFrame([new_user_dict])

    # Location encode (unseen locations fall back to -1)
    new_user_df['location_encoded'] = new_user_df['location'].apply(
        lambda x: location_mapping.get(x, -1)
    )

    # Age scale with the scaler fitted on existing users
    new_user_df['age_scaled'] = scaler.transform(new_user_df[['age']])

    # Gender encode: align to the training columns, filling absent ones with 0
    gender_enc = pd.get_dummies(new_user_df['gender'], prefix='gender')
    gender_enc = gender_enc.reindex(columns=gender_encoded.columns, fill_value=0)

    # Interest encode: same alignment against the training interest columns
    interests_enc = new_user_df['interests'].str.get_dummies(sep=',')
    interests_enc = interests_enc.reindex(columns=interests_split.columns, fill_value=0)

    # Final vector (cast to float so mixed bool/int/float columns are numeric)
    new_user_vector = pd.concat([
        new_user_df[['location_encoded', 'age_scaled']],
        gender_enc,
        interests_enc
    ], axis=1).values.astype(float)

    # Cosine similarity: drop the id and any leftover 'similarity' column so only
    # feature columns are compared.
    feature_frame = user_features.drop(columns=['user_id', 'similarity'], errors='ignore')
    existing_vectors = feature_frame.values.astype(float)
    similarities = cosine_similarity(new_user_vector, existing_vectors)[0]

    # Top similar users (built on a local frame; globals stay untouched)
    top_users = (
        user_features[['user_id']]
        .assign(similarity=similarities)
        .sort_values(by='similarity', ascending=False)
        .head(10)
    )
    similar_user_ids = top_users['user_id'].values
    rated_by_similar = ratings_df[ratings_df['user_id'].isin(similar_user_ids)]

    # Weighted score
    merged = rated_by_similar.merge(top_users[['user_id', 'similarity']], on='user_id')
    merged['weighted_rating'] = merged['rating'] * merged['similarity']

    # Aggregate recommendations
    recommendations = merged.groupby('product_id').agg({
        'weighted_rating': 'sum'
    }).reset_index().sort_values(by='weighted_rating', ascending=False)

    return products_df[products_df['product_id'].isin(recommendations.head(top_n)['product_id'])]

# 🎯 Example New User
# Keys match what recommend_products_for_new_user reads: age, gender, location, interests.
# A location absent from location_mapping is encoded as -1 by the function.
new_user = {
    "age": 25,
    "gender": "Male",
    "location": "Delhi",
    "interests": "Fashion"
}

# 🔥 Get Recommendations
# Uses the default top_n=5; the bare last expression renders the resulting frame.
recommended_products = recommend_products_for_new_user(new_user)
recommended_products


Unnamed: 0,product_id,name,category,price
144,P00145,Serve Gear,Electronics,788.65
176,P00177,Pattern Item,Beauty,476.76
306,P00307,Operation Device,Electronics,378.92
435,P00436,Either Gear,Books,631.67
461,P00462,Road Device,Books,390.64
