In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('out.csv')

In [3]:
print(df.columns)

Index([' Uniq Id', 'Crawl Timestamp', 'Dataset Origin', 'Product Id',
       'Product Barcode', 'Product Company Type Source',
       'Product Brand Source', 'Product Brand Normalised Source',
       'Product Name Source', 'Match Rank', 'Match Score', 'Match Type',
       'Retailer', 'Product Category', 'Product Brand', 'Product Name',
       'Product Price', 'Sku', 'Upc', 'Product Url', 'Market',
       'Product Description', 'Product Currency',
       'Product Available Inventory', 'Product Image Url',
       'Product Model Number', 'Product Tags', 'Product Contents',
       'Product Rating', 'Product Reviews Count', 'Bsr', 'Joining Key'],
      dtype='object')


In [4]:
print(df.info())        
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29972 entries, 0 to 29971
Data columns (total 32 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0    Uniq Id                         29972 non-null  object 
 1   Crawl Timestamp                  29972 non-null  object 
 2   Dataset Origin                   0 non-null      float64
 3   Product Id                       29972 non-null  object 
 4   Product Barcode                  0 non-null      float64
 5   Product Company Type Source      29972 non-null  object 
 6   Product Brand Source             88 non-null     object 
 7   Product Brand Normalised Source  88 non-null     object 
 8   Product Name Source              88 non-null     object 
 9   Match Rank                       0 non-null      float64
 10  Match Score                      0 non-null      float64
 11  Match Type                       0 non-null      float64
 12  Retailer          

In [5]:
cols_to_keep = [
    'Product Name', 'Product Description', 'Product Price', 'Product Rating',
    'Product Reviews Count', 'Product Category', 'Product Brand',
    'Product Image Url', 'Product Tags', 'Product Url'
]

df_clean = df[cols_to_keep].copy()

In [6]:
print(df_clean.info())        
print(df_clean.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29972 entries, 0 to 29971
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Product Name           29972 non-null  object 
 1   Product Description    20467 non-null  object 
 2   Product Price          29193 non-null  object 
 3   Product Rating         9613 non-null   float64
 4   Product Reviews Count  9464 non-null   object 
 5   Product Category       28872 non-null  object 
 6   Product Brand          18367 non-null  object 
 7   Product Image Url      28609 non-null  object 
 8   Product Tags           16302 non-null  object 
 9   Product Url            29972 non-null  object 
dtypes: float64(1), object(9)
memory usage: 2.3+ MB
None
       Product Rating
count     9613.000000
mean         3.830105
std          0.655683
min          1.000000
25%          3.500000
50%          3.900000
75%          4.200000
max          5.000000


In [7]:
# Clean column names
df_clean.columns = [col.strip().lower().replace(' ', '_') for col in df_clean.columns]

# Fill missing text fields
for col in ['product_name', 'product_description', 'product_tags']:
    df_clean[col] = df_clean[col].fillna('')

# Fill missing brand/category/image_url with 'Unknown'
for col in ['product_brand', 'product_category', 'product_image_url']:
    df_clean[col] = df_clean[col].fillna('Unknown')

# Convert product price to float
df_clean['product_price'] = pd.to_numeric(
    df_clean['product_price'].str.replace(',', '').str.extract('(\d+\.?\d*)')[0],
    errors='coerce'
)

# Convert reviews count to int
df_clean['product_reviews_count'] = pd.to_numeric(
    df_clean['product_reviews_count'].str.replace(',', ''),
    errors='coerce'
).fillna(0).astype(int)

# Fill missing ratings with 0 (or use median if you prefer)
df_clean['product_rating'] = df_clean['product_rating'].fillna(0)

# Combine fields for content-based filtering
df_clean['text_data'] = (
    df_clean['product_name'] + ' ' +
    df_clean['product_description'] + ' ' +
    df_clean['product_tags'] + ' ' +
    df_clean['product_category'] + ' ' +
    df_clean['product_brand']
)

# Save cleaned version
df_clean.to_csv("cleaned_products12.csv", index=False)

# Quick preview
print(df_clean[['product_name', 'product_price', 'product_rating', 'product_reviews_count']].head(3))

  df_clean['product_price'].str.replace(',', '').str.extract('(\d+\.?\d*)')[0],


                                        product_name  product_price  \
0         Philips 9 W Standard B22 LED Bulb  (White)          137.0   
1  Havells 15 W Standard B22 LED Bulb  (White, Pa...          724.0   
2  Symphony 12 L Room/Personal Air Cooler  (White...         5299.0   

   product_rating  product_reviews_count  
0             0.0                      0  
1             4.1                     53  
2             3.8                   1246  


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load cleaned data
df_clean = pd.read_csv("cleaned_products.csv")

# TF-IDF vectorization of text data
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df_clean['text_data'])

# Cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a reverse mapping of product name to index
indices = pd.Series(df_clean.index, index=df_clean['product_name']).drop_duplicates()


In [9]:

def get_recommendations(product_name, num_recommendations=5):
    # Convert both the input and index to lowercase for flexible matching
    product_name = product_name.lower()
    product_names_lower = df_clean['product_name'].str.lower()

    if product_name not in product_names_lower.values:
        print("Product not found in database.")
        return []

    idx = product_names_lower[product_names_lower == product_name].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    product_indices = [i[0] for i in sim_scores]
    return df_clean[['product_name', 'product_price', 'product_rating', 'product_url']].iloc[product_indices]


In [10]:
recommendations = get_recommendations("Blue Heaven EYE LINER AS SOFT KAJAL (GREEN) 0.31 g\xa0\xa0(GREEN)", 5)
print(recommendations)

                                            product_name  product_price  \
24693  Blue Heaven SOFT KAJAL AS EYE LINER 0.31 g  (B...          285.0   
27116  Blue Heaven SOFT KAJAL AS EYE LINER 0.31 g  (B...          220.0   
14688  Blue Heaven KAJAL,LINER,1+1  (2 Items in the set)          193.0   
21167  Blue Heaven Colour Matte Eyeliner 01 Blue (6 M...          104.0   
3500   Blue Heaven Eyeliner & Mascara (4 types) 8 ml ...          994.0   

       product_rating                                        product_url  
24693             4.2  https://www.flipkart.com/blue-heaven-soft-kaja...  
27116             4.2  https://www.flipkart.com/blue-heaven-soft-kaja...  
14688             4.0  https://www.flipkart.com/blue-heaven-kajal-lin...  
21167             3.9  https://www.flipkart.com/blue-heaven-colour-ma...  
3500              0.0  https://www.flipkart.com/blue-heaven-eyeliner-...  


In [11]:
# Show a few sample product names
print(df_clean['product_name'].sample(10).to_list())


['Abbott Freestyle 100 Glucometer Strips', 'kakapharmacy herbal-hair-gel-250x250 pack of 4 Hair Gel\xa0\xa0(120 g)', 'Devansh Pendants Ceiling Lamp', 'Food ARC Panch Tulsi Drops - Tulsi Ras - Tulsi Extract - Tulsi Leaves Extract (Pack Of 2)\xa0\xa0(30 ml)', 'GDP Analog tamba colour Clock', "L'Oreal Paris hair spa deep nourishing shampoo and conditioner\xa0\xa0(2 Items in the set)", 'Belza Honey Soap 200 gram ( PACK OF 2 )\xa0\xa0(2 x 100 g)', 'Mobone Zc 5-In-1 Smoothing Body Face Beauty Care Facial Massager\xa0\xa0(Pink)', 'Khadi Pure Herbal Jasmine And Lemon Soap Combo\xa0\xa0(2 x 125 g)', 'Maxel 201b men Runtime: 30 min Trimmer for Men\xa0\xa0(Red)']


In [12]:
import pandas as pd
import numpy as np

# Load your product data
product_df = pd.read_csv("cleaned_products.csv")  # your original CSV

# Create synthetic users
num_users = 100
users = [f"user_{i}" for i in range(1, num_users + 1)]

# Product names (or use Product IDs if available)
products = product_df['product_name'].unique()

# Generate synthetic ratings
synthetic_data = []

np.random.seed(42)  # for reproducibility
for user in users:
    sampled_products = np.random.choice(products, size=10, replace=False)
    for product in sampled_products:
        rating = np.random.randint(1, 6)  # Ratings between 1 and 5
        synthetic_data.append([user, product, rating])

# Create a DataFrame
ratings_df = pd.DataFrame(synthetic_data, columns=["user_id", "product_name", "rating"])

# Save to CSV (optional)
ratings_df.to_csv("synthetic_user_ratings.csv", index=False)
print("✅ Synthetic user ratings saved.")


✅ Synthetic user ratings saved.


In [13]:
import pandas as pd
from surprise import Dataset, Reader

# Load the synthetic ratings CSV
ratings_df = pd.read_csv("synthetic_user_ratings.csv")

# Check format: should have 'user_id', 'product_name', 'rating'
ratings_df.head()


Unnamed: 0,user_id,product_name,rating
0,user_1,"HOKIPO Microfiber Bathroom Mat (Blue, Medium)",4
1,user_1,Hawkins & Brimble Face Wash (150 ml),1
2,user_1,SkyKross Cotton Animal Diwan Set,4
3,user_1,"JVL Mini Designer Bowl Set, 2-Pieces Stainless...",1
4,user_1,TRINITY E STORE - 250 ml Plastic Grocery Conta...,4


In [14]:
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Define rating scale (same as you used when generating data)
reader = Reader(rating_scale=(1, 5))

# Load into Surprise's format
data = Dataset.load_from_df(ratings_df[['user_id', 'product_name', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [15]:
# Use SVD (matrix factorization)
model = SVD()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1dce54fdbb0>

In [16]:
predictions = model.test(testset)
print("RMSE:", accuracy.rmse(predictions))


RMSE: 1.4831
RMSE: 1.4831123444994903


In [17]:

def get_top_n_recommendations(user_id, n=5):
    all_products = ratings_df['product_name'].unique()
    rated = ratings_df[ratings_df['user_id'] == user_id]['product_name'].tolist()
    unrated = [p for p in all_products if p not in rated]
    
    predictions = [model.predict(user_id, product) for product in unrated]
    top_n = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]
    return [(pred.iid, round(pred.est, 2)) for pred in top_n]

# Example: Top 5 recommendations for user_10
print(get_top_n_recommendations("user_10"))


[('CASANEST Tie-ups Single Size Waterproof Mattress Protector\xa0\xa0(Multicolor)', 3.42), ('Dove Hair Therapy Hair Fall Rescue Shampoo 650ml\xa0\xa0(650 ml)', 3.41), ('Trothic Gardens Rare Exotic Adenium Obesum Black Red 1 Helathy Small Seedling Plant\xa0\xa0(Hybrid, Pack of 1)', 3.4), ('BIO BRIX HEALTHCARE KETOBRIX-AF BODY WASH\xa0\xa0(100 ml)', 3.4), ('POVTY EXPORT Microfibre Solid Sleeping Pillow Pack of 2\xa0\xa0(White)', 3.39)]


In [23]:
def hybrid_recommendation(user_id, content_df, cosine_sim, top_n=5, alpha=0.5):
    all_products = content_df['product_name'].unique()
    rated = ratings_df[ratings_df['user_id'] == user_id]['product_name'].tolist()
    candidates = [p for p in all_products if p not in rated]

    scores = []
    for product in candidates:
        # Collaborative score
        try:
            collab_score = model.predict(user_id, product).est
        except:
            collab_score = 0.0
        
        # Content score: compare with all products the user rated
        user_rated = ratings_df[ratings_df['user_id'] == user_id]['product_name'].tolist()
        content_scores = [get_content_score(p, product, content_df, cosine_sim) for p in user_rated]
        avg_content_score = sum(content_scores) / len(content_scores) if content_scores else 0.0

        # Final score
        final_score = alpha * avg_content_score + (1 - alpha) * collab_score
        scores.append((product, round(final_score, 4)))

    # Sort and return top N
    top_recommendations = sorted(scores, key=lambda x: x[1], reverse=True)[:top_n]
    return top_recommendations


In [25]:
def get_content_score(target_product_name, candidate_product_name, content_df, cosine_sim):
    try:
        target_idx = content_df[content_df['Product Name'] == target_product_name].index[0]
        candidate_idx = content_df[content_df['Product Name'] == candidate_product_name].index[0]
        return cosine_sim[target_idx][candidate_idx]
    except:
        return 0.0


In [26]:
top_hybrid = hybrid_recommendation("user_10", content_df=product_df, cosine_sim=cosine_sim, alpha=0.6)
for product, score in top_hybrid:
    print(f"{product} → {score}")


CASANEST Tie-ups Single Size Waterproof Mattress Protector  (Multicolor) → 1.369
Dove Hair Therapy Hair Fall Rescue Shampoo 650ml  (650 ml) → 1.3626
BIO BRIX HEALTHCARE KETOBRIX-AF BODY WASH  (100 ml) → 1.3596
Trothic Gardens Rare Exotic Adenium Obesum Black Red 1 Helathy Small Seedling Plant  (Hybrid, Pack of 1) → 1.3596
POVTY EXPORT Microfibre Solid Sleeping Pillow Pack of 2  (White) → 1.3542


In [27]:
import pickle

with open('svd_model.pkl', 'wb') as f:
    pickle.dump(model, f)


In [28]:
ratings_df.to_csv('ratings_data.csv', index=False)

In [30]:
import numpy as np

np.save('cosine_sim.npy', cosine_sim)


In [31]:
product_df.to_csv('product_data.csv', index=False)
