In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix

# Load the dataset
data = pd.read_csv('amazon.csv')

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           1465 non-null   object
 1   product_name         1465 non-null   object
 2   category             1465 non-null   object
 3   discounted_price     1465 non-null   object
 4   actual_price         1465 non-null   object
 5   discount_percentage  1465 non-null   object
 6   rating               1465 non-null   object
 7   rating_count         1463 non-null   object
 8   about_product        1465 non-null   object
 9   user_id              1465 non-null   object
 10  user_name            1465 non-null   object
 11  review_id            1465 non-null   object
 12  review_title         1465 non-null   object
 13  review_content       1465 non-null   object
 14  img_link             1465 non-null   object
 15  product_link         1465 non-null   object
dtypes: obj

In [17]:
# Step 1: Data Preprocessing
# Fill missing values
text_features = ['product_name', 'about_product', 'category']
for feature in text_features:
    data[feature] = data[feature].fillna('Unknown')
data['rating'] = pd.to_numeric(data['rating'], errors='coerce')  # Convert invalid values to NaN
data['rating'] = data['rating'].fillna(data['rating'].mean())  # Fill NaNs with the mean rating

# Step 2: Content-Based Filtering
# Create a TF-IDF vectorizer for the text data
tfidf = TfidfVectorizer(stop_words='english')

# Combine text features into a single string
data['text_features'] = data['product_name'] + ' ' + data['about_product'] + ' ' + data['category']

# Fit and transform TF-IDF
tfidf_matrix = tfidf.fit_transform(data['text_features'])

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)



In [18]:
# Content-based recommendation function
def content_based_recommendations(product_id, top_n=10):
    idx = data[data['product_id'] == product_id].index[0]
    similarity_scores = list(enumerate(cosine_sim[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similar_items = [data.iloc[i[0]].product_id for i in similarity_scores[1:top_n + 1]]
    return similar_items

# Step 3: Collaborative Filtering
# Create user-item interaction matrix
data = data.groupby(['user_id', 'product_id'], as_index=False).agg({'rating': 'mean'})
user_item_matrix = data.pivot(index='user_id', columns='product_id', values='rating').fillna(0)
# Convert to sparse matrix
user_item_sparse = csr_matrix(user_item_matrix.values)
# Perform matrix factorization using SVD
U, sigma, Vt = svds(user_item_sparse, k=50)
sigma = np.diag(sigma)
# Predict ratings
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=user_item_matrix.columns, index=user_item_matrix.index)

In [19]:
# Collaborative filtering recommendation function
def collaborative_filtering_recommendations(user_id, top_n=10):
    user_ratings = predicted_ratings_df.loc[user_id]
    top_items = user_ratings.sort_values(ascending=False).head(top_n).index.tolist()
    return top_items

# Step 4: Hybrid Recommendation System
def hybrid_recommendations(user_id, product_id, top_n=10):
    content_recs = content_based_recommendations(product_id, top_n=top_n)
    collab_recs = collaborative_filtering_recommendations(user_id, top_n=top_n)
    combined_recs = list(set(content_recs + collab_recs))[:top_n]
    return combined_recs

# Example Usage
example_user_id = data['user_id'].iloc[0]
example_product_id = data['product_id'].iloc[0]

print("Content-Based Recommendations:", content_based_recommendations(example_product_id))
print("Collaborative Filtering Recommendations:", collaborative_filtering_recommendations(example_user_id))
print("Hybrid Recommendations:", hybrid_recommendations(example_user_id, example_product_id))

Content-Based Recommendations: ['B07GPXXNNG', 'B01F25X6RQ', 'B08D6RCM3Q', 'B0B5GF6DQD', 'B0BF57RN3K', 'B09NNGHG22', 'B09QS9CWLV', 'B008QS9J6Y', 'B08Y5QJXSR', 'B0B8XNPQPN']
Collaborative Filtering Recommendations: ['B071Z8M4KX', 'B07GPXXNNG', 'B07GQD4K6L', 'B08HVL8QN3', 'B08HVJCW95', 'B08HV83HL3', 'B07WFPMGQQ', 'B07WHSJXLF', 'B07WDK3ZS2', 'B09LHYZ3GJ']
Hybrid Recommendations: ['B071Z8M4KX', 'B07WHSJXLF', 'B08HVJCW95', 'B09LHYZ3GJ', 'B0B8XNPQPN', 'B08HVL8QN3', 'B07WFPMGQQ', 'B08D6RCM3Q', 'B07GPXXNNG', 'B09NNGHG22']
