In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np

In [None]:
df=pd.read_csv('amazon.csv')

In [None]:
df

In [None]:
df.dtypes

In [None]:
print(df.isnull().sum())

In [None]:
df[df['rating_count'].isnull()]

In [None]:
# Drop rows without a rating count. Re-binding instead of inplace=True keeps
# the cell chainable and avoids mutating a frame that earlier cells displayed.
df = df.dropna(subset=['rating_count'])

In [None]:
df.duplicated().sum()

In [None]:
df.duplicated('product_id').sum()

In [None]:
# Prices arrive as text such as "₹1,099": remove the currency sign and the
# thousands separators, then convert to float.
for price_col in ('discounted_price', 'actual_price'):
    price_text = df[price_col].astype(str)
    price_text = price_text.str.replace('₹', '').str.replace(',', '')
    df[price_col] = price_text.astype(float)

# Percentages arrive as text such as "64%"; store them as a fraction.
percent_text = df['discount_percentage'].astype(str).str.replace('%', '')
df['discount_percentage'] = percent_text.astype(float) / 100

In [None]:
 # Count ratings containing a literal '|'. regex=False avoids the invalid '\|' escape in a non-raw string.
 df['rating'].str.contains('|', regex=False).sum()

In [None]:
# Keep only rows whose rating does not contain a literal '|'.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not write into a slice of the original.
df = df[~df['rating'].astype(str).str.contains('|', regex=False)].copy()


In [None]:
# Re-check for ratings containing a literal '|'. regex=False avoids the invalid '\|' escape in a non-raw string.
df['rating'].str.contains('|', regex=False).sum()


In [None]:
# Both columns are text that may contain thousands separators; strip them and cast to float.
for numeric_text_col in ('rating', 'rating_count'):
    without_commas = df[numeric_text_col].astype(str).str.replace(',', '')
    df[numeric_text_col] = without_commas.astype(float)


In [None]:
# Weighted rating = rating multiplied by the number of ratings it is based on.
df['rating_weighted'] = df['rating'].mul(df['rating_count'])

In [None]:
# 'category' is a '|'-separated path: the last element becomes the
# sub-category and the first element the main category.
category_levels = df['category'].astype(str).str.split('|')
df['sub_category'] = category_levels.str[-1]
df['main_category'] = category_levels.str[0]

In [None]:
# Snapshot of the numeric columns as they exist at this point (used by the pairplot cell).
numeric_cols = df.select_dtypes(include=['float64', 'int64'])


def _strip_if_text(value):
    """Return value with surrounding whitespace removed when it is a str, otherwise unchanged."""
    if isinstance(value, str):
        return value.strip()
    return value


# Trim surrounding whitespace in every object-dtype column.
text_columns = [col for col in df.columns if df[col].dtype == 'object']
for col in text_columns:
    df[col] = df[col].apply(_strip_if_text)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# One histogram (with KDE overlay) per rating-related column, side by side.
fig, ax = plt.subplots(1, 3, figsize=(20, 8))
fig.suptitle('Rating & Amount of Ratings Distribution', fontweight='heavy', size='large')

hist_panels = [
    # (column, bar colour, x-axis label, panel title)
    ('rating', 'green', 'Ratings', 'Distribution of Ratings'),
    ('rating_count', 'red', 'Number of Ratings', 'Distribution of Count Ratings'),
    ('rating_weighted', 'blue', 'Weighted Ratings', 'Distribution of Weighted Ratings'),
]
for axis, (column, colour, x_label, panel_title) in zip(ax, hist_panels):
    sns.histplot(ax=axis, data=df, x=column, bins=15, kde=True, color=colour)
    axis.set_xlabel(x_label)
    axis.set_ylabel('Number of Products')
    axis.set_title(panel_title, fontweight='bold')

plt.show()

In [None]:
# One horizontal boxplot per rating-related column, side by side.
fig, ax = plt.subplots(1, 3, figsize=(20, 8))
fig.suptitle('Rating & Amount of Ratings Distribution', fontweight='heavy', size='large')

box_panels = [
    # (column, box colour, x-axis label, panel title)
    ('rating', 'blue', 'Ratings', 'Distribution of Ratings'),
    ('rating_count', 'red', 'Number of Ratings', 'Distribution of Count Ratings'),
    ('rating_weighted', 'green', 'Weighted Ratings', 'Distribution of Weighted Ratings'),
]
for axis, (column, colour, x_label, panel_title) in zip(ax, box_panels):
    sns.boxplot(ax=axis, data=df, x=column, color=colour)
    axis.set_xlabel(x_label)
    # No y-label: a single-variable boxplot has no count axis, so the former
    # 'Number of Products' label (copied from the histogram cell) was misleading.
    axis.set_title(panel_title, fontweight='bold')

# Render the figure explicitly instead of leaving a Text(...) repr as the cell output.
plt.show()


In [None]:
bins = [0, 1, 2, 3, 4, 5] # Define bin edges
bin_labels = ['0-1', '1-2', '2-3', '3-4', '4-5']
df['rating_bin'] = pd.cut(df['rating'], bins=bins, include_lowest=True, labels=bin_labels)

# Name both output columns explicitly: the column names produced by a bare
# value_counts().reset_index() differ between pandas 1.x and 2.x, and the
# sort/plot below needs exactly 'rating_bin' and 'count'.
rate_bin = (
    df['rating_bin']
    .value_counts()
    .rename_axis('rating_bin')
    .reset_index(name='count')
    .sort_values('count')
)

fig, ax = plt.subplots(figsize=(10,5))

# rate_bin is sorted ascending by count; reversing it puts the largest bin first.
sns.barplot(ax=ax,data=rate_bin,x='count',y='rating_bin',order=rate_bin['rating_bin'][::-1],palette='Spectral')
ax.bar_label(ax.containers[0])
ax.set_xlabel('Count')
ax.set_ylabel('Rating Bins')
ax.set_title('Number of Ratings by Bins',fontweight='bold',size=16)
plt.tight_layout()
plt.show()

In [None]:
# Note: sns.set changes the global seaborn style/palette for all later plots.
sns.set(style='white', palette="icefire")

# Rating distribution per main category, one horizontal box per category.
fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(data=df, x='rating', y='main_category', ax=ax)
ax.set(xlabel='Rating', ylabel='Main Categories')
ax.set_title('Rating by Main Category', fontweight='bold')
plt.show()


In [None]:
# Mean rating per sub-category, computed once and ranked in both directions.
sub_category_means = df.groupby('sub_category')['rating'].mean()

mean_top_sub = sub_category_means.sort_values(ascending=False).reset_index().head(15)
mean_top_sub['rating'] = mean_top_sub['rating'].round(2)
low_sub = sub_category_means.sort_values(ascending=True).reset_index().head(10)
low_sub['rating'] = low_sub['rating'].round(2)

fig, ax = plt.subplots(2, 1, figsize=(8, 8))

bar_panels = (
    # (axes, data, palette, title)
    (ax[0], mean_top_sub, 'coolwarm', 'Top Rated Sub Categories'),
    (ax[1], low_sub, 'vlag', 'Lowest Rated Sub Categories'),
)
for axis, frame, palette_name, panel_title in bar_panels:
    sns.barplot(ax=axis, x='rating', y='sub_category', data=frame, palette=palette_name)
    axis.set_xlabel('Mean Rating')
    axis.set_ylabel('Sub Category')
    axis.set_title(panel_title)
    axis.bar_label(axis.containers[0])  # print the mean next to each bar

plt.tight_layout()
plt.show()


In [None]:
# Mean rating per main category, highest first, rounded for the bar labels.
main_category_means = df.groupby('main_category')['rating'].mean()
mean_top = main_category_means.sort_values(ascending=False).reset_index()
mean_top['rating'] = mean_top['rating'].round(2)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(ax=ax, x='rating', y='main_category', data=mean_top, palette='viridis')
ax.set(xlabel='Mean Rating', ylabel='Main Category', title='Top Rated by Average Main Categories')
ax.bar_label(ax.containers[0])
plt.show()

In [None]:
# Pairwise regression plots of the numeric-column snapshot; regression lines are drawn in red.
regression_line_style = {'line_kws': {'color': 'red'}}
pairplot = sns.pairplot(numeric_cols, kind='reg', plot_kws=regression_line_style)
plt.show()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the user and product identifiers; each column gets its own
# encoder so the mapping can be inverted later.
label_user = LabelEncoder()
label_product = LabelEncoder()
for encoder, raw_col, encoded_col in (
    (label_user, 'user_id', 'label_user'),
    (label_product, 'product_id', 'label_product'),
):
    df[encoded_col] = encoder.fit_transform(df[raw_col])

In [None]:
df

In [None]:
# Keep the first row per encoded user, then the first row per encoded product.
df = df.drop_duplicates(subset='label_user', keep='first')
# Reset to a 0..n-1 index: rows were dropped here and in earlier cells, and the
# recommender functions below use df.index values as row positions
# (matrix rows / df.iloc), which is only valid when labels equal positions.
df = df.drop_duplicates(subset='label_product', keep='first').reset_index(drop=True)

In [None]:
# Join title and body into one review text. Missing parts become '' so one NaN
# does not turn the whole string into NaN (which would crash preprocess_text).
df['combined_reviews'] = df['review_title'].fillna('') + " " + df['review_content'].fillna('')


In [None]:
# Concatenate the descriptive text fields used by the content-based recommenders.
# Missing parts become '' so a single NaN field does not discard the other fields.
df['combined_features'] = (
    df['product_name'].fillna('') + " "
    + df['main_category'].fillna('') + " "
    + df['sub_category'].fillna('') + " "
    + df['about_product'].fillna('')
)

In [None]:
from textblob import TextBlob
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud

## Function for sentiment analysis

In [None]:
def get_sentiment(text):
    """Convert the TextBlob sentiment polarity of ``text`` into an integer rating.

    The polarity is shifted by 1, doubled, rounded with the built-in ``round``
    and incremented by 1; the result is then clamped to the range 1..5.
    (TextBlob documents polarity as lying in [-1.0, 1.0].)

    Parameters
    ----------
    text : str
        Text handed to ``TextBlob``.

    Returns
    -------
    int
        Rating between 1 and 5 inclusive.
    """
    polarity = TextBlob(text).sentiment.polarity
    stars = round((polarity + 1) * 2) + 1
    # Clamp to the 1-5 rating scale.
    if stars > 5:
        stars = 5
    if stars < 1:
        stars = 1
    return stars

In [None]:
# Download the NLTK resources needed by the text-preprocessing cell:
# 'stopwords' backs stopwords.words('english'); 'punkt' / 'punkt_tab' are
# tokenizer data (presumably what word_tokenize loads -- which one depends on
# the installed NLTK version; TODO confirm).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

### Text preprocessing

In [None]:
def _get_text_tools():
    """Return the (stop-word set, stemmer) pair, building it only on first use."""
    if not hasattr(_get_text_tools, 'cache'):
        _get_text_tools.cache = (set(stopwords.words('english')), PorterStemmer())
    return _get_text_tools.cache


def preprocess_text(text):
    """Normalise a review text for downstream text analysis.

    Steps: lower-case, remove every character that is neither a word character
    nor whitespace, tokenize with ``word_tokenize``, drop English stop words,
    Porter-stem the remaining tokens and join them with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN from a missing review) is
        treated as empty text instead of raising ``AttributeError``.

    Returns
    -------
    str
        Space-separated stemmed tokens; '' when there is nothing to keep.
    """
    if not isinstance(text, str):
        return ''
    # The stop-word list and stemmer are identical for every call, so they are
    # built once rather than re-read/re-created for each row.
    stop_words, stemmer = _get_text_tools()
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    kept_tokens = (token for token in word_tokenize(cleaned) if token not in stop_words)
    return ' '.join(stemmer.stem(token) for token in kept_tokens)

In [None]:
# Run the text normalisation over every combined review, then preview raw vs. processed text.
df['preprocessed_combined_reviews'] = df['combined_reviews'].map(preprocess_text)
preview_columns = ['combined_reviews', 'preprocessed_combined_reviews']
print(df[preview_columns].head())


In [None]:
# Derive an integer 1-5 rating for every review via get_sentiment.
# NOTE(review): the input is the stemmed, stop-word-free text rather than the
# original review; a lexicon-based scorer may no longer recognise stemmed
# words -- confirm whether 'combined_reviews' should be scored instead.
df['derived_rating'] = df['preprocessed_combined_reviews'].apply(get_sentiment)
print(df[['review_content', 'derived_rating']])



# Collaborative filtering using SVD++ or matrix factorization method

In [None]:
!pip install surprise
from surprise import Reader, Dataset
from surprise.reader import Reader
reader = Reader(rating_scale=(1, 5))

In [None]:
# Surprise expects exactly three columns in the order user, item, rating.
rating_triples = df[['label_user', 'label_product', 'derived_rating']]
data = Dataset.load_from_df(rating_triples, reader)

In [None]:
# Hyperparameter search space for SVD++ (every combination is evaluated by the grid search)
param_grid_svdpp = dict(
    # number of training epochs -- checks whether longer training helps
    n_epochs=[10, 20, 30, 40, 50],
    # size of the latent factor space, from small to large
    n_factors=[20, 50, 100, 200],
    # learning rate applied to all parameters, including a low value for finer steps
    lr_all=[0.001, 0.003, 0.005, 0.007, 0.01],
    # regularisation applied to all parameters, to probe under- and over-fitting
    reg_all=[0.01, 0.02, 0.05, 0.1],
)

In [None]:
from surprise.model_selection import GridSearchCV  # Import GridSearchCV
from surprise import SVDpp

In [None]:
# 5-fold cross-validated grid search over the SVD++ parameter grid,
# tracking both RMSE and MAE.
gs_svdpp = GridSearchCV(
    algo_class=SVDpp,
    param_grid=param_grid_svdpp,
    measures=['rmse', 'mae'],
    cv=5,
)
gs_svdpp.fit(data)

best_rmse_params = gs_svdpp.best_params['rmse']
print('Best SVDpp parameters:', best_rmse_params)

In [None]:
# Train-test split
from surprise.model_selection import train_test_split
from surprise import accuracy as acc
from surprise.accuracy import rmse, mae

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Algorithm instance configured with the best-RMSE parameters from the grid
# search; it is fitted here on the training split.
best_svdpp = gs_svdpp.best_estimator['rmse']
best_svdpp.fit(trainset)

train_predictions_svdpp = best_svdpp.test(trainset.build_testset())
test_predictions_svdpp = best_svdpp.test(testset)

# verbose=False: the accuracy helpers print the metric themselves by default,
# which duplicated every value next to the labelled print below.
for split_name, split_predictions in (
    ('Training', train_predictions_svdpp),
    ('Testing', test_predictions_svdpp),
):
    print(f'{split_name} Set - SVDpp RMSE:', acc.rmse(split_predictions, verbose=False))
    print(f'{split_name} Set - SVDpp MAE:', acc.mae(split_predictions, verbose=False))

In [None]:
# Test-set metrics per model. verbose=False keeps the accuracy helpers from
# printing "RMSE: ..." / "MAE: ..." as a side effect of building the dict.
metrics = {
    'SVDpp': {
        'RMSE': rmse(test_predictions_svdpp, verbose=False),
        'MAE': mae(test_predictions_svdpp, verbose=False),
    }
}



In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:

user_id = 667
top_n = 5

# trainset.all_items() yields Surprise *inner* item ids, while predict() takes
# *raw* ids (here the label_product values). Convert before predicting so each
# score is computed for, and reported against, the correct product.
item_ids = [trainset.to_raw_iid(inner_iid) for inner_iid in trainset.all_items()]

# (raw item id, estimated rating) for every item known to the trainset, best first.
user_ratings_sorted = sorted(
    ((item_id, best_svdpp.predict(user_id, item_id).est) for item_id in item_ids),
    key=lambda pair: pair[1],
    reverse=True,
)
recommended_items = user_ratings_sorted[:top_n]

# Convert recommendations to DataFrame
collab_recom = pd.DataFrame(recommended_items, columns=['Item ID Encoded', 'Score'])

print(f"Top {top_n} Recommendations for User {user_id}:")
print("--------------------------------------------")
print(collab_recom)

# **Content-based filtering using TF-IDF**

> Add blockquote



In [None]:
def recommend_products(df, user_id_encoded, top_n=5):
    """Recommend products whose TF-IDF text features are closest to a user's products.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'combined_features', 'label_user',
        'product_id', 'label_product' and 'product_name'. Missing values in
        'combined_features' are replaced with '' on the passed frame itself.
    user_id_encoded
        Value looked up in ``df['label_user']``.
    top_n : int, default 5
        Maximum number of products to return.

    Returns
    -------
    pandas.DataFrame or None
        One row per recommended product (columns 'Id Encoded', 'Product ID',
        'Item ID Encoded', 'Recommended Product', 'Score Recommendation'),
        ordered by decreasing cosine similarity. ``None`` (after printing a
        message) when the user has no rows in ``df``.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    # In-place fill kept from the original behaviour, so code reading the
    # column after this call still sees no NaN.
    df['combined_features'] = df['combined_features'].fillna('')
    tfidf_matrix = tfidf.fit_transform(df['combined_features'])

    # Use row *positions*, not index labels: the TF-IDF matrix and df.iloc are
    # positional, and df's index is not guaranteed to be 0..n-1.
    user_positions = np.flatnonzero((df['label_user'] == user_id_encoded).to_numpy())
    if user_positions.size == 0:
        print("No purchase history found.")
        return None

    # Shape (user rows, all rows). Taking the column-wise maximum scores each
    # product by its closest match in the user's history; flattening instead
    # would yield indices beyond the number of products for multi-row histories.
    similarity = cosine_similarity(tfidf_matrix[user_positions], tfidf_matrix)
    best_scores = similarity.max(axis=0)
    best_scores[user_positions] = -np.inf  # never recommend the user's own rows

    # Stable sort keeps the original row order among equal scores.
    ranked_positions = np.argsort(-best_scores, kind='stable')
    available = len(ranked_positions) - user_positions.size
    top_positions = ranked_positions[:max(min(top_n, available), 0)]

    recommended_products = df.iloc[top_positions]
    results_df = pd.DataFrame({
        # Sized from the actual result so fewer than top_n candidates cannot
        # produce columns of unequal length.
        'Id Encoded': [user_id_encoded] * len(top_positions),
        'Product ID': recommended_products['product_id'].tolist(),
        'Item ID Encoded': recommended_products['label_product'].tolist(),
        'Recommended Product': recommended_products['product_name'].tolist(),
        'Score Recommendation': best_scores[top_positions].tolist()
    })

    return results_df

In [None]:
content = recommend_products(df, 141)
content

# **Content-based filtering using BERT method**

In [None]:
def recomonds_products_bert(df, user_id_encoded, top_n=5, model=None):
    """Recommend products whose sentence embeddings are closest to a user's products.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'combined_features', 'label_user',
        'product_id', 'label_product' and 'product_name'. Missing values in
        'combined_features' are replaced with '' on the passed frame itself.
    user_id_encoded
        Value looked up in ``df['label_user']``.
    top_n : int, default 5
        Maximum number of products to return.
    model : SentenceTransformer, optional
        Encoder to reuse; when omitted, 'all-MiniLM-L6-v2' is loaded.

    Returns
    -------
    pandas.DataFrame or None
        One row per recommended product (columns 'Id Encoded', 'Product ID',
        'Item ID Encoded', 'Recommended Product', 'Score Recommendation'),
        ordered by decreasing cosine similarity. ``None`` (after printing a
        message) when the user has no rows in ``df``.
    """
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
    # In-place fill kept from the original behaviour, so code reading the
    # column after this call still sees no NaN.
    df['combined_features'] = df['combined_features'].fillna('')
    # NumPy output: sklearn's cosine_similarity needs array-like input, and a
    # torch tensor living on a GPU cannot be converted implicitly.
    sentence_embeddings = model.encode(df['combined_features'].tolist(), convert_to_numpy=True)

    # Use row *positions*, not index labels: the embedding matrix and df.iloc
    # are positional, and df's index is not guaranteed to be 0..n-1.
    user_positions = np.flatnonzero((df['label_user'] == user_id_encoded).to_numpy())
    if user_positions.size == 0:
        print("No purchase history found.")
        return None

    # Shape (user rows, all rows); the column-wise maximum scores each product
    # by its closest match in the user's history.
    similarity = cosine_similarity(sentence_embeddings[user_positions], sentence_embeddings)
    best_scores = similarity.max(axis=0)
    best_scores[user_positions] = -np.inf  # never recommend the user's own rows

    # Stable sort keeps the original row order among equal scores.
    ranked_positions = np.argsort(-best_scores, kind='stable')
    available = len(ranked_positions) - user_positions.size
    top_positions = ranked_positions[:max(min(top_n, available), 0)]

    recommended_products = df.iloc[top_positions]
    results = pd.DataFrame({
        # Sized from the actual result so fewer than top_n candidates cannot
        # produce columns of unequal length.
        'Id Encoded': [user_id_encoded] * len(top_positions),
        'Product ID': recommended_products['product_id'].tolist(),
        'Item ID Encoded': recommended_products['label_product'].tolist(),
        'Recommended Product': recommended_products['product_name'].tolist(),
        'Score Recommendation': best_scores[top_positions].tolist()})
    return results

In [None]:
content =recomonds_products_bert(df, 141) #884
content

# **Hybrid method**

In [None]:
import numpy as np

# Raw-id -> inner-id lookup tables of the training set. These are private
# Trainset attributes, so they may change between Surprise versions.
user_id_to_index = trainset._raw2inner_id_users
item_id_to_index = trainset._raw2inner_id_items

# Latent factor arrays of the fitted SVD++ model; the next cell indexes them
# by the inner ids from the tables above.
user_factors = best_svdpp.pu
item_factors = best_svdpp.qi


In [None]:
def _factor_lookup(raw_id, inner_ids, factors):
    """Return the latent vector stored for raw_id, or a zero vector when the id is not in the training set."""
    if raw_id in inner_ids:
        return factors[inner_ids[raw_id]]
    return np.zeros(shape=(best_svdpp.n_factors,))


# Attach each row's user and item latent vector as an object column.
df['user_factors'] = df['label_user'].apply(_factor_lookup, args=(user_id_to_index, user_factors))
df['item_factors'] = df['label_product'].apply(_factor_lookup, args=(item_id_to_index, item_factors))

In [None]:
# Sentence encoder; this model produces 384-dimensional embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every product's combined text features (one embedding per row of df).
descriptions = list(df['combined_features'])
embeddings = model.encode(descriptions, show_progress_bar=True)

In [None]:
embeddings_df = pd.DataFrame(embeddings)

In [None]:
# Append the product key (row order matches df), then give every column a
# readable name: embedding_0 .. embedding_{d-1} followed by the key.
embeddings_df['label_product'] = df['label_product'].to_numpy()
embedding_names = [f'embedding_{i}' for i in range(embeddings.shape[1])]
embeddings_df.columns = embedding_names + ['label_product']
embeddings_df

In [None]:
# Bring user, rating, latent-factor and name columns alongside the embeddings.
metadata_columns = ['label_product', 'label_user', 'rating', 'user_factors', 'item_factors', 'product_name']
embeddings_df = embeddings_df.merge(df[metadata_columns], on='label_product', how='left')

# Integer copies of the encoded ids under explicit names.
for source_col, target_col in (
    ('label_user', 'user_id_encoded'),
    ('label_product', 'item_id_encoded'),
):
    embeddings_df[target_col] = embeddings_df[source_col].astype(int)
embeddings_df.head()

In [None]:
def hybrid_system(user_id_encoded, df, top_n):
    """Recommend products by combining content-based and collaborative similarity.

    Parameters
    ----------
    user_id_encoded
        Value looked up in ``df['label_user']``; the first matching row is
        used as the reference row.
    df : pandas.DataFrame
        Must provide columns named 'embedding_<i>', plus 'label_user',
        'label_product', 'product_name', 'rating', 'user_factors' and
        'item_factors' (the last two holding one vector per row).
    top_n : int
        Maximum number of products to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'label_product', 'product_name', 'rating',
        'content_similarity_score' and 'collaborative_similarity_score', with
        rows ordered by rank (content and collaborative picks alternate).
        An empty DataFrame when the user is not present in ``df``.
    """
    # Row *positions* of the user's rows: every array below is positional, so
    # index labels must not be used to address them.
    user_positions = np.flatnonzero((df['label_user'] == user_id_encoded).to_numpy())
    if user_positions.size == 0:
        return pd.DataFrame()  # Return an empty DataFrame if user ID is not found
    user_pos = int(user_positions[0])

    # Take whatever embedding columns exist instead of assuming exactly 384.
    embedding_columns = [col for col in df.columns if str(col).startswith('embedding_')]
    embeddings = df[embedding_columns].to_numpy()
    item_factors = np.stack(df['item_factors'].values)
    user_factors = np.stack(df['user_factors'].values)

    # Only the reference row's similarities are needed, so compute one row
    # rather than the full N x N matrices.
    content_scores = cosine_similarity(embeddings[[user_pos]], embeddings)[0]
    collaborative_scores = cosine_similarity(user_factors[[user_pos]], item_factors)[0]

    def _top_unseen(scores):
        """Positions of the top_n highest scores, skipping the user's own rows."""
        ranked = np.argsort(-scores, kind='stable')
        return ranked[~np.isin(ranked, user_positions)][:top_n]

    content_top = _top_unseen(content_scores)
    collaborative_top = _top_unseen(collaborative_scores)

    # Merge the two rankings by alternating between them and dropping
    # duplicates. np.unique() would re-sort by row position, so truncating its
    # result kept the lowest-positioned rows rather than the best-scored ones.
    selected = []
    for rank in range(max(len(content_top), len(collaborative_top))):
        for ranking in (content_top, collaborative_top):
            if rank < len(ranking):
                position = int(ranking[rank])
                if position not in selected:
                    selected.append(position)
    top_indices = np.array(selected[:top_n], dtype=int)

    # Retrieve recommended product details; .copy() so the score columns are
    # added to an independent frame rather than to a slice of df.
    recommended_products = df.iloc[top_indices][['label_product', 'product_name', 'rating']].copy()
    recommended_products['content_similarity_score'] = content_scores[top_indices]
    recommended_products['collaborative_similarity_score'] = collaborative_scores[top_indices]

    return recommended_products

In [None]:
specific_user_id = 664
recommended_products = hybrid_system(specific_user_id, embeddings_df, top_n=5)
print(recommended_products)