In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('data/amazon_data.csv', on_bad_lines = 'skip')
df.head()

Unnamed: 0,Title,Category,Sub-Category,Price,Ratings,Total Ratings,Product URL
0,Amazon Basics Universal Travel Case Organizer ...,Electronics,Computers & Accessories,15.0,4.6,17329,/AmazonBasics-Universal-Organizer-Electronics-...
1,WILSON ELECTRONICS 971117 Cellular Booster Acc...,Electronics,Computers & Accessories,12.0,4.5,3,/ELECTRONICS-Accessory-Connector-Electronics-A...
2,A10-16 DIRECT CONNECT CABLE OR Electronics & c...,Electronics,Computers & Accessories,,5.0,2,/A10-16-CONNECT-Electronics-computer-accessori...
3,Bar Fly 4 Prime Aluminum Bicycle Accessory Mou...,Electronics,Computers & Accessories,45.0,4.4,102,/Bar-Fly-Aluminum-Computers-Magellan/dp/B01M66...
4,"Cherry Electronics M-5400, Accessory, Optical ...",Electronics,Computers & Accessories,,5.0,13,/Cherry-Electronics-M-5400-Accessory-Connector...


In [3]:
df.shape

(19387, 7)

In [4]:
df.isnull().sum()

Title               0
Category            0
Sub-Category        0
Price            3902
Ratings          4932
Total Ratings    4932
Product URL         0
dtype: int64

In [5]:
df.isnull().mean()*100

Title             0.000000
Category          0.000000
Sub-Category      0.000000
Price            20.126889
Ratings          25.439728
Total Ratings    25.439728
Product URL       0.000000
dtype: float64

In [6]:
df = df.drop_duplicates(subset = ['Title', 'Category', 'Sub-Category'])

In [7]:
df.head()

Unnamed: 0,Title,Category,Sub-Category,Price,Ratings,Total Ratings,Product URL
0,Amazon Basics Universal Travel Case Organizer ...,Electronics,Computers & Accessories,15.0,4.6,17329,/AmazonBasics-Universal-Organizer-Electronics-...
1,WILSON ELECTRONICS 971117 Cellular Booster Acc...,Electronics,Computers & Accessories,12.0,4.5,3,/ELECTRONICS-Accessory-Connector-Electronics-A...
2,A10-16 DIRECT CONNECT CABLE OR Electronics & c...,Electronics,Computers & Accessories,,5.0,2,/A10-16-CONNECT-Electronics-computer-accessori...
3,Bar Fly 4 Prime Aluminum Bicycle Accessory Mou...,Electronics,Computers & Accessories,45.0,4.4,102,/Bar-Fly-Aluminum-Computers-Magellan/dp/B01M66...
4,"Cherry Electronics M-5400, Accessory, Optical ...",Electronics,Computers & Accessories,,5.0,13,/Cherry-Electronics-M-5400-Accessory-Connector...


In [8]:
df['combined_features'] = df['Title']+ ' ' + df['Category'] + ' ' + df['Sub-Category']

In [9]:
df['combined_features']

0        Amazon Basics Universal Travel Case Organizer ...
1        WILSON ELECTRONICS 971117 Cellular Booster Acc...
2        A10-16 DIRECT CONNECT CABLE OR Electronics & c...
3        Bar Fly 4 Prime Aluminum Bicycle Accessory Mou...
4        Cherry Electronics M-5400, Accessory, Optical ...
                               ...                        
19381    Reebok Calf Compression Sleeves for Women and ...
19382    N/A Fitness Trainning Yoga Short Pants Women G...
19383    DKNY Women's Bike Color Block High Waist Short...
19384    SYROKAN High Impact Sports Bras for Women High...
19385    O2 Rainwear Original Cycling Jacket Sports & F...
Name: combined_features, Length: 16670, dtype: object

In [10]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df['combined_features'])

In [11]:
def get_recommendations(title, n=5):
    indices = pd.Series(df.index, index=df['Title']).drop_duplicates()
    idx = indices[title]
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]  
    
    product_indices = [i[0] for i in sim_scores]
    return df[['Title', 'Category', 'Sub-Category', 'Price', 'Ratings']].iloc[product_indices]

In [12]:
def filter_by_price_and_rating(recs, max_price=None, min_rating=None):
    if max_price:
        recs = recs[recs['Price'] <= max_price]
    if min_rating:
        recs = recs[recs['Ratings'] >= min_rating]
    return recs