In [2]:
import pandas as pd

# Location of the gzipped JSON-lines review dump.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
DATA_PATH = r'C:\Users\shahe\OneDrive\Desktop\ProductRecommendationSystem\data\Electronics_5.json.gz'

# Number of lines pulled from the start of the file
CHUNK_ROWS = 10000

# With chunksize set, read_json returns a lazy reader instead of parsing the
# whole file. The `with` block closes the underlying file handle as soon as
# the first chunk has been taken, instead of leaving it open for the lifetime
# of the kernel.
with pd.read_json(DATA_PATH, lines=True, chunksize=CHUNK_ROWS) as sample:
    # Only the first chunk (first 10,000 lines) is materialised as a DataFrame
    df = next(sample)

# df.info() writes its report itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the report.
df.info()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         10000 non-null  int64  
 1   vote            1649 non-null   float64
 2   verified        10000 non-null  bool   
 3   reviewTime      10000 non-null  object 
 4   reviewerID      10000 non-null  object 
 5   asin            10000 non-null  object 
 6   style           4214 non-null   object 
 7   reviewerName    9999 non-null   object 
 8   reviewText      9999 non-null   object 
 9   summary         10000 non-null  object 
 10  unixReviewTime  10000 non-null  int64  
 11  image           42 non-null     object 
dtypes: bool(1), float64(1), int64(2), object(8)
memory usage: 869.3+ KB
None
   overall  vote  verified   reviewTime      reviewerID        asin  \
0        5  67.0      True  09 18, 1999   AAP7PPBU72QFM  0151004714   
1        3   5.0      True  10 23, 2013  A

In [3]:
# Gather the headline statistics first, then report them in one place.
reviewer_count = df['reviewerID'].nunique()
product_count = df['asin'].nunique()
mean_rating = df['overall'].mean()
missing_per_column = df.isnull().sum()
busiest_products = df['asin'].value_counts().head()

# Distinct reviewers and distinct products in the loaded rows
print("Unique users:", reviewer_count)
print("Unique products:", product_count)

# Mean of the 'overall' rating column
print("Average rating:", mean_rating)

# Count of null entries for every column
print(missing_per_column)

# Products with the largest number of reviews
print(busiest_products)


Unique users: 9516
Unique products: 327
Average rating: 4.2689
overall              0
vote              8351
verified             0
reviewTime           0
reviewerID           0
asin                 0
style             5786
reviewerName         1
reviewText           1
summary              0
unixReviewTime       0
image             9958
dtype: int64
asin
0972683275    479
B00000J1T1    449
B00000J1U8    448
B00000J1V5    439
6073894996    434
Name: count, dtype: int64


In [5]:
# A row is only usable for the ratings matrix when the user, the product and
# the rating are all present; rows missing any of the three are discarded.
required_cols = ['reviewerID', 'asin', 'overall']
df_clean = df.dropna(subset=required_cols)


In [6]:
# Reshape the long review table into a user x product grid:
# one row per reviewerID, one column per asin, cell = 'overall' rating
# (averaged when the same user/product pair appears more than once).
ratings_matrix = df_clean.pivot_table(
    values='overall',
    index='reviewerID',
    columns='asin',
    aggfunc='mean',
)
matrix_shape = ratings_matrix.shape
print(matrix_shape)
ratings_matrix.head()


(9516, 327)


asin,0101635370,0151004714,0380709473,0446697192,0511189877,0528881469,0545105668,0557348153,0594033926,0594296420,...,B00000JQR5,B00000JS2J,B00000JSES,B00000JSGF,B00000JXV1,B00000JYLO,B00000JYVT,B00000JYWQ,B00000K13I,B00000K13L
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0266076X6KPZ6CCHGVS,,,,,,,,,,,...,,,,,,,,,,
A0273990TGLE0LLF0H0B,,,,,,,,,,,...,,,,,,,,,,
A0718466DJ0Y591VLAL7,,,,,,,,,,,...,,,,,,,,,,
A1005332P0RIWL,,,,,,,,,,,...,,,,,,,,,,
A100CCTHOI884M,,,,,,,,,,,...,,,,,,,,,,


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Missing ratings become 0 so every product has a complete numeric vector
ratings_filled = ratings_matrix.fillna(0)

# Flip the axes: rows are now products, columns are users
item_matrix = ratings_filled.T
product_ids = item_matrix.index

# Pairwise cosine similarity between every pair of product rows
similarity = cosine_similarity(item_matrix)

# Label both axes with the product ids so scores can be looked up by ASIN
similarity_df = pd.DataFrame(similarity, index=product_ids, columns=product_ids)
similarity_df.head()


asin,0101635370,0151004714,0380709473,0446697192,0511189877,0528881469,0545105668,0557348153,0594033926,0594296420,...,B00000JQR5,B00000JS2J,B00000JSES,B00000JSGF,B00000JXV1,B00000JYLO,B00000JYVT,B00000JYWQ,B00000K13I,B00000K13L
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101635370,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151004714,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
380709473,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446697192,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
511189877,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
product_id = '0151004714'  # replace with any product you like
top_n = 5  # how many neighbours to report

# Get top 5 most similar products (excluding itself).
# The query product is removed by label before ranking. Skipping the first
# sorted position is not reliable: another product can tie with it at 1.0
# (or edge past it through floating-point rounding), in which case the query
# product would be recommended to itself and a genuine neighbour dropped.
top_similar = (
    similarity_df[product_id]
    .drop(labels=product_id)
    .sort_values(ascending=False)
    .head(top_n)
)

print("Products similar to:", product_id)
print(top_similar)


Products similar to: 0151004714
asin
B00000J1UB    0.0
B00000J1U8    0.0
B00000J1U5    0.0
B00000J1TY    0.0
B00000J1TX    0.0
Name: 0151004714, dtype: float64


## 🎉 Recommendation Results
Products similar to: `0151004714`

| ASIN        | Similarity Score |
|-------------|------------------|
| B00000J1UB  | 0.0              |
| B00000J1U8  | 0.0              |
| B00000J1U5  | 0.0              |
| B00000J1TY  | 0.0              |
| B00000J1TX  | 0.0              |

---

## 📌 Conclusion
We built a basic product recommendation system using **item-based collaborative filtering**  
on the Amazon Electronics 5-core data (only the first 10,000 rows of the file, not a random sample).

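⚠️ The first 10,000 rows contain 9,516 distinct users but only 327 products, so almost no user rated more than one product. The item vectors therefore rarely overlap and the similarity scores are mostly zero; the five products listed above are arbitrary ties at 0.0, not real recommendations.  
📦 On the full dataset, where the same users appear across many products, the same approach should produce non-zero similarities and meaningful recommendations.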
