In [13]:
import pandas as pd

In [14]:
# Read the raw review table; the `id` column becomes the row index
df = pd.read_csv(
    filepath_or_buffer='./reviews.csv',
    index_col='id',
)

In [15]:
# Data Cleaning Steps ---------------------------------------------------------

# 1. Check for empty price, OS, and color
def check_empty_values(data=None):
    """Print how many rows are missing a price, an OS, or a color.

    Args:
        data: DataFrame with ``price``, ``os`` and ``color`` columns. When
            omitted, the notebook-level ``df`` is used, so existing
            zero-argument calls keep working.

    Returns:
        None. The three counts are written to stdout.

    Raises:
        KeyError: if any of the three columns is absent from the frame.
    """
    frame = df if data is None else data

    # NOTE(review): these are row counts. The labels say "Products", but an
    # ASIN that appears on several rows is counted once per row -- confirm
    # before quoting the numbers as product counts.
    labels = {'price': 'price', 'os': 'OS', 'color': 'color'}
    missing = frame[list(labels)].isna().sum()

    for column, label in labels.items():
        print(f"Products with empty {label}: {int(missing[column])}")

# 2. Check for multiple colors/OS per ASIN
def check_variations(data=None):
    """Print how many ASINs carry more than one distinct color or OS value.

    Args:
        data: DataFrame with ``asin``, ``color`` and ``os`` columns. When
            omitted, the notebook-level ``df`` is used, so existing
            zero-argument calls keep working.

    Returns:
        None. The two counts are written to stdout.

    Raises:
        KeyError: if a required column is absent from the frame.
    """
    frame = df if data is None else data

    # One grouped pass yields the distinct-value count of both columns.
    # nunique() ignores missing values by default, so an ASIN that mixes a
    # real value with NaN still counts as having a single value.
    distinct = frame.groupby('asin')[['color', 'os']].nunique()
    multi = (distinct > 1).sum()

    print(f"\nASINs with multiple colors: {int(multi['color'])}")
    print(f"ASINs with multiple OS: {int(multi['os'])}")

# 3. Check title/features consistency
def check_consistency(data=None):
    """Print how many ASINs have more than one distinct title or feature list.

    Args:
        data: DataFrame with ``asin``, ``title_y`` and ``features`` columns.
            When omitted, the notebook-level ``df`` is used, so existing
            zero-argument calls keep working.

    Returns:
        None. The two counts are written to stdout. Returning nothing also
        keeps the cell free of an ``Out[...]`` value when this call is the
        cell's last expression.

    Raises:
        KeyError: if a required column is absent from the frame.
    """
    frame = df if data is None else data
    per_asin = frame.groupby('asin')

    # nunique() skips missing values by default, so only genuinely different
    # non-null values within an ASIN are reported as inconsistent.
    conflicts = {
        column: int((per_asin[column].nunique() > 1).sum())
        for column in ('title_y', 'features')
    }

    print(f"\nASINs with inconsistent titles: {conflicts['title_y']}")
    print(f"ASINs with inconsistent features: {conflicts['features']}")

# Execute every data-quality check in order, under a single heading
print("Data Quality Checks:")
for run_check in (check_empty_values, check_variations, check_consistency):
    run_check()


Data Quality Checks:
Products with empty price: 1428
Products with empty OS: 15
Products with empty color: 284

ASINs with multiple colors: 0
ASINs with multiple OS: 0

ASINs with inconsistent titles: 0
ASINs with inconsistent features: 0


In [16]:
# Data Processing -------------------------------------------------------------

# Per-ASIN metrics. Every intermediate frame is built by a single
# non-mutating expression (no `inplace=True`), so re-running this cell always
# produces the same objects. The groupby is created once and reused.
reviews_by_asin = df.groupby('asin')

# Mean rating per ASIN, stored under the column name `avg_rating`
asin_avg_rating = (
    reviews_by_asin['rating'].mean().rename('avg_rating').reset_index()
)

# Number of rows (reviews) per ASIN
asin_review_count = reviews_by_asin.size().reset_index(name='num_reviews')

# Mean price per ASIN; mean() skips missing prices, so an ASIN whose rows all
# lack a price ends up with NaN here
asin_avg_price = reviews_by_asin['price'].mean().reset_index()

# Product attributes: first() takes the first non-null value of each column
# within the ASIN, so the four values need not come from the same row
product_attributes = (
    reviews_by_asin[['title_y', 'features', 'os', 'color']].first().reset_index()
)

# Create summary table: one row per ASIN with metrics followed by attributes
asin_summary = (
    asin_avg_rating
    .merge(asin_review_count, on='asin')
    .merge(asin_avg_price, on='asin')
    .merge(product_attributes, on='asin', how='left')
)
assert asin_summary['asin'].is_unique, "asin_summary must have one row per ASIN"

# Clean original dataset: drop the columns not needed in the review-level
# export; `df` itself is left untouched
columns_to_remove = [
    'brand', 'user_id', 'main_category', 'store', 'categories',
    'bought_together', 'subtitle', 'author', 'num_reviews',
    'average_rating', 'rating_number', 'avg_helpful_votes', 'os', 'color'
]
df_clean = df.drop(columns=columns_to_remove)

In [17]:
# Preview the first five rows of the cleaned review table
df_clean.head(n=5)

Unnamed: 0_level_0,rating,title_x,text,asin,timestamp,helpful_vote,title_y,features,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,5,Fast!,I have been using laptops for the past 30 year...,B089HR6CQP,1601470000000.0,0,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0
1,5,Works Great - No Problems - Great Value for Price,"I read a bunch of negative reviews, so I wante...",B089HR6CQP,1626710000000.0,0,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0
2,5,Excellent gaming computer for the price,"Runs smooth, fast, and works well for gaming. ...",B089HR6CQP,1614820000000.0,0,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0
3,1,Overheating on setup,I'm impressed. I've never seen a computer so w...,B089HR6CQP,1626820000000.0,0,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0
4,1,Don’t buy,"I bought 2 of these laptops, asked for replace...",B089HR6CQP,1611410000000.0,22,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0


In [20]:
# Display the per-ASIN summary table as the cell's output
asin_summary

Unnamed: 0,asin,avg_rating,num_reviews,price,title_y,features,os,color
0,B004PANKIA,5.000000,1,,FusionTech ABC 15 Laptop - High-Octane Enterta...,"[Genuine Windows 7 Home Premium, 64bit, Intel ...",Windows 7;,Silver
1,B0052F35I2,3.750000,12,,"FusionTech ABC 17 Laptop, i7-2630QM, 8GB DDR3 ...","[Intel Core i7 Processor 2GHz, 6GB DDR3 RAM, 6...",Windows 7,
2,B0057CAGUA,5.000000,1,,FusionTech Worktop 17R i17R Core i5-2430M 2.4G...,[],Microsoft Windows 7 Home Premium (64 bit),Red
3,B005SDDXF8,4.333333,6,,"FusionTech ABC 15 L502X Intel Core i5-2410, 2....",[2nd generation Intel Core i5-2 410M processor...,Windows 7 Home Premium (64-bit),
4,B0081YPX3Q,2.642857,14,,FusionTech ABC15-9375sLV 15-Inch Laptop (2.1 G...,"[Intel Core i7 3612QM Processor 2.1GHz, 8 GB D...",Windows 7,Silver
...,...,...,...,...,...,...,...,...
192,B0B8C1N39G,4.500000,2,,FusionTech Newest G15 15.6 Inch FHD 120Hz LED ...,[Most Powerful and Fast AMD Octa-Core Ryzen 7 ...,Windows 11,
193,B0BG6BD5PZ,2.000000,1,1499.00,FusionTech Sharp 5560 Workstation Laptop PC FH...,[✨【 PROCESSOR 】Intel 11th Generation Core i7-1...,Windows 10 Pro,Silver
194,B0BLJKNRC9,5.000000,1,,"2020 FusionTech ABC 9700 Laptop 17"" - Intel Co...",[[Intel Core i7 Processor] Released in 2020 Th...,Windows 10 Pro,Platinum Silver
195,B0BQ9J99BF,1.000000,1,1089.99,"FusionTech Mercury 7620 Business Laptop, 16"" F...",[【High Speed RAM And Enormous Space】24GB DDR5 ...,Windows 11 Pro,Black


In [None]:
# Export files -----------------------------------------------------------------
# Write both tables as CSV without their row index, then report the row counts.
# NOTE(review): `index=False` also omits df_clean's index from the file; if that
# index is still the `id` column set when the data was loaded, the review ids
# are not exported -- confirm this is intended.
asin_summary.to_csv('../rawdata/asin_summary.csv', index=False)
df_clean.to_csv('../rawdata/cleaned_reviews.csv', index=False)

print(
    "\nProcessing complete:",
    f"- ASIN summary saved to asin_summary.csv ({len(asin_summary)} products)",
    f"- Cleaned reviews saved to cleaned_reviews.csv ({len(df_clean)} rows)",
    sep="\n",
)


Processing complete:
- ASIN summary saved to asin_summary.csv (197 products)
- Cleaned reviews saved to cleaned_reviews.csv (4132 rows)
