In [5]:
import csv
import re

# Input text file
input_file = "amazon-meta.txt"
output_product_file = "amazon_products.csv"
output_review_file = "amazon_reviews.csv"

# Define field headers for product and review CSVs
product_headers = ["Id", "ASIN", "Title", "Group", "SalesRank", "SimilarProducts", "Categories"]
review_headers = ["ASIN", "ReviewDate", "CustomerID", "Rating", "Votes", "Helpfulness"]

# Function to parse the Amazon metadata
def parse_amazon_meta(input_file, output_product_file, output_review_file):
    with open(input_file, 'r', encoding='utf-8') as f, \
         open(output_product_file, 'w', newline='', encoding='utf-8') as product_csv, \
         open(output_review_file, 'w', newline='', encoding='utf-8') as review_csv:

        product_writer = csv.writer(product_csv)
        review_writer = csv.writer(review_csv)

        # Write column headers
        product_writer.writerow(product_headers)
        review_writer.writerow(review_headers)

        product_data = {}
        current_asin = None  # Track the current product ASIN for reviews

        for line in f:
            line = line.strip()

            if line.startswith("Id:"):
                if product_data and "ASIN" in product_data:  # Save previous product data
                    product_writer.writerow([
                        product_data.get("Id", ""),
                        product_data.get("ASIN", ""),
                        product_data.get("Title", ""),
                        product_data.get("Group", ""),
                        product_data.get("SalesRank", ""),
                        "|".join(product_data.get("SimilarProducts", [])),
                        "|".join(product_data.get("Categories", []))
                    ])
                product_data = {"SimilarProducts": [], "Categories": []}  # Reset product data
                product_data["Id"] = line.split("Id:")[1].strip()

            elif line.startswith("ASIN:"):
                product_data["ASIN"] = line.split("ASIN:")[1].strip()
                current_asin = product_data["ASIN"]  # Store ASIN for reviews

            elif line.startswith("title:"):
                product_data["Title"] = line.split("title:")[1].strip()

            elif line.startswith("group:"):
                product_data["Group"] = line.split("group:")[1].strip()

            elif line.startswith("salesrank:"):
                product_data["SalesRank"] = line.split("salesrank:")[1].strip()

            elif line.startswith("similar:"):
                similar_products = line.split()[2:]  # Ignore first two words
                product_data["SimilarProducts"] = similar_products

            elif line.startswith("categories:"):
                categories = []
                while True:
                    next_line = f.readline().strip()
                    if not next_line.startswith("|"):
                        break
                    categories.append(next_line.replace("|", "").strip())
                product_data["Categories"] = categories

            elif line.startswith("reviews:"):
                # Skip total/downloaded count, extract reviews below
                continue  

            elif re.match(r'^\d{4}-\d{1,2}-\d{1,2}', line):  # If line starts with a date (YYYY-MM-DD)
                parts = line.split()
                review_date = parts[0]
                customer_id = parts[2]
                rating = parts[4]
                votes = parts[6]
                helpfulness = parts[8]

                # Write review entry
                if current_asin:
                    review_writer.writerow([
                        current_asin, review_date, customer_id, rating, votes, helpfulness
                    ])

            elif line.endswith("discontinued product"):  # Handle discontinued products
                product_data["Title"] = "DISCONTINUED"
                product_data["Group"] = "DISCONTINUED"
                product_data["SalesRank"] = ""
                product_data["SimilarProducts"] = []
                product_data["Categories"] = []

        # Write last product in case it's missing
        if product_data and "ASIN" in product_data:
            product_writer.writerow([
                product_data.get("Id", ""),
                product_data.get("ASIN", ""),
                product_data.get("Title", ""),
                product_data.get("Group", ""),
                product_data.get("SalesRank", ""),
                "|".join(product_data.get("SimilarProducts", [])),
                "|".join(product_data.get("Categories", []))
            ])

# Run the parser
parse_amazon_meta(input_file, output_product_file, output_review_file)

print(f"CSV files saved: {output_product_file}, {output_review_file}")


CSV files saved: amazon_products.csv, amazon_reviews.csv


In [8]:
import pandas as pd

products = pd.read_csv('amazon_products.csv')
reviews =  pd.read_csv('amazon_reviews.csv')

In [7]:
products.head()

Unnamed: 0,Id,ASIN,Title,Group,SalesRank,SimilarProducts,Categories
0,0,771044445,DISCONTINUED,DISCONTINUED,,,
1,1,827229534,Patterns of Preaching: A Sermon Sampler,Book,396585.0,0804215715|156101074X|0687023955|0687074231|08...,Books[283155]Subjects[1000]Religion & Spiritua...
2,2,738700797,Candlemas: Feast of Flames,Book,168596.0,0738700827|1567184960|1567182836|0738700525|07...,Books[283155]Subjects[1000]Religion & Spiritua...
3,3,486287785,World War II Allied Fighter Planes Trading Cards,Book,1270652.0,,Books[283155]Subjects[1000]Home & Garden[48]Cr...
4,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,Book,631289.0,0842328130|0830818138|0842330313|0842328610|08...,Books[283155]Subjects[1000]Religion & Spiritua...


In [9]:
reviews.head()

Unnamed: 0,ASIN,ReviewDate,CustomerID,Rating,Votes,Helpfulness
0,827229534,2000-7-28,A2JW67OY8U6HHK,5,10,9
1,827229534,2003-12-14,A2VE83MZF98ITY,5,6,5
2,738700797,2001-12-16,A11NCO6YTE4BTJ,5,5,4
3,738700797,2002-1-7,A9CQ3PLRNIR83,4,5,5
4,738700797,2002-1-24,A13SG9ACZ9O5IM,5,8,8


In [10]:
print("Total product_id count:", products.shape[0])
print("Unique product_id count:", products['Id'].nunique())

# Find duplicates
duplicates = products[products.duplicated(subset=['Id'], keep=False)]
print("Duplicate product_id entries:\n", duplicates)

Total product_id count: 548552
Unique product_id count: 548552
Duplicate product_id entries:
 Empty DataFrame
Columns: [Id, ASIN, Title, Group, SalesRank, SimilarProducts, Categories]
Index: []


In [11]:
print("Total ASIN count:", products.shape[0])
print("Unique ASIN count:", products['ASIN'].nunique())

# Find duplicate ASINs
duplicates = products[products.duplicated(subset=['ASIN'], keep=False)]
print("Duplicate ASIN entries:\n", duplicates)

Total ASIN count: 548552
Unique ASIN count: 548552
Duplicate ASIN entries:
 Empty DataFrame
Columns: [Id, ASIN, Title, Group, SalesRank, SimilarProducts, Categories]
Index: []
