In [1]:
from pathlib import Path

import pandas as pd


In [13]:
# Read the raw books table and keep a single row per book_id
# (drop_duplicates keeps the first occurrence of each id by default).
books = pd.read_csv('../data/raw/books.csv').drop_duplicates(subset='book_id')


In [5]:
# Discard any book whose title or authors field is missing
books = books.dropna(subset=['title', 'authors'])

# Narrow the frame down to the columns worth keeping
columns_to_keep = [
    'book_id',
    'title',
    'authors',
    'average_rating',
    'ratings_count',
    'original_publication_year',
    'language_code',
]
books_cleaned = books[columns_to_keep]

In [7]:
# Drop rows where the language code or the original publication year is missing
books_cleaned = books_cleaned.dropna(subset=['language_code', 'original_publication_year'])

# Save cleaned version. DataFrame.to_csv does not create missing folders,
# so make sure the output directory exists before writing (no-op if it does).
Path('../data/processed').mkdir(parents=True, exist_ok=True)
books_cleaned.to_csv('../data/processed/01_cleaned_books.csv', index=False)

# Quick preview of the first rows
books_cleaned.head()


Unnamed: 0,book_id,title,authors,average_rating,ratings_count,original_publication_year,language_code
0,1,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,4.34,4780653,2008.0,eng
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré",4.44,4602479,1997.0,eng
2,3,"Twilight (Twilight, #1)",Stephenie Meyer,3.57,3866839,2005.0,en-US
3,4,To Kill a Mockingbird,Harper Lee,4.25,3198671,1960.0,eng
4,5,The Great Gatsby,F. Scott Fitzgerald,3.89,2683664,1925.0,eng


In [15]:
# Load raw ratings data
ratings = pd.read_csv('../data/raw/ratings.csv')

# Drop rows with a missing value in any column
ratings_cleaned = ratings.dropna()

# Remove rows that are exact duplicates across all columns
ratings_cleaned = ratings_cleaned.drop_duplicates()

# Optional: check number of distinct users/books left after cleaning
num_users = ratings_cleaned['user_id'].nunique()
num_books = ratings_cleaned['book_id'].nunique()
print(f"Unique users: {num_users}")
print(f"Unique books rated: {num_books}")

# Save cleaned ratings. DataFrame.to_csv does not create missing folders,
# so make sure the output directory exists before writing (no-op if it does).
Path('../data/processed').mkdir(parents=True, exist_ok=True)
ratings_cleaned.to_csv('../data/processed/02_cleaned_ratings.csv', index=False)

# Preview
ratings_cleaned.head()


Unique users: 53424
Unique books rated: 10000


Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [17]:
# Load raw tags data
tags = pd.read_csv('../data/raw/tags.csv')

# Remove rows that are exact duplicates across all columns
tags_cleaned = tags.drop_duplicates()

# Drop rows whose tag_name is missing (NaN)
tags_cleaned = tags_cleaned.dropna(subset=['tag_name'])

# Save cleaned version. DataFrame.to_csv does not create missing folders,
# so make sure the output directory exists before writing (no-op if it does).
Path('../data/processed').mkdir(parents=True, exist_ok=True)
tags_cleaned.to_csv('../data/processed/03_cleaned_tags.csv', index=False)

# Preview
tags_cleaned.head()


Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [19]:
# Load raw book_tags data
book_tags = pd.read_csv('../data/raw/book_tags.csv')

# Remove rows that are exact duplicates across all columns
book_tags_cleaned = book_tags.drop_duplicates()

# Save cleaned version. DataFrame.to_csv does not create missing folders,
# so make sure the output directory exists before writing (no-op if it does).
Path('../data/processed').mkdir(parents=True, exist_ok=True)
book_tags_cleaned.to_csv('../data/processed/04_cleaned_book_tags.csv', index=False)

# Preview
book_tags_cleaned.head()


Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716
