In [28]:
import pandas as pd
import chardet

# --- Load ---------------------------------------------------------------
# Sniff the text encoding from the file's raw bytes so that read_csv is given
# an explicit encoding rather than relying on its default.
csv_file = 'dataset_buku.csv'
with open(csv_file, 'rb') as file:
    raw_bytes = file.read()
result = chardet.detect(raw_bytes)

# chardet reports its guess under the 'encoding' key; echo it for the reader.
# NOTE(review): chardet may report None when it cannot decide — confirm that
# falling through to read_csv's default is acceptable in that case.
encoding = result['encoding']
print(encoding)

# Parse the CSV using the detected encoding.
data = pd.read_csv(csv_file, encoding=encoding)

# --- Trim ---------------------------------------------------------------
# Columns that are not needed downstream.
columns_to_drop = [
    "ISBN",
    "RatingDistTotal",
    "RatingDist3",
    "distance",
    "ratings_dist",
    "RatingDist1",
    "RatingDist2",
    "RatingDist4",
    "RatingDist5",
    "indices",
    "small_image_url",
]
data = data.drop(columns=columns_to_drop)

# --- Publish date -------------------------------------------------------
# Join the three publish components into "Y-M-D" text and parse it as a
# datetime. errors='coerce' turns any row that does not match the format
# into NaT instead of raising.
year_text = data['PublishYear'].astype(str)
month_text = data['PublishMonth'].astype(str)
day_text = data['PublishDay'].astype(str)
data["Publish Date"] = pd.to_datetime(
    year_text + '-' + month_text + '-' + day_text,
    format='%Y-%m-%d',
    errors='coerce',
)

# The separate components are redundant once "Publish Date" exists.
data = data.drop(columns=['PublishDay', 'PublishMonth', 'PublishYear'])

# Preview the first rows of the prepared frame.
data.head()

utf-8


Unnamed: 0,book_id,title,num_of_pages,Language,Authors,Publisher,CountsOfReview,Rating,image_url,Publish Date
0,1,Harry Potter and the Half-Blood Prince (Harry ...,652,eng,J.K. Rowling,Scholastic Inc.,28062,4.57,https://images.gr-assets.com/books/1361039191m...,2006-09-16
1,2,Harry Potter and the Order of the Phoenix (Har...,870,eng,J.K. Rowling,Scholastic Inc.,29770,4.5,https://images.gr-assets.com/books/1387141547m...,2004-09-01
2,5,Harry Potter and the Prisoner of Azkaban (Harr...,435,eng,J.K. Rowling,Scholastic Inc.,37093,4.57,https://images.gr-assets.com/books/1499277281m...,2004-05-01
3,10,Harry Potter Collection (Harry Potter #1-6),3342,eng,J.K. Rowling,Scholastic,809,4.73,https://images.gr-assets.com/books/1328867351m...,2005-09-12
4,13,The Ultimate Hitchhiker's Guide to the Galaxy ...,815,eng,Douglas Adams,Del Rey Books,4119,4.37,https://images.gr-assets.com/books/1404613595m...,2002-04-30


In [29]:
# Show the frame's dimensions as a (rows, columns) tuple.
data.shape

(861, 10)

In [30]:
# Take a random subset of the books. Fixing the seed means the same rows are
# drawn each time this cell runs against the same input frame.
SAMPLE_SIZE = 400
SAMPLE_SEED = 42
sampled_books = data.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

# Confirm the subset's dimensions, then preview its first rows.
print(sampled_books.shape)
sampled_books.head()

(400, 10)


Unnamed: 0,book_id,title,num_of_pages,Language,Authors,Publisher,CountsOfReview,Rating,image_url,Publish Date
714,33418,Parallel Worlds: A Journey through Creation Hi...,361,eng,Michio Kaku,Anchor,456,4.18,https://images.gr-assets.com/books/1435244003m...,2006-02-14
605,25200,Silence,201,eng,Shūsaku Endō,Taplinger Publishing Company,2125,4.08,https://images.gr-assets.com/books/1327991351m...,1999-01-01
120,4264,Fever Pitch,247,eng,Nick Hornby,Riverhead Books,1081,3.74,https://images.gr-assets.com/books/1426114203m...,1998-03-01
208,6534,Postmortem (Kay Scarpetta #1),342,eng,Patricia Daniels Cornwell,Pocket Books,2167,4.02,https://images.gr-assets.com/books/1341835831m...,2003-12-30
380,13270,Poetics,144,eng,Aristotle,Penguin Classics,491,3.83,https://images.gr-assets.com/books/1348161995m...,1996-09-26


In [31]:
# Persist the sampled subset to disk. The row index is written too (the
# pandas default, spelled out here), so the file's first column holds the
# sampled rows' original index labels.
output_csv = "used_dataset_buku.csv"
sampled_books.to_csv(output_csv, index=True)