In [None]:
# imports
import datetime

import pandas as pd

from pathlib import Path

In [None]:
# read the raw dataset; the first CSV column is used as the row index
path = Path('../../data/raw_data/google_books_dataset.csv')
data = pd.read_csv(filepath_or_buffer=path, index_col=0)

# preview the first 5 rows
data.head(n=5)

In [None]:
# summary of the freshly loaded frame (column dtypes and non-null counts);
# useful for spotting columns with many missing values before cleaning
data.info()

In [None]:
# drop column with lots of NaN values; errors='ignore' keeps this cell re-runnable
# (data is rebound below, so on a second run the column is already gone)
data = data.drop(columns='averageRating', errors='ignore')

# parse published date; values that do not match YYYY-MM-DD exactly become NaT
# NOTE(review): partial dates (e.g. year-only strings), if the raw data contains any,
# are also coerced to NaT and removed by the filter below -- confirm this is intended
data['publishedDate'] = pd.to_datetime(data['publishedDate'], errors='coerce', format='%Y-%m-%d')

# keep only rows published today or earlier; NaT compares as False, so rows with an
# unparseable date are dropped here as well. Comparing datetime64 values directly
# avoids building an object column of Python dates via .dt.date
today = pd.Timestamp(datetime.date.today())
data = data[data['publishedDate'] <= today]

# drop rows that have a NaN in any remaining column
data = data.dropna()

# drop duplicated rows
data = data.drop_duplicates()

# sort oldest first and show first 5 rows to check publishedDate (if the date is not too old)
data = data.sort_values(by='publishedDate', ascending=True).reset_index(drop=True)
data.head()

In [None]:
# summary of the data that remains after the cleaning cell above
# (row count, column dtypes and non-null counts)
data.info()

In [None]:
# count how often each title occurs, then list every row whose title occurs more than once
counts = data['title'].value_counts()
repeated_titles = counts.index[(counts > 1).to_numpy()]
not_unique_data = data.loc[data['title'].isin(repeated_titles)]

not_unique_data

In [None]:
# there is no clear criterion for choosing which copy of a repeated title to keep,
# so every row whose title occurs more than once is removed (keep=False marks all copies)
is_repeated_title = data.duplicated(subset=['title'], keep=False)
data = data.loc[~is_repeated_title].reset_index(drop=True)

In [None]:
# summary of the data that remains after removing rows with repeated titles
data.info()

In [None]:
# write the cleaned frame to disk, creating the target directory first if it is missing
output_path = Path('../../data/clean_data/books.csv')
output_dir = output_path.parent
output_dir.mkdir(parents=True, exist_ok=True)

# index=False: the positional index is not written to the CSV
data.to_csv(output_path, index=False)