In [None]:
import os

import numpy as np
import pandas as pd
import requests
from PIL import Image

# Explore dataset

First, we need to know which categories are present in the dataset.

In [None]:
# Load the dataset here so the exploration cells run on a fresh kernel
# (Restart & Run All); this is the same CSV the "Cleaning data" section reads.
df = pd.read_csv('https://raw.githubusercontent.com/Shirone0110/BookCover/main/main_dataset.csv')
df['category'].unique()

Categories: 'Medical', 'Science-Geography', 'Art-Photography', 'Biography',
       'Business-Finance-Law', 'Childrens-Books', 'Computing',
       'Crafts-Hobbies', 'Crime-Thriller', 'Dictionaries-Languages',
       'Entertainment', 'Food-Drink', 'Graphic-Novels-Anime-Manga',
       'Health', 'History-Archaeology', 'Home-Garden', 'Humour',
       'Mind-Body-Spirit', 'Natural-History', 'Personal-Development',
       'Poetry-Drama', 'Reference', 'Religion', 'Romance',
       'Science-Fiction-Fantasy-Horror', 'Society-Social-Sciences',
       'Sport', 'Stationery', 'Teaching-Resources-Education',
       'Technology-Engineering', 'Teen-Young-Adult', 'Transport',
       'Travel-Holiday-Guides'

We want to find the smallest book cover size so that we can scale every image accordingly.

In [None]:
#finding min image size
# Scan every cover URL and record the smallest height and width seen.
# Seed with infinity instead of a guessed upper bound (500): a fixed seed
# would silently be reported as the minimum if every cover were larger.
# If df has no rows, "inf" is printed.
min_h = float('inf')
min_w = float('inf')
for url in df['image']:
  # timeout: a single stalled download must not hang the whole scan;
  # the with-block releases the connection once the header has been read
  with requests.get(url, stream = True, timeout = 30) as resp:
    resp.raise_for_status()               #fail on HTTP errors instead of parsing an error page
    with Image.open(resp.raw) as im:
      w, h = im.size                      #PIL reports (width, height); no pixel decoding needed
  min_h = min(min_h, h)
  min_w = min(min_w, w)
print(min_h)
print(min_w)

We drop the irrelevant features so that the dataframe is easier to view.

In [None]:
#drop irrelevant columns
# errors='ignore' keeps this cell re-runnable: df is reassigned in place, so a
# second execution would otherwise raise KeyError for the already-removed columns.
df = df.drop(
    columns = ['format', 'book_depository_stars', 'price', 'currency', 'old_price', 'isbn', 'img_paths'],
    errors = 'ignore',
)

Next, we check whether a book can appear under several categories.

In [None]:
# Group the rows by book name and gather, for each name, the list of
# categories it is filed under; a list longer than one means the same
# book appears in several categories.
df2 = (
    df.groupby('name')['category']
      .apply(list)
      .reset_index()
)
df2

Unnamed: 0,name,category
0,"""A Doll's House""",[Poetry-Drama]
1,"""A Raisin in the Sun""",[Poetry-Drama]
2,"""King Lear""",[Poetry-Drama]
3,"""Life of Galileo""",[Teaching-Resources-Education]
4,"""Soul Surfer: A True Story of Faith, Family, a...","[Biography, Sport]"
...,...,...
20585,the princess saves herself in this one,"[Childrens-Books, Poetry-Drama]"
20586,the witch doesn't burn in this one,"[Poetry-Drama, Society-Social-Sciences]"
20587,von Martius. The Book of Palms,"[Home-Garden, Natural-History]"
20588,wd~50,[Technology-Engineering]


There are 32581 entries but only 20590 unique book names, so some books are listed under more than one category.

# Cleaning data

In [None]:
# Read the raw dataset from GitHub and, in the same step, discard the
# columns this notebook treats as irrelevant.
url = 'https://raw.githubusercontent.com/Shirone0110/BookCover/main/main_dataset.csv'
df = pd.read_csv(url).drop(
    columns = [
        'format',
        'book_depository_stars',
        'price',
        'currency',
        'old_price',
        'isbn',
        'img_paths',
    ]
)
# Target size for every cover, in PIL's (width, height) order.
newsize = (128, 200)

In [None]:
# Mount Google Drive at /content/drive so that later cells can write files
# under '/content/drive/My Drive/...'.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell is expected to fail elsewhere - confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Now we resize every image to 128 x 200 pixels (width x height) with 3 color channels (RGB) and save each one to a Google Drive folder using a common file name format (`image_<row number>.jpg`).

In [None]:
#resize images and save to google drive
# For every row: download the cover, normalise it to RGB at `newsize`, keep a
# flattened copy of the pixels in im_array, and store the resized JPEG on Drive
# as image_<row position>.jpg.
save_dir = '/content/drive/My Drive/BookCovers'
if not os.path.isdir(save_dir):
  # Image.save() does not create missing folders. Only the last level is
  # created, so an unmounted Drive still fails loudly instead of silently
  # writing to the local disk.
  os.mkdir(save_dir)

im_array = []
flat = newsize[0] * newsize[1] * 3                      #width * height * 3 RGB channels
for i, url in enumerate(df['image']):                   #i is the row position, independent of the index labels
  # timeout: one stalled download must not hang the whole run;
  # the with-block releases the connection after the image is decoded
  with requests.get(url, stream = True, timeout = 30) as resp:
    resp.raise_for_status()                             #fail on HTTP errors instead of parsing an error page
    with Image.open(resp.raw) as im:                    #open image
      # Convert before resizing: palette ('P') and bilevel ('1') images are
      # always resampled with nearest-neighbour, which degrades the result.
      im1 = im.convert('RGB').resize(newsize)           #resize image
  arr = np.array(im1).reshape(flat)                     #turn image into array
  im_array.append(arr)
  save_path = os.path.join(save_dir, 'image_' + str(i) + '.jpg')
  im1.save(save_path, 'JPEG')                           #save image to drive

In [None]:
# Attach the flattened pixel arrays collected by the resize loop as a new
# column. im_array was filled in row order, so entry i belongs to row i.
# NOTE(review): this presumably requires len(im_array) == len(df), i.e. the
# resize loop must have completed for every row - confirm.
df['image_arr'] = im_array

In [None]:
#create new dataset
# The CSV is written exactly as before so existing readers keep working.
df.to_csv('BookCover_New.csv', index = False)

# Caveat: to_csv stores each image_arr cell as str(ndarray), and numpy
# abbreviates arrays with more than 1000 elements ("[255 255 255 ... 0 0 0]"),
# so the pixel values cannot be recovered from the CSV column.
# Also persist them losslessly as one (rows, values-per-image) matrix;
# row i of the matrix corresponds to row i of the CSV.
# np.stack temporarily holds a second copy of all pixel data in memory.
if len(df.index) > 0:
  np.save('BookCover_New_images.npy', np.stack(df['image_arr'].tolist()))