In [1]:
import pandas as pd
import os

cwd = os.getcwd()

# Address the input folder by path instead of os.chdir()-ing into it: with a
# chdir round trip, any failed read leaves the kernel's working directory
# changed for everything that runs afterwards.
input_dir = os.path.join(cwd, 'input csv files')


def _load_input_csv(filename):
    """Load one CSV file from the input folder into a DataFrame.

    The file is opened in a ``with`` block so the handle is released even
    when ``pd.read_csv`` raises part-way through parsing.
    """
    with open(os.path.join(input_dir, filename), 'r') as handle:
        return pd.read_csv(handle)


ratings = _load_input_csv("ratings.csv")
books = _load_input_csv("books.csv")
books2 = _load_input_csv("book_data_from_kaggle.csv")
tags = _load_input_csv("tags.csv")
book_tags = _load_input_csv("book_tags.csv")

In [2]:
# step1: count the ratings given by every user (pd.Series.count applied to the
# 'rating' column of each user_id group) and keep the 15000 users that have
# the most ratings
useronly = (
    ratings
    .groupby(by='user_id', as_index=False)
    .agg({'rating': pd.Series.count})
    .sort_values('rating', ascending=False)
    .head(15000)
)
print('No.of different users in dataset: ' + str(len(useronly)))

No.of different users in dataset: 15000


In [3]:
# step2: keep only the ratings whose user_id belongs to the users chosen in step1
ratings = ratings.loc[ratings['user_id'].isin(useronly['user_id'])]

In [4]:
# step3: count the remaining ratings of every book and keep the 8000 books
# that have the most ratings
bookonly = (
    ratings
    .groupby(by='book_id', as_index=False)
    .agg({'rating': pd.Series.count})
    .sort_values('rating', ascending=False)
    .head(8000)
)
print('No.of different books in dataset: ' + str(len(bookonly)))

No.of different books in dataset: 8000


In [5]:
# step4: restrict books.csv and ratings.csv to the books chosen in step3.
# book_tags.csv is joined on goodreads_book_id, so book_id is merged in just
# long enough to apply the same filter (rows without a match get NaN and are
# filtered out) and is then dropped again.
books = books.loc[books['book_id'].isin(bookonly['book_id'])]
ratings = ratings.loc[ratings['book_id'].isin(bookonly['book_id'])]
book_tags = (
    book_tags
    .merge(books[['book_id', 'goodreads_book_id']], how='left', on='goodreads_book_id')
    .loc[lambda merged: merged['book_id'].isin(bookonly['book_id'])]
    .drop(['book_id'], axis=1)
)

In [6]:
# step5: some book_id's were removed, so number the remaining books 1..N in a
# new 'newbookid' column (row position after the index reset, plus one)
books = books.reset_index(drop=True).assign(newbookid=lambda frame: frame.index + 1)

In [7]:
# step6: copy each book's newbookid onto its ratings (left join on book_id)
ratings = pd.merge(ratings, books[['book_id', 'newbookid']], on='book_id', how='left')

In [8]:
# step7: some user_id's were removed, so give every remaining user a dense
# 1-based id in a new 'newuser_id' column. ngroup() numbers the user_id groups
# from 0 in sorted key order (hence the +1). It is the public API for what the
# groupby's internal `.grouper.group_info[0]` exposed; that internal attribute
# is deprecated in recent pandas releases.
ratings['newuser_id'] = ratings.groupby('user_id').ngroup() + 1

In [9]:
# step8: add columns 'tag_cloud' (all tag names of a book joined with '|') and
# 'mosttagged' (the first tag of that string) using tags.csv and book_tags.csv
def _join_tag_names(names):
    """Concatenate the tag names of one book into a single '|'-separated string."""
    return '|'.join(names)


book_tags = book_tags.merge(tags[['tag_id', 'tag_name']], how='left', on='tag_id')
# every row of a book receives the same joined string ...
book_tags['tag_name'] = book_tags.groupby('goodreads_book_id')['tag_name'].transform(_join_tag_names)
# ... so dropping duplicates leaves a single row per book
book_tags = book_tags.drop(columns=['tag_id', 'count'])[['goodreads_book_id', 'tag_name']].drop_duplicates()
books = (
    books
    .merge(book_tags[['goodreads_book_id', 'tag_name']], how='left', on='goodreads_book_id')
    .rename(columns={'tag_name': 'tag_cloud'})
)
# books without any tag row get the placeholder 'Unknown'
books['tag_cloud'] = books['tag_cloud'].fillna('Unknown')
books['mosttagged'] = books['tag_cloud'].str.split('|').str[0]

In [10]:
# step9: remove the 'user_id' and 'book_id' columns from the ratings frame
# (newuser_id / newbookid were added to it in the earlier steps)
ratings = ratings.drop(columns=['user_id', 'book_id'])

In [11]:
# number of ratings per score, most frequent score first
print(ratings.rating.value_counts())

4    757468
5    659929
3    515113
2    134803
1     45375
Name: rating, dtype: int64


In [12]:
# step10: select the title, pages, desc and rating columns of
# book_data_from_kaggle.csv into a temporary df
temp = books2.loc[:, ['book_title', 'book_pages', 'book_desc', 'book_rating']]

In [13]:
# step11 (disabled - the two lines below are commented out): remove any parenthesised text from the book titles
#books['title'] = books['title'].str.replace(r"\(.*\)", "")
#books['original_title'] = books['original_title'].str.replace(r"\(.*\)", "")

In [14]:
# step12: look every book up in temp twice - first by 'original_title', then
# by 'title'. The second merge meets the columns added by the first one, so
# pandas suffixes them: _x (original_title match) and _y (title match)
kaggle_by_title = temp.drop_duplicates(['book_title'])
books = books.merge(kaggle_by_title, how='left', left_on=['original_title'], right_on=['book_title'])
books = books.merge(kaggle_by_title, how='left', left_on=['title'], right_on=['book_title'])

In [15]:
# step13: add columns (snippet, pages, first_author) and drop all columns that
# the two merges with temp added.
# The *_y values (merge on 'title') take precedence; the *_x values (merge on
# 'original_title') only fill the gaps.
books['snippet'] = books['book_desc_y'].fillna(books['book_desc_x'])
books['pages'] = books['book_pages_y'].fillna(books['book_pages_x'])
books = books.drop(['book_desc_x', 'book_desc_y', 'book_pages_y', 'book_pages_x', 'book_rating_x', 'book_rating_y', 'book_title_y', 'book_title_x'], axis=1)
# books matched by neither merge get the placeholder '0'
books['pages'] = books['pages'].fillna('0')
# First author = text before the first comma. The .str accessor passes missing
# authors through as NaN, so the 'Unknown' fallback can actually apply; calling
# .split() on each raw value would raise AttributeError on a missing author
# before fillna() is ever reached.
books['first_author'] = books['authors'].str.split(',').str[0].fillna('Unknown')

In [16]:
# write finalbooks and finalratings csv files
# The targets are addressed by path instead of os.chdir()-ing into the output
# folder, so a failed write cannot leave the kernel in another working directory.
output_dir = os.path.join(cwd, 'output csv files')
if not os.path.isdir(output_dir):
    # create the output folder on a first run instead of failing
    os.makedirs(output_dir)
books.to_csv(os.path.join(output_dir, "finalbooks.csv"), index=False)
ratings.to_csv(os.path.join(output_dir, "finalratings.csv"), index=False)

In [17]:
# preview the first 20 rows of the final books frame
books.head(n=20)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_4,ratings_5,image_url,small_image_url,newbookid,tag_cloud,mosttagged,snippet,pages,first_author
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,1,favorites|currently-reading|young-adult|fictio...,favorites,Winning will make you famous. Losing means cer...,374 pages,Suzanne Collins
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,2,to-read|favorites|fantasy|currently-reading|yo...,to-read,,0,J.K. Rowling
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,3,young-adult|fantasy|favorites|vampires|ya|fict...,young-adult,About three things I was absolutely positive.F...,498 pages,Stephenie Meyer
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,4,classics|favorites|to-read|classic|historical-...,classics,The unforgettable novel of a childhood in a sl...,324 pages,Harper Lee
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,5,classics|favorites|fiction|classic|books-i-own...,classics,Alternate Cover Edition ISBN: 0743273567 (ISBN...,180 pages,F. Scott Fitzgerald
5,6,11870085,11870085,16827462,226,525478817,9780525000000.0,John Green,2012.0,The Fault in Our Stars,...,698471,1311871,https://images.gr-assets.com/books/1360206420m...,https://images.gr-assets.com/books/1360206420s...,6,favorites|to-read|young-adult|fiction|ya|roman...,favorites,Despite the tumor-shrinking medical miracle th...,313 pages,John Green
6,7,5907,5907,1540236,969,618260307,9780618000000.0,J.R.R. Tolkien,1937.0,The Hobbit or There and Back Again,...,665635,1119718,https://images.gr-assets.com/books/1372847500m...,https://images.gr-assets.com/books/1372847500s...,7,fantasy|favorites|classics|to-read|fiction|boo...,fantasy,In a hole in the ground there lived a hobbit. ...,366 pages,J.R.R. Tolkien
7,8,5107,5107,3036731,360,316769177,9780317000000.0,J.D. Salinger,1951.0,The Catcher in the Rye,...,661516,709176,https://images.gr-assets.com/books/1398034300m...,https://images.gr-assets.com/books/1398034300s...,8,classics|favorites|fiction|to-read|classic|you...,classics,The hero-narrator of The Catcher in the Rye is...,277 pages,J.D. Salinger
8,9,960,960,3338963,311,1416524797,9781417000000.0,Dan Brown,2000.0,Angels & Demons,...,716569,680175,https://images.gr-assets.com/books/1303390735m...,https://images.gr-assets.com/books/1303390735s...,9,to-read|fiction|mystery|favorites|thriller|dan...,to-read,,0,Dan Brown
9,10,1885,1885,3060926,3455,679783261,9780680000000.0,Jane Austen,1813.0,Pride and Prejudice,...,609755,1155673,https://images.gr-assets.com/books/1320399351m...,https://images.gr-assets.com/books/1320399351s...,10,classics|favorites|fiction|romance|classic|boo...,classics,«È cosa ormai risaputa che a uno scapolo in po...,279 pages,Jane Austen


In [18]:
# preview the first 20 rows of the final ratings frame
ratings.head(n=20)

Unnamed: 0,rating,newbookid,newuser_id
0,4,70,1
1,3,264,1
2,4,388,1
3,5,18,1
4,5,27,1
5,5,21,1
6,5,2,1
7,5,23,1
8,5,24,1
9,4,964,1


In [19]:
from sklearn.model_selection import train_test_split

# split given dataset into train (80%) and test (20%) dataset; the fixed
# random_state makes the shuffle - and with it train.csv, test.csv and every
# statistic computed from them - identical on each run of the notebook
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# write train and test csv files, addressed by path so that a failed write
# cannot leave the kernel in another working directory
output_dir = os.path.join(cwd, 'output csv files')
if not os.path.isdir(output_dir):
    # create the output folder on a first run instead of failing
    os.makedirs(output_dir)
train.to_csv(os.path.join(output_dir, "train.csv"), index=False)
test.to_csv(os.path.join(output_dir, "test.csv"), index=False)

In [20]:
def _rating_counts(frame, key):
    """Count the ratings per distinct value of column `key`.

    Returns a DataFrame with one row per `key` value whose 'rating' column
    holds the number of non-null ratings in that group.
    """
    return frame.groupby(by=key, as_index=False).agg({'rating': 'count'})


def _describe_counts(counts, unit):
    """Render mean/min/max of a `_rating_counts` result as '<mean> ratings per <unit> (from min <a> to max <b>)'."""
    per_group = counts['rating']
    return (str(per_group.mean())+' ratings per '+unit
            +' (from min '+str(per_group.min())+' to max '+str(per_group.max())+')')


# total no.of ratings we have for all books
print('We have a total of '+str(len(ratings))+' ratings (score from '+str(ratings['rating'].min())+' to '+str(ratings['rating'].max())+') in our datset')

# total no.of books and users we have
print('We have a total of '+str(len(useronly))+' users in our dataset')
print('We have a total of '+str(len(bookonly))+' books in our dataset')

# average no.of ratings per a book in the dataset
ratings_count_per_book = _rating_counts(ratings, 'newbookid')
print('On an average we have '+_describe_counts(ratings_count_per_book, 'book')+' in our datset')

# average no.of ratings per a user in the dataset
ratings_count_per_user = _rating_counts(ratings, 'newuser_id')
print('On an average we have '+_describe_counts(ratings_count_per_user, 'user')+' in our dataset')

# average no.of ratings per user in train dataset
ratings_count_per_user_in_train_dataset = _rating_counts(train, 'newuser_id')
print('In train dataset on an average we have '+_describe_counts(ratings_count_per_user_in_train_dataset, 'user'))

# average no.of ratings per user in test dataset
ratings_count_per_user_in_test_dataset = _rating_counts(test, 'newuser_id')
print('In test dataset on an average we have '+_describe_counts(ratings_count_per_user_in_test_dataset, 'user'))

We have a total of 2112688 ratings (score from 1 to 5) in our datset
We have a total of 15000 users in our dataset
We have a total of 8000 books in our dataset
On an average we have 264.086 ratings per book (from min 31 to max 7280) in our datset
On an average we have 140.84586666666667 ratings per user (from min 62 to max 200) in our dataset
In train dataset on an average we have 112.67666666666666 ratings per user (from min 52 to max 166)
In test dataset on an average we have 28.1692 ratings per user (from min 8 to max 56)


In [21]:
# drop the names of the loaded and intermediate dataframes listed below
del ratings
del books, books2
del book_tags, tags
del temp