In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from files.sb_utils import save_file

## Import the CSV files: Books, Users and Ratings
#### and inspect their dtypes

In [2]:
# Folder holding the raw CSV files. The default is the original local folder;
# set the BOOKS_ARCHIVE_DIR environment variable to run the notebook elsewhere.
archive_dir = os.environ.get('BOOKS_ARCHIVE_DIR', r"C:\Users\sydne\Desktop\Springboard\BIS Cap\archive")

# Year-Of-Publication is forced to object dtype instead of being inferred
# (presumably the column is not purely numeric -- TODO confirm).
books = pd.read_csv(os.path.join(archive_dir, 'Books.csv'), dtype={'Year-Of-Publication': object})
books.dtypes

ISBN                   object
Book-Title             object
Book-Author            object
Year-Of-Publication    object
Publisher              object
Image-URL-S            object
Image-URL-M            object
Image-URL-L            object
dtype: object

In [3]:
# Folder holding the raw CSV files. The default is the original local folder;
# set the BOOKS_ARCHIVE_DIR environment variable to run the notebook elsewhere.
archive_dir = os.environ.get('BOOKS_ARCHIVE_DIR', r'C:\Users\sydne\Desktop\Springboard\BIS Cap\archive')

# Load the user table and show the dtype pandas inferred for each column.
users = pd.read_csv(os.path.join(archive_dir, 'Users.csv'))
users.dtypes

User-ID       int64
Location     object
Age         float64
dtype: object

In [4]:
# Folder holding the raw CSV files. The default is the original local folder;
# set the BOOKS_ARCHIVE_DIR environment variable to run the notebook elsewhere.
archive_dir = os.environ.get('BOOKS_ARCHIVE_DIR', r'C:\Users\sydne\Desktop\Springboard\BIS Cap\archive')

# Load the ratings table and show the dtype pandas inferred for each column.
ratings = pd.read_csv(os.path.join(archive_dir, 'Ratings.csv'))
ratings.dtypes

User-ID         int64
ISBN           object
Book-Rating     int64
dtype: object

### Get shape of Dataframes

In [5]:
# (rows, columns) of the books table.
books.shape

(271360, 8)

In [6]:
# (rows, columns) of the users table.
users.shape

(278858, 3)

In [7]:
# (rows, columns) of the ratings table.
ratings.shape

(1149780, 3)

## Drop the image-URL columns from the books DataFrame

In [8]:
# Remove the three image-URL columns.
# errors='ignore' keeps this cell safe to re-run: because `books` is reassigned,
# a second run would otherwise raise KeyError (the columns are already gone).
books = books.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], errors='ignore')

In [9]:
# Preview the first five rows of books after the image-URL columns were dropped.
books.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


## Merge users and ratings on the `User-ID` column

In [10]:
# Attach every rating to the profile of the user who gave it; both tables
# share the 'User-ID' key. Ratings are on a scale from 0 to 10.
user_ratings = users.merge(ratings, how='inner', on='User-ID')
user_ratings.head(5)

Unnamed: 0,User-ID,Location,Age,ISBN,Book-Rating
0,2,"stockton, california, usa",18.0,195153448,0
1,7,"washington, dc, usa",,34542252,0
2,8,"timmins, ontario, canada",,2005018,5
3,8,"timmins, ontario, canada",,60973129,0
4,8,"timmins, ontario, canada",,374157065,0


In [11]:
# Get the shape (rows, columns) of the merged users/ratings DataFrame
user_ratings.shape

(1149780, 5)

## Merge `user_ratings` with `books` on the `ISBN` column

In [12]:
# Inner join on the shared 'ISBN' key: ratings whose ISBN has no match in
# `books` are dropped (the recorded run goes from 1,149,780 to 1,031,136 rows).
user_rating_book = user_ratings.merge(books, how='inner', on='ISBN')

In [13]:
# List the columns of the fully merged DataFrame
user_rating_book.columns

Index(['User-ID', 'Location', 'Age', 'ISBN', 'Book-Rating', 'Book-Title',
       'Book-Author', 'Year-Of-Publication', 'Publisher'],
      dtype='object')

In [14]:
# Preview the first rows of the fully merged DataFrame with head()
user_rating_book.head()

Unnamed: 0,User-ID,Location,Age,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,2,"stockton, california, usa",18.0,195153448,0,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,8,"timmins, ontario, canada",,2005018,5,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,11400,"ottawa, ontario, canada",49.0,2005018,0,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
3,11676,"n/a, n/a, n/a",,2005018,8,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
4,41385,"sudbury, ontario, canada",,2005018,0,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada


In [15]:
# Shape (rows, columns) of the fully merged DataFrame
user_rating_book.shape

(1031136, 9)

## Check null values

In [16]:
# Per-column missing-value summary: absolute count and percentage of rows.
# The null mask is built once and reused for both statistics.
null_mask = user_rating_book.isnull()
missing = pd.concat(
    [null_mask.sum(), 100 * null_mask.mean()],
    axis=1,
    keys=['count', '%'],
)
missing.sort_values(by='count')

Unnamed: 0,count,%
User-ID,0,0.0
Location,0,0.0
ISBN,0,0.0
Book-Rating,0,0.0
Book-Title,0,0.0
Year-Of-Publication,0,0.0
Book-Author,2,0.000194
Publisher,2,0.000194
Age,277835,26.944554


#### Drop the `Age` column (about 27% of its values are missing)

In [17]:
# Remove the Age column.
# errors='ignore' keeps this cell safe to re-run: because `user_rating_book`
# is reassigned, a second run would otherwise raise KeyError (Age is already gone).
user_rating_book = user_rating_book.drop(columns=['Age'], errors='ignore')

In [18]:
# Rebuild the missing-value summary now that Age has been removed:
# absolute count and percentage of rows per column.
null_mask = user_rating_book.isnull()
missing = pd.concat(
    [null_mask.sum(), 100 * null_mask.mean()],
    axis=1,
    keys=['count', '%'],
)
missing.sort_values(by='count')

Unnamed: 0,count,%
User-ID,0,0.0
Location,0,0.0
ISBN,0,0.0
Book-Rating,0,0.0
Book-Title,0,0.0
Year-Of-Publication,0,0.0
Book-Author,2,0.000194
Publisher,2,0.000194


## Check unique values

In [19]:
# Number of distinct values in each column of the merged DataFrame.
user_rating_book.nunique()

User-ID                 92106
Location                22480
ISBN                   270151
Book-Rating                11
Book-Title             241071
Book-Author            101587
Year-Of-Publication       118
Publisher               16729
dtype: int64

## Draw a random sample of 100,000 rows

In [20]:
# Draw 100,000 rows at random from the merged table.
# random_state pins the draw so that Restart & Run All yields the same sample
# (and therefore the same saved CSV) on every run.
sampled = user_rating_book.sample(n=100000, random_state=42)
sampled

Unnamed: 0,User-ID,Location,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
850960,164533,"sitges, barcelona, spain",9500285118,0,Autobiografia / Borges,Jorge Luis Borges,1999,El Ateneo
367610,72238,"durham, maine, usa",055329170X,0,Time Bomb (Alex Delaware Novels (Paperback)),Jonathan Kellerman,1991,Bantam Books
1021176,258807,"victoria, british columbia, canada",0789464918,0,John Paul II: Chronicle of a Remarkable Life,Catherine Legrand,2000,DK Publishing Inc
170700,204864,"simi valley, california, usa",0688031188,0,The Frugal Gourmet,Jeff Smith,1984,Harpercollins
78548,208085,"roanoke, virginia, usa",0345443284,9,While I Was Gone,Sue Miller,1999,Ballantine Books
...,...,...,...,...,...,...,...,...
774631,61874,"kirtland, new mexico, usa",0671416340,9,SPELLBINDER,Harold Robbins,1982,Simon &amp; Schuster
138251,238557,"kuala lumpur, selangor, malaysia",0345351525,0,The Queen of the Damned (Vampire Chronicles (P...,Anne Rice,1993,Ballantine Books
165204,46925,"essen, nordrhein-westfalen, germany",3596122279,0,Das Superwieb,Hera Lind,2002,Distribooks Inc
925243,221124,"newburgh, indiana, usa",0374123543,0,Churchill: A Biography,Roy Jenkins,2001,Farrar Straus Giroux


## Take a brief look at the sample

In [21]:
# Count the rows for every (user, rating, book) combination in the sample.
group_keys = ['User-ID', 'Book-Rating', 'ISBN']
grouped_sample = sampled.groupby(group_keys)
sampled_count = grouped_sample['User-ID'].count()
sampled_count

User-ID  Book-Rating  ISBN      
8        0            1558746218    1
17       7            0425099148    1
44       0            0842342702    1
67       0            0345260317    1
75       0            0140067477    1
                                   ..
278843   9            0399146431    1
278849   0            0920656307    1
278851   0            0553277375    1
278852   8            0449907597    1
278854   0            0553578596    1
Name: User-ID, Length: 100000, dtype: int64

## Save the new CSV files

In [22]:
# Output folder, given relative to the working directory; the later save cell reuses it.
datapath = '../Data'
# save_file is imported from files.sb_utils. The recorded output shows it
# reporting that the file already exists and then writing it.
# NOTE(review): its exact overwrite behaviour is not visible here -- confirm.
save_file(sampled, 'sampled_df.csv', datapath)

A file already exists with this name.

Writing file.  "../Data\sampled_df.csv"


In [23]:
# Save the full merged table (Age column already dropped) as a CSV.
# Relies on `datapath` being defined by the earlier save cell.
save_file(user_rating_book, 'user_rating_book.csv', datapath)

A file already exists with this name.

Writing file.  "../Data\user_rating_book.csv"
