In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display
from sklearn.pipeline import Pipeline

import sys
sys.path.insert(1, '../')

from src import preprocessing as pp
from src import analysis

# Enable module reloading
%load_ext autoreload
%autoreload 2

pd.set_option('display.max_columns', None)

plt.rcParams['figure.figsize'] = (10, 6)
%config InlineBackend.figure_format='retina'
plt.rcParams.update({'font.size': 15})

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
from sklearn.base import TransformerMixin

# Load datasets

In [2]:
books = pd.read_csv('../data/books.csv')
reviews = pd.read_csv('../data/reviews.csv')
# intr = pd.read_csv('../data/interactions.csv')
authors = pd.read_csv('../data/authors.csv')

# Train - val - test split


In the first step we will select only the column, which we will use in further preprocessing

In [67]:
class DropColumns(TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
        
    def fit(self, df, y=None):
        print("Drop columns: " + str(self.cols))
        return self
        
    def transform(self, df):
        return df.drop(self.cols, axis=1)

In the next step we will choose only the upper 50% of books according to number of text reviews (it means books with at least 9 text reviews.

In [15]:
books.text_reviews_count.quantile(.50)

9.0

In [10]:
books50 = books[books.text_reviews_count >= 9]

In [12]:
books50.shape

(48681, 29)

In [69]:
class SelectBooksWithNPercentile(TransformerMixin):
    def __init__(self, col_name, lower_percentile):
        self.col_name = col_name
        self.percentile = lower_percentile
        self.bound = 0
        
    def fit(self, df, y=None):
        self.bound = df[self.col_name].quantile(self.percentile)
        print("(fit) Select books with: " + self.col_name + " >= " + str(self.bound))
        return self
        
    def transform(self, df):
        return df[df[self.col_name] >= self.bound]

In [70]:
drop_cols = ['isbn', 'asin', 'kindle_asin', 'isbn13']
tags = ['favorites', 'currently-reading', 'to-read']

ppl = Pipeline([
    ('DropUnusedCols', DropColumns(drop_cols)),
    ('SelectTopNPercentileOfBooks', SelectBooksWithNPercentile('text_reviews_count', 0.5)),
    ('ExtraxtPopularShelves',pp.ExportBookShelves('popular_shelves', tags)),
])

model = ppl.fit(books)

Drop columns: ['isbn', 'asin', 'kindle_asin', 'isbn13']
(fit) Select books with: text_reviews_count >= 9.0
ExportBookShelves, tag_col: popular_shelves, tags:['favorites', 'currently-reading', 'to-read']


In [59]:
train = ppl.transform(books)

In [60]:
train.head(3)

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,similar_books,description,format,link,authors,publisher,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,favorites,currently-reading,to-read
2,698143760.0,17,['493993'],US,,"[{'count': '1799', 'name': 'fantasy'}, {'count...",,True,3.8,,"['15728807', '17182499', '15673520', '16081758...",Wanted by no one.\nHunted by everyone.\nSixtee...,ebook,https://www.goodreads.com/book/show/21401181-h...,"[{'author_id': '7314532', 'role': ''}]",Viking Children's,416.0,4.0,9780698143760.0,3.0,,2014.0,https://www.goodreads.com/book/show/21401181-h...,https://images.gr-assets.com/books/1394747643m...,21401181,33,24802827,"Half Bad (Half Life, #1)","Half Bad (Half Life, #1)",686,0,309
3,,9,['176160'],US,eng,"[{'count': '7173', 'name': 'to-read'}, {'count...",B0042JSOQC,True,4.35,B004IYJDXY,"['25861113', '7430195', '18765937', '6120544',...",It all comes down to this.\nVlad's running out...,,https://www.goodreads.com/book/show/10099492-t...,"[{'author_id': '293603', 'role': ''}]",,,,,,,,https://www.goodreads.com/book/show/10099492-t...,https://s.gr-assets.com/assets/nophoto/book/11...,10099492,152,10800440,Twelfth Grade Kills (The Chronicles of Vladimi...,Twelfth Grade Kills (The Chronicles of Vladimi...,186,195,7173
4,990662616.0,428,[],US,eng,"[{'count': '9481', 'name': 'to-read'}, {'count...",,False,3.71,B00MW0MTGE,"['20499652', '17934493', '13518102', '16210411...",The future world is at peace.\nElla Shepherd h...,Paperback,https://www.goodreads.com/book/show/22642971-t...,"[{'author_id': '4018722', 'role': ''}]",Scripturient Books,351.0,6.0,9780990662617.0,10.0,Special Edition,2014.0,https://www.goodreads.com/book/show/22642971-t...,https://images.gr-assets.com/books/1406979059m...,22642971,1525,42144295,The Body Electric,The Body Electric,49,67,9481


In [65]:
train.drop(['isbn', 'asin'], axis=1)

KeyError: "['asd'] not found in axis"