# How I approached feature engineering

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from ast import literal_eval
from collections import Counter, defaultdict
import json

## Loading data
After collecting data with the Goodreads API, I load the JSON files (one JSON record per line) and combine them into a single pandas DataFrame.

In [2]:
def _load_json_lines(path):
    """Parse a file holding one JSON document per line into a list of objects."""
    with open(path) as handle:
        return [json.loads(record) for record in handle]


# One list of records per collected data file.
data2 = _load_json_lines('goodreads_data2')
data3 = _load_json_lines('goodreads_data3')
data4 = _load_json_lines('goodreads_data4')
data5 = _load_json_lines('goodreads_data5')
data6 = _load_json_lines('goodreads_data6')

# Stack every file into one frame, then keep only the first occurrence of
# each (author, title) pair.
all_records = data2 + data3 + data4 + data5 + data6
df = pd.DataFrame(all_records).drop_duplicates(subset=['author', 'title'], keep='first')

In [3]:
# Summarise the combined frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20670 entries, 0 to 22257
Data columns (total 19 columns):
author            20670 non-null object
average_rating    20670 non-null object
birth_date        11350 non-null object
book_id           20670 non-null object
death             5730 non-null object
description       20670 non-null object
gender            17488 non-null object
hometown          14856 non-null object
image_url         20670 non-null object
is_series         9324 non-null object
isbn              19971 non-null object
month             16659 non-null object
num_works         20670 non-null object
pages             17993 non-null object
rating_dist       20670 non-null object
tags              20670 non-null object
title             20670 non-null object
widget            20670 non-null object
year              18492 non-null object
dtypes: object(19)
memory usage: 3.2+ MB


Taking a look at df.info(), I notice that the datatypes for numerical features such as num_works are object, not int or float. Therefore, I converted the datatypes for the following columns:

In [None]:
# Cast the numeric-looking columns from object to float.
for numeric_col in ('average_rating', 'num_works', 'pages', 'month', 'year'):
    df[numeric_col] = df[numeric_col].astype(float)

In [149]:
# Rename by name rather than by position: assigning a full list to df.columns
# silently mislabels the data if the column order or count ever changes.
# Only these two names differ from the raw field names; every other column
# keeps its name.
df = df.rename(columns={'average_rating': 'avg_rating', 'death': 'death_date'})

Import list of books with cinematic adaptations and make a new column called "has_movie".

In [150]:
# Read the whole adaptation list as UTF-8 text and split it into one entry per line.
with open('booksmovies_list.txt', encoding="utf8") as movie_file:
    raw_text = movie_file.read()
allbooksmovies = raw_text.splitlines()

In [151]:
# Build a 0/1 flag per book: 1 when its title appears in the adaptation list.
# Membership is tested against a set so each lookup is O(1); scanning the
# list once per book made this step quadratic.
movie_titles = set(allbooksmovies)
lst = list(df.title.values)
# Only the text before the first '(' (whitespace-trimmed) is compared --
# presumably to drop a parenthesised series suffix; confirm against the data.
movies = [1 if book.split('(')[0].strip() in movie_titles else 0 for book in lst]

In [153]:
# Attach the 0/1 adaptation flags as a new column, then show how many books
# fall into each class.
df['has_movie'] = np.asarray(movies)
df['has_movie'].value_counts()

0    17318
1     1557
Name: has_movie, dtype: int64

Let's check how many tags there are.

In [13]:
# Each df.tags entry is a string: remove the square brackets and split on
# ', ' to obtain one list of tag names per book.
tags = [
    raw.replace('[', '').replace(']', '').split(', ')
    for raw in df.tags.values
]

In [14]:
# Flatten the per-book tag lists into one long list. A nested comprehension
# is used because `itertools` is not among the notebook's imports, so the
# itertools.chain call raised NameError on a fresh kernel.
all_tags = [tag_name for book_tags in tags for tag_name in book_tags]
counts = Counter(all_tags)  # tag name -> number of occurrences across all books

In [16]:
# Number of distinct tag names seen across all books.
len(counts)

127839

With so many unique tags, I need to focus on the most useful ones: I keep only the 300 most common tags and then one-hot encode them with pandas's get_dummies.

In [17]:
# Names of the 300 most frequent tags, held in a set for fast membership tests.
top_keys = {name for name, _ in counts.most_common(300)}
# For every book keep only the tags that belong to that top set (order preserved).
mod_tag = [[x for x in book_tags if x in top_keys] for book_tags in tags]

In [18]:
# One row per book, with its filtered tag list stored in a single 'mod_tags'
# column.
# - Built from a Series instead of np.array(mod_tag): the filtered lists have
#   different lengths, and recent NumPy versions raise ValueError when asked
#   to build an array from ragged nested lists.
# - Indexed like df: mod_tag was produced from df row by row, and df keeps
#   gaps in its index after drop_duplicates, so a default 0..n-1 index would
#   pair tag lists with the wrong books in any index-based merge.
genres = pd.Series(mod_tag, index=df.index, name='mod_tags').to_frame()

In [19]:
# Attach each book's filtered tag list. genres was built from df row by row,
# so the values are assigned by position; merging on index labels would
# mis-pair (and drop) rows because df keeps gaps in its index after
# drop_duplicates.
mod_data = df.copy()
mod_data['mod_tags'] = genres['mod_tags'].values
s = pd.Series(mod_data['mod_tags'])

# One-hot encode the tags: spread each list over columns, stack to one tag per
# row, build dummies, then add them back up per book (index level 0).
# groupby(level=0).sum() replaces .sum(level=0), which newer pandas removed.
tag_dummies = pd.get_dummies(s.apply(pd.Series).stack()).groupby(level=0).sum()
# Books without any top tag disappear in stack(); restore them as all-zero
# rows and put the rows in the same order as mod_data.
tag_dummies = tag_dummies.reindex(mod_data.index, fill_value=0)

# Give both pieces the same fresh 0..n-1 index so concat pairs rows correctly.
# reset_index() on mod_data keeps the old labels in an 'index' column.
all_books = pd.concat(
    [mod_data.reset_index(), tag_dummies.reset_index(drop=True)],
    axis=1,
)

In [20]:
# Preview the first two rows of the frame with the one-hot tag columns appended.
all_books.head(2)

Unnamed: 0,index,author,avg_rating,birth_date,book_id,death_date,description,gender,hometown,image_url,...,wish-list,women,ya,ya-books,ya-fantasy,ya-fiction,ya-lit,young-adult,young-adult-fiction,youth
0,0,Suzanne Collins,4.34,1962/08/11,2767052,,<b>Winning will make you famous. <br />Losing ...,female,"Hartford, Connecticut",https://images.gr-assets.com/books/1447303603m...,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,1,J.K. Rowling,4.45,1965/07/31,3,,Harry Potter's life is miserable. His parents ...,female,"Yate, South Gloucestershire, England",https://images.gr-assets.com/books/1474154022m...,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0


In [21]:
# %pprint toggles IPython pretty printing (here it switches it OFF) so the long
# column list is shown compactly. It is a toggle: re-running this cell flips it back.
%pprint
list(all_books.columns)

Pretty printing has been turned OFF


['index', 'author', 'avg_rating', 'birth_date', 'book_id', 'death_date', 'description', 'gender', 'hometown', 'image_url', 'is_series', 'isbn', 'month', 'num_works', 'pages', 'rating_dist', 'tags', 'title', 'widget', 'year', 'has_movie', 'mod_tags', '1', '1001', '1001-books', '1001-books-to-read-before-you-die', '19th-century', '2006', '2013-reads', '2014-reads', '2015-reads', '2016-reads', '2017-reads', '20th-century', '21st-century', '3-stars', '4-stars', '5-stars', 'abandoned', 'action', 'action-adventure', 'adult', 'adult-fiction', 'adventure', 'all-time-favorites', 'america', 'american', 'american-lit', 'american-literature', 'animals', 'audible', 'audio', 'audio-book', 'audio-books', 'audiobook', 'audiobooks', 'autobiography', 'biographies', 'biography', 'biography-memoir', 'book-club', 'book-club-books', 'book-group', 'bookclub', 'books', 'books-i-have', 'books-i-own', 'books-owned', 'bookshelf', 'borrowed', 'bought', 'british', 'british-literature', 'calibre', 'chick-lit', 'chi

As shown above, we see repetitive tags such as "ebook" and "e-books". I combined each group of equivalent tags into a single feature and added those features as columns in my dataframe.

In [22]:
# Remove identifier, free-text-location and raw tag columns before building features.
data = all_books.drop(
    ['index', 'book_id', 'author', 'birth_date', 'widget', 'isbn',
     'hometown', 'image_url', 'tags', 'mod_tags'],
    axis=1,
)

# Near-duplicate tag columns to collapse into one 0/1 feature each.
# Maps the new feature name -> the tag columns that count towards it.
TAG_GROUPS = {
    'has_audiobook': ['audible', 'audio', 'audio-book', 'audio-books',
                      'audiobook', 'audiobooks'],
    'young_adult': ['ya', 'ya-books', 'ya-fiction', 'ya-fantasy',
                    'young-adult', 'young-adult-fiction', 'teen'],
    'childrens_fiction': ['childhood', 'children', 'children-s',
                          'children-s-books', 'childrens', 'childrens-books',
                          'kids', 'kids-books', 'juvenile'],
    'has_ebook': ['e-book', 'e-books', 'ebook', 'ebooks', 'kindle',
                  'kindle-books', 'nook'],
    'science_fiction': ['sci-fi', 'sci-fi-fantasy', 'fantasy-sci-fi',
                        'fantasy-scifi', 'science-fiction',
                        'science-fiction-fantasy', 'scifi', 'scifi-fantasy'],
    'is_classic': ['classic', 'classics', 'classic-literature'],
    'fantasy_fiction': ['fantasy', 'fantasy-sci-fi', 'fantasy-scifi'],
    'mystery_fiction': ['mysteries', 'mystery', 'mystery-crime',
                        'mystery-suspense', 'mystery-thriller'],
    'historical_fiction': ['historical', 'historical-fiction'],
    'non_fiction': ['non-fiction', 'nonfiction'],
    'dystopian_future': ['dystopia', 'dystopian'],
}

for feature, tag_columns in TAG_GROUPS.items():
    # skipna=False matches chained `+`: a missing value in any member column
    # makes the total NaN, which is not > 0 and therefore maps to 0.
    total = data[tag_columns].sum(axis=1, skipna=False)
    data[feature] = (total > 0).astype(int)

# One-hot encode the gender column and append the resulting columns.
data = pd.merge(data, pd.get_dummies(data.gender), left_index=True, right_index=True)

# 1 when a death date is recorded, else 0. NaN is checked explicitly because
# bool(NaN) is True, so a plain truthiness test would flag missing values as 1.
data['death_date'] = data['death_date'].apply(lambda x: 1 if pd.notna(x) and x else 0)

# Fill missing years with the mean year, then store the column as integers.
data['year'] = data['year'].fillna(data['year'].mean()).astype(int)

In [36]:
# 1 when is_series holds a truthy value, else 0. NaN is checked explicitly
# because bool(NaN) is True, so a plain truthiness test would mark every
# missing entry as belonging to a series.
data['is_series'] = data['is_series'].apply(lambda x: 1 if pd.notna(x) and x else 0)

In [23]:
# One indicator column per distinct year value; remember their labels so the
# final feature selection can include them.
years = pd.get_dummies(data['year'])
years_columns = years.columns.tolist()
# Append the indicator columns to the feature frame, matching rows on the index.
data = data.merge(years, left_index=True, right_index=True)

In [37]:
# Final modelling columns: author/book attributes, genre flags, then the target.
# 'action' is listed once only; listing it twice produced a duplicated column.
feature_columns = [
    'year', 'death_date', 'female', 'male', 'is_series',
    'biography', 'autobiography', 'science_fiction', 'romance',
    'is_classic', 'comedy', 'coming-of-age', 'fantasy_fiction',
    'mystery_fiction', 'dystopian_future', 'historical_fiction',
    'realistic-fiction', 'drama', 'horror', 'crime', 'suspense',
    'paranormal', 'thriller', 'war', 'contemporary', 'chick-lit',
    'action', 'young_adult', 'adult', 'animals', 'childrens_fiction',
    'avg_rating', 'num_works', 'has_movie',
]
# Add the per-year indicator columns and replace any remaining missing values with 0.
cleaned_data = data[feature_columns + years_columns].fillna(0)

All of this code is in my clean_df python script.