In [1]:
import pandas as pd
from string import punctuation
import os
import re
import numpy as np
import collections
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# Reading initial csv file
# Rows with the wrong number of ';'-separated fields are reported and skipped
# instead of aborting the whole load.
books_csv_options = dict(sep=';', encoding="latin_1", low_memory=False)
try:
    # `on_bad_lines` replaces `error_bad_lines`, which newer pandas releases
    # no longer accept; "warn" keeps the "Skipping line ..." messages.
    init_df = pd.read_csv("BX-CSV-Dump/BX-Books.csv",
                          on_bad_lines="warn", **books_csv_options)
except TypeError:
    # Older pandas rejects the unknown `on_bad_lines` keyword with a
    # TypeError; fall back to the legacy flag so the cell runs on both.
    init_df = pd.read_csv("BX-CSV-Dump/BX-Books.csv",
                          error_bad_lines=False, **books_csv_options)

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\nSkipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\nSkipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\nSkipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


In [3]:
# Cleaning text in ['Book-Author']
# Steps, in order:
#   1. drop the HTML entity "&amp;" (with or without a leading space),
#   2. drop double quotes, then trim surrounding single quotes and spaces,
#   3. turn the conjunction "and" into a comma so co-authors can later be
#      split on ',',
#   4. title-case the result.
# The conjunction is matched as a whole word (" and" followed by a word
# boundary). A plain substring replace would also hit names whose next word
# merely starts with "and" (e.g. "Hans andersen" -> "Hans,ersen").
# Literal replacements pass regex=False so they do not depend on the pandas
# version's default for `regex`.
# NOTE(review): "A &amp; B" becomes "A B" here, so such co-authors are not
# separated by the later split on ',' - confirm this is intended.
init_df['Book-Author'] = (
    init_df['Book-Author']
    .str.replace(" &amp;", "", regex=False)
    .str.replace("&amp;", " ", regex=False)
    .str.replace('"', '', regex=False)
    .str.strip("'")
    .str.strip(" ")
    .str.replace(r" and\b", ",", regex=True)
    .str.title()
)

In [4]:
# Propagating rows with books that have several authors
df = init_df.copy()
cols = df.columns

# Take the author column out of the frame and turn every comma-separated
# name into its own entry, still labelled with the index of its book row.
author_parts = df.pop('Book-Author').str.split(',', expand=True)
single_authors = author_parts.stack().str.strip()
single_authors = single_authors.reset_index(drop=True, level=1)
single_authors = single_authors.rename('Book-Author')

# Joining on the index repeats a book row once per author; afterwards the
# index is renumbered and the original column order is restored.
df = df.join(single_authors)
df = df.reset_index(drop=True)
df = df.reindex(cols, axis=1)

In [5]:
# Authors dataset generation
# One row per distinct author name, in order of first appearance. Missing
# names, empty names and names without a space (single token) are excluded.
author_names = df['Book-Author'].dropna().drop_duplicates()
author_names = author_names[author_names != '']
author_names = author_names[author_names.str.split(" ").str.len() > 1]
# Build a fresh frame (not a slice of `df`) before adding the Id column, so
# the assignment cannot trigger SettingWithCopyWarning.
authors_df = author_names.to_frame().reset_index(drop=True)
authors_df["Id"] = range(authors_df.shape[0])

# Books dataset generation
# Author names live in authors_df / authors_books_df; the image URL columns
# are dropped for testing purposes.
books_df = (
    init_df
    .drop(columns=["Book-Author", "Image-URL-S", "Image-URL-M", "Image-URL-L"])
    .set_index("ISBN")
)

# Authors-books dataset generation
# Author names that were filtered out above have no Id. They get a sentinel
# that is not a valid Id; filling with 0 would attribute those books to the
# real author whose Id is 0.
UNKNOWN_AUTHOR_ID = -1
author_id_by_name = authors_df.set_index('Book-Author')['Id']
authors_books_df = df[['ISBN', 'Book-Author']].copy()
authors_books_df["Book-Author-Id"] = (
    authors_books_df['Book-Author']
    .map(author_id_by_name)
    .fillna(UNKNOWN_AUTHOR_ID)
    .astype(int)
)

# From here on authors_df is indexed by Id and keeps 'Book-Author' as a column.
authors_df = authors_df.set_index('Id')

In [74]:
# Saving to CSV
# Each table is written together with its index.
for frame, target in ((authors_df, 'authors.csv'),
                      (books_df, 'books.csv'),
                      (authors_books_df, 'authors_books.csv')):
    frame.to_csv(target, index=True)

In [15]:
# Preview the first rows of the books table
books_df.head()

Unnamed: 0_level_0,Book-Title,Year-Of-Publication,Publisher
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
195153448,Classical Mythology,2002,Oxford University Press
2005018,Clara Callan,2001,HarperFlamingo Canada
60973129,Decision in Normandy,1991,HarperPerennial
374157065,Flu: The Story of the Great Influenza Pandemic...,1999,Farrar Straus Giroux
393045218,The Mummies of Urumchi,1999,W. W. Norton &amp; Company


In [6]:
# Reading the scraped site data
# The file is looked up below the current user's home directory.
home_dir = os.path.expanduser('~')
path = os.path.join(home_dir, 'Documents', 'trainee_projects',
                    "scrapping", 'SITE_DATA.csv')
site_df = pd.read_csv(path, sep=',')

In [7]:
# Trim quote characters wrapping the descriptions:
# single quotes first, then double quotes.
raw_desc = site_df["description"]
without_single_quotes = raw_desc.str.strip("'")
site_df["description"] = without_single_quotes.str.strip('"')

In [8]:
# Build one regular expression that matches any known author name.
authors = authors_df['Book-Author'].values.tolist()

# Longest names first: alternation takes the first alternative that matches,
# so a shorter name listed earlier would win over a longer name that starts
# with the same words.
esc_list = [re.escape(name) for name in sorted(authors, key=len, reverse=True)]

# Names must not be glued to a word character on either side. Look-arounds
# are used instead of \b because \b only matches between a word and a
# non-word character; a name ending in '.' followed by a space would
# therefore never match with \b.
pattern = r'(?<!\w)(?:' + '|'.join(esc_list) + r')(?!\w)'

In [78]:
# looking for authors' matches in book's description
# Case-insensitive search; yields one list of matched names per description.
descriptions = site_df['description']
matches = descriptions.str.findall(pattern, flags=re.IGNORECASE)

In [79]:
# Flatten the per-description match lists into a single Series holding one
# name per entry; every entry keeps the index label of its description.
joined = matches.str.join('|')
pieces = joined.str.split('|', expand=True)
flat = pieces.stack().reset_index(drop=True, level=1)
# Title-case the names and label the Series as the author column.
mat = flat.dropna(how='all').str.title().rename("Book-Author")

In [80]:
# Lookup table from author name to a running Id (0 .. n-1, in row order).
temp_df = authors_df.set_index('Book-Author')
temp_df['Id'] = np.arange(temp_df.shape[0], dtype=np.uint32)

# Translate every matched name into its Id. Names that are not in the lookup
# map to NaN and are dropped; only the Id column is kept.
author_ids = mat.map(temp_df['Id']).rename('Book-Author-Id')
mat = author_ids.dropna().to_frame()

0                    
1          John Locke
1    John Stuart Mill
1            Ayn Rand
2        Mary Shelley
Name: Book-Author, dtype: object

In [83]:
### Generating "Mentioned authors" dataset
# Attach the matched author ids to the scraped rows (one row per site entry
# and mentioned author), then drop exact duplicates, rows without a match,
# and the 'title' column.
mention_df = (
    site_df
    .join(mat)
    .reset_index(drop=True)
    .drop_duplicates()
    .dropna(subset=['Book-Author-Id'])
    .reset_index(drop=True)
    .drop('title', axis=1)
)
mention_df['Book-Author-Id'] = mention_df['Book-Author-Id'].astype(int)

# extracting two sentences for description
# '?' and '!' are treated as sentence ends like '.'. regex=False keeps them
# literal regardless of the pandas default ('?' is a regex metacharacter).
# Splitting at most twice and re-joining the first two parts keeps
# descriptions that contain a single sentence; concatenating two split
# columns would turn those into NaN.
mention_df['description'] = (
    mention_df['description']
    .str.replace('?', '.', regex=False)
    .str.replace('!', '.', regex=False)
    .str.split('.', n=2)
    .str[:2]
    .str.join('.')
)

# authors_df is indexed by Id, so the ids map straight back to the names.
mention_df['Book-Author'] = mention_df['Book-Author-Id'].map(authors_df['Book-Author'])
# saving to csv file
mention_df.to_csv("mentioned_authors.csv", index=True, header=True)

In [9]:
# Reload the saved mentioned-authors table without its two unnamed columns
# (presumably index columns written by earlier to_csv calls - verify).
mention_df = (
    pd.read_csv("mentioned_authors.csv")
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
)

In [10]:
# Display the mentioned-authors table
mention_df

Unnamed: 0,url,description,Book-Author-Id,Book-Author
0,http://books.toscrape.com/catalogue/libertaria...,Libertarianism isn't about winning elections; ...,17522,John Locke
1,http://books.toscrape.com/catalogue/libertaria...,Libertarianism isn't about winning elections; ...,7147,John Stuart Mill
2,http://books.toscrape.com/catalogue/libertaria...,Libertarianism isn't about winning elections; ...,1060,Ayn Rand
3,http://books.toscrape.com/catalogue/mesaerion-...,"Andrew Barger, award-winning author and engine...",4324,Mary Shelley
4,http://books.toscrape.com/catalogue/shakespear...,This book is an important and complete collect...,313,William Shakespeare
...,...,...,...,...
772,http://books.toscrape.com/catalogue/charlie-an...,Complete with stunning new movie art for the c...,2817,Tim Burton
773,http://books.toscrape.com/catalogue/choosing-o...,"To the dismay of religious leaders, study afte...",9498,United States
774,http://books.toscrape.com/catalogue/eat-pray-l...,"In her early thirties, Elizabeth Gilbert had e...",9686,Elizabeth Gilbert
775,http://books.toscrape.com/catalogue/emma_17/in...,"I never have been in love; it is not my way, o...",40,Jane Austen


In [11]:
# Temporary dataset to count ground truth and predicted results
# Image URLs, publication year and publisher are dropped from the books table.
unused_cols = ['Image-URL-S', 'Image-URL-M', 'Image-URL-L',
               'Year-Of-Publication', 'Publisher']
temp_books = init_df.drop(columns=unused_cols)
# temp_books.to_csv("temp_books.csv", index=True, header=True)

In [13]:
### Accuracy calculation is based on analyzing first 50 mentioned authors in SITE_DATA.csv
# Labels are read from the 'val' column of the two CSV files.
y_true = pd.read_csv("y_true.csv")["val"].tolist()
y_pred = pd.read_csv("y_pred.csv")["val"].tolist()

cm = confusion_matrix(y_true, y_pred)
acc = accuracy_score(y_true, y_pred)
print("Confusion matrix:\n{}".format(cm))
print("Accuracy: {}".format(acc))

Confusion matrix:
[[15  6]
 [ 2 27]]
Accuracy: 0.84


In [14]:
### Defining the most mentioned author
# Id that occurs most often among the mentions.
most_mentioned_author_id = mention_df['Book-Author-Id'].value_counts().idxmax()
# authors_df is indexed by Id, so the name is looked up by label. A positional
# lookup (.iloc) is only right while every Id equals its row position.
most_mentioned_author = authors_df.loc[most_mentioned_author_id, 'Book-Author']
most_mentioned_author

'New York Times'

In [30]:
### Defining missing books in books.csv
# Catalogue side: one row per (ISBN, author) with the title-cased book title.
title_by_isbn = books_df['Book-Title'].str.title()
df1 = authors_books_df.copy()
df1['Book-Title'] = df1['ISBN'].map(title_by_isbn)
df1['Book-Author-Id'] = df1['Book-Author-Id'].astype(int)

# Site side: one row per mention, with the title-cased title of its page.
title_by_url = site_df.set_index('url', drop=True)['title']
df2 = mention_df.copy()
df2['Book-Title'] = df2['url'].map(title_by_url).str.title()

# Looking for matches by Book-Author and Book-Title values
df3 = df2.merge(df1, on=['Book-Author', 'Book-Title'], how='inner')
df3 = df3.drop_duplicates('url').reset_index(drop=True)

# Extracting books from mentioned_authors that were not
# found in books.csv
mask = ~df2['url'].isin(df3['url'])
(df2[mask]
 .drop_duplicates('url')
 .drop(['Book-Author-Id', 'Book-Author'], axis=1)
 .reset_index(drop=True))

Unnamed: 0,url,description,Book-Title
0,http://books.toscrape.com/catalogue/libertaria...,Libertarianism isn't about winning elections; ...,Libertarianism For Beginners
1,http://books.toscrape.com/catalogue/mesaerion-...,"Andrew Barger, award-winning author and engine...",Mesaerion: The Best Science Fiction Stories 18...
2,http://books.toscrape.com/catalogue/the-black-...,"Praise for Aracelis Girmay:""[Girmay's] every l...",The Black Maria
3,http://books.toscrape.com/catalogue/the-boys-i...,For readers of Laura Hillenbrand's Seabiscuit ...,The Boys In The Boat: Nine Americans And Their...
4,http://books.toscrape.com/catalogue/the-coming...,"If you have a heart, if you have a soul, Karen...",The Coming Woman: A Novel Based On The Life Of...
...,...,...,...
393,http://books.toscrape.com/catalogue/blood-defe...,First in a new series from bestselling author ...,Blood Defense (Samantha Brinkman #1)
394,http://books.toscrape.com/catalogue/bridget-jo...,Meet Bridget Jones—a 30-something Singleton wh...,Bridget Jones'S Diary (Bridget Jones #1)
395,http://books.toscrape.com/catalogue/charlie-an...,Complete with stunning new movie art for the c...,Charlie And The Chocolate Factory (Charlie Buc...
396,http://books.toscrape.com/catalogue/choosing-o...,"To the dismay of religious leaders, study afte...",Choosing Our Religion: The Spiritual Lives Of ...


In [22]:
# Preview the first rows of the books table
books_df.head()

Unnamed: 0_level_0,Book-Title,Year-Of-Publication,Publisher
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
195153448,Classical Mythology,2002,Oxford University Press
2005018,Clara Callan,2001,HarperFlamingo Canada
60973129,Decision in Normandy,1991,HarperPerennial
374157065,Flu: The Story of the Great Influenza Pandemic...,1999,Farrar Straus Giroux
393045218,The Mummies of Urumchi,1999,W. W. Norton &amp; Company


In [16]:
### Looking for 10 frequent words in book titles from books.csv
# Turn every title into a list of lower-case word tokens.
# - All punctuation characters are removed in a single pass with a
#   translation table; no regex is involved, so no escaping is needed.
# - str.split() without an argument splits on any run of whitespace and
#   ignores leading/trailing whitespace. Collapsing only "  " once would
#   leave empty-string tokens behind whenever three or more spaces meet.
# - Titles that are missing (or not strings) come out as NaN and are dropped,
#   so every remaining entry is a list.
_strip_punct = str.maketrans('', '', punctuation)
titles = (
    books_df['Book-Title']
    .str.translate(_strip_punct)
    .str.lower()
    .str.split()
    .dropna()
)

titles_list = titles.tolist()

In [20]:
# Words that carry no meaning for the frequency ranking.
non_mean_words = {'the', 'a', 'an', 'of', 'for', 'in',
                  'at', 'on', 'to', 'and', 'no', 'amp',
                  'from', 'with'}

# Count every title word that is neither a stop word nor made of digits only.
word_counter = collections.Counter(
    word
    for title in titles_list
    for word in title
    if not (word in non_mean_words or word.isdigit())
)
word_dict = dict(word_counter)

# (word, count) pairs ordered from most to least frequent.
sorted_words = word_counter.most_common()

In [21]:
# Report the `word_num` most frequent title words.
word_num = 10
top_words = sorted_words[:word_num]
for word, number in top_words:
    print(f"'{word}' has {number} occurences")

'book' has 12110 occurences
'guide' has 7132 occurences
'novel' has 6639 occurences
'life' has 6120 occurences
'series' has 6055 occurences
'your' has 5766 occurences
'harlequin' has 5640 occurences
'love' has 5212 occurences
'stories' has 4947 occurences
'new' has 4649 occurences


In [40]:
### Creating book_description.csv
# Catalogue side: one row per (ISBN, author) with the title-cased book title.
title_by_isbn = books_df['Book-Title'].str.title()
df1 = authors_books_df.copy()
df1['Book-Title'] = df1['ISBN'].map(title_by_isbn)
df1['Book-Author-Id'] = df1['Book-Author-Id'].astype(int)

# Site side: one row per mention, with the title-cased title of its page.
site_by_url = site_df.set_index('url', drop=True)
df2 = mention_df.copy()
df2['Book-Title'] = df2['url'].map(site_by_url['title']).str.title()

# Looking for matches by Book-Author and Book-Title values;
# keep one row per url, then one row per ISBN.
df3 = (
    df1.merge(df2, on=['Book-Author', 'Book-Title'], how='inner')
    .drop_duplicates('url')
    .drop_duplicates('ISBN')
    .reset_index(drop=True)
)

# Mapping books' urls to books descriptions
# The description is taken from site_df rather than from the 'description'
# column that came in through mention_df; that column is dropped below.
df3['Description'] = df3['url'].map(site_by_url['description'])
df3 = df3.drop(['Book-Author-Id_x', 'Book-Author-Id_y', 'url', 'description'], axis=1)
df3.to_csv("book_description.csv", index=True, header=True)