# Data Cleaning


Data Source: [CMU Book Summaries Dataset](http://www.cs.cmu.edu/~dbamman/booksummaries.html)


## Importing Necessary Libraries


In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import json
import string
import gc

## Import the Dataset


In [3]:
# The column labels are passed to the reader explicitly via `names`.
column_names = [
    "wikipedia_ID",
    "freebase_ID",
    "title",
    "author",
    "pub_date",
    "genres",
    "summary",
]

# Tab-separated text file, one record per "\n"-terminated line.
filename = "./res/booksummaries.txt"
data = pd.read_csv(filename, sep="\t", lineterminator="\n", names=column_names)
data

Unnamed: 0,wikipedia_ID,freebase_ID,title,author,pub_date,genres,summary
0,620,/m/0hhy,Animal Farm,George Orwell,1945-08-17,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,843,/m/0k36,A Clockwork Orange,Anthony Burgess,1962,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,986,/m/0ldx,The Plague,Albert Camus,1947,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,1756,/m/0sww,An Enquiry Concerning Human Understanding,David Hume,,,The argument of the Enquiry proceeds by a ser...
4,2080,/m/0wkt,A Fire Upon the Deep,Vernor Vinge,,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...
...,...,...,...,...,...,...,...
16554,36934824,/m/0m0p0hr,Under Wildwood,Colin Meloy,2012-09-25,,"Prue McKeel, having rescued her brother from ..."
16555,37054020,/m/04f1nbs,Transfer of Power,Vince Flynn,2000-06-01,"{""/m/01jfsb"": ""Thriller"", ""/m/02xlf"": ""Fiction""}",The reader first meets Rapp while he is doing...
16556,37122323,/m/0n5236t,Decoded,Jay-Z,2010-11-16,"{""/m/0xdf"": ""Autobiography""}",The book follows very rough chronological ord...
16557,37132319,/m/0n4bqb1,America Again: Re-becoming The Greatness We Ne...,Stephen Colbert,2012-10-02,,Colbert addresses topics including Wall Stree...


## Explore "Genres" Column


In [4]:
# Peek at the raw "genres" column before converting it.
data.loc[:, "genres"]

0        {"/m/016lj8": "Roman \u00e0 clef", "/m/06nbt":...
1        {"/m/06n90": "Science Fiction", "/m/0l67h": "N...
2        {"/m/02m4t": "Existentialism", "/m/02xlf": "Fi...
3                                                      NaN
4        {"/m/03lrw": "Hard science fiction", "/m/06n90...
                               ...                        
16554                                                  NaN
16555     {"/m/01jfsb": "Thriller", "/m/02xlf": "Fiction"}
16556                         {"/m/0xdf": "Autobiography"}
16557                                                  NaN
16558    {"/m/02ql9": "Epistolary novel", "/m/014dfn": ...
Name: genres, Length: 16559, dtype: object

## Convert Genres Into an Easier-to-Process String


In [5]:
# Characters to strip later: every ASCII punctuation mark except the comma
# and the hyphen, which are deliberately left out of the removal set.
punc = "".join(mark for mark in string.punctuation if mark not in ",-")

punc

'!"#$%&\'()*+./:;<=>?@[\\]^_`{|}~'

In [6]:
def genres_to_tokens(raw_genres, strip_table):
    """Turn one raw ``genres`` cell into a space-separated string of genre tokens.

    Parameters
    ----------
    raw_genres : str or missing
        JSON object text whose values are genre names; may be NaN/None or "".
    strip_table : dict
        Translation table (from ``str.maketrans``) listing characters to delete.

    Returns
    -------
    str or None
        Lower-cased genre names with the spaces inside a name replaced by "-"
        and the names separated by a single space; ``None`` when the cell is
        missing or empty.

    Raises
    ------
    json.JSONDecodeError
        If a non-empty cell is not valid JSON.
    """
    if pd.isna(raw_genres) or raw_genres == "":
        return None
    names = json.loads(raw_genres).values()
    # Join on "," first so the spaces inside a name can be hyphenated before
    # the separators themselves are turned into spaces.
    # NOTE(review): a genre name that itself contains "," is split by this step.
    hyphenated = ",".join(names).lower().replace(" ", "-")
    return hyphenated.replace(",", " ").translate(strip_table)


# Build the deletion table once rather than once per row.
strip_table = str.maketrans("", "", punc)
genre = [genres_to_tokens(raw, strip_table) for raw in data["genres"]]
genre

['roman-à-clef satire childrens-literature speculative-fiction fiction',
 'science-fiction novella speculative-fiction utopian-and-dystopian-fiction satire fiction',
 'existentialism fiction absurdist-fiction novel',
 None,
 'hard-science-fiction science-fiction speculative-fiction fantasy fiction',
 'war-novel roman-à-clef',
 'childrens-literature fantasy speculative-fiction bildungsroman fiction',
 None,
 'science-fiction speculative-fiction',
 'science-fiction speculative-fiction',
 None,
 None,
 None,
 None,
 None,
 'religious-text',
 None,
 None,
 None,
 None,
 'speculative-fiction fiction novel',
 'science-fiction speculative-fiction childrens-literature fiction',
 'satire bildungsroman picaresque-novel',
 'science-fiction speculative-fiction childrens-literature fiction',
 'gothic-fiction',
 'fiction',
 'science-fiction speculative-fiction horror invasion-literature mystery epistolary-novel fantasy fiction gothic-fiction',
 'parody childrens-literature psychological-novel satire

## Confirm Changes and Export the Dataset


In [7]:
# Overwrite the raw JSON column with the cleaned strings. The list is assigned
# directly, so `genre` stays a plain list (one entry per row, None for missing).
data["genres"] = genre

In [8]:
# Show the frame again to check the rewritten "genres" column.
data

Unnamed: 0,wikipedia_ID,freebase_ID,title,author,pub_date,genres,summary
0,620,/m/0hhy,Animal Farm,George Orwell,1945-08-17,roman-à-clef satire childrens-literature specu...,"Old Major, the old boar on the Manor Farm, ca..."
1,843,/m/0k36,A Clockwork Orange,Anthony Burgess,1962,science-fiction novella speculative-fiction ut...,"Alex, a teenager living in near-future Englan..."
2,986,/m/0ldx,The Plague,Albert Camus,1947,existentialism fiction absurdist-fiction novel,The text of The Plague is divided into five p...
3,1756,/m/0sww,An Enquiry Concerning Human Understanding,David Hume,,,The argument of the Enquiry proceeds by a ser...
4,2080,/m/0wkt,A Fire Upon the Deep,Vernor Vinge,,hard-science-fiction science-fiction speculati...,The novel posits that space around the Milky ...
...,...,...,...,...,...,...,...
16554,36934824,/m/0m0p0hr,Under Wildwood,Colin Meloy,2012-09-25,,"Prue McKeel, having rescued her brother from ..."
16555,37054020,/m/04f1nbs,Transfer of Power,Vince Flynn,2000-06-01,thriller fiction,The reader first meets Rapp while he is doing...
16556,37122323,/m/0n5236t,Decoded,Jay-Z,2010-11-16,autobiography,The book follows very rough chronological ord...
16557,37132319,/m/0n4bqb1,America Again: Re-becoming The Greatness We Ne...,Stephen Colbert,2012-10-02,,Colbert addresses topics including Wall Stree...


In [9]:
# Write the cleaned frame out as JSON using pandas' default settings.
export_path = "./res/export.json"
data.to_json(export_path)

---

Author: Soumyajit Kolay
