In [59]:
import pandas as pd
import sklearn

In [60]:
import pickle

In [61]:
import logging

In [62]:
logging.basicConfig(filename="log_file.log",
                    format='%(asctime)s %(message)s',
                    filemode='w')

In [63]:
logging.warning("Its a Warning")
logging.error("Did you try to divide by zero")

In [64]:
import pickle

In [65]:
books = pd.read_csv('books_new.csv')

In [66]:
books.head()

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher
0,Fundamentals of Wavelets,"Goswami, Jaideva",tech,signal_processing,228,Wiley
1,Data Smart,"Foreman, John",tech,data_science,235,Wiley
2,God Created the Integers,"Hawking, Stephen",tech,mathematics,197,Penguin
3,Superfreakonomics,"Dubner, Stephen",science,economics,179,HarperCollins
4,Orientalism,"Said, Edward",nonfiction,history,197,Penguin


# Analysing Data

In [67]:
#Publisher does not hold much significance when it comes to selecting books
books.drop_duplicates(subset='Title', keep="first", inplace=True)

In [68]:
books = books.reset_index(drop=True)

In [69]:
books.tail(40)

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher
170,Hafasavnuk,Deshpande P L,fiction,novel,211,
171,Urlasurla,Deshpande P L,fiction,novel,211,
172,Pointers in C,"Kanetkar, Yashwant",tech,computer_science,213,
173,"Cathedral and the Bazaar, The","Raymond, Eric",tech,computer_science,217,
174,Design with OpAmps,"Franco, Sergio",tech,computer_science,240,
175,Think Complexity,"Downey, Allen",tech,data_science,230,
176,"Devil's Advocate, The","West, Morris",fiction,novel,178,
177,Ayn Rand Answers,"Rand, Ayn",philosophy,objectivism,203,
178,Philosophy: Who Needs It,"Rand, Ayn",philosophy,objectivism,171,
179,"World's Great Thinkers, The",,science,physics,189,


In [70]:
def collapse(obj):
    if ',' in obj:
        sep=obj.split(", ")
        obj = sep[1]+" "+sep[0]
    return obj

In [71]:
# Writing titles after removing commas.
books['Title'] = books['Title'].apply(collapse)

In [72]:
# checking for null values in author
books[books['Author'].isnull()==True]

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher
80,Beyond Degrees,,philosophy,education,222,HarperCollins
83,The World's Greatest Trials,,nonfiction,history,210,
103,The World's Greatest Short Stories,,fiction,classic,217,Jaico
113,Selected Short Stories,,fiction,classic,215,Jaico
118,Karl Marx Biography,,nonfiction,autobiography,162,
129,Political Philosophers,,philosophy,politics,162,
138,Final Crisis,,fiction,comic,257,
139,The Killing Joke,,fiction,comic,283,
140,Flashpoint,,fiction,comic,265,
141,Batman Earth One,,fiction,comic,265,


In [73]:
# Filling NULL values 
books.fillna("Not available",inplace=True)

In [74]:
books[books['Author'].isnull()==True]

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher


In [75]:
#Witing author names after emoving commas
books['Author'] = books['Author'].apply(collapse)

In [121]:
books = books.set_index('Title')

In [123]:
pickle.dump(books,open('books.pkl','wb'))

In [78]:
#Separating Genre and SubGenre for further processing
books_genre = books[['Title','Genre','SubGenre']]

In [79]:
books_genre

Unnamed: 0,Title,Genre,SubGenre
0,Fundamentals of Wavelets,tech,signal_processing
1,Data Smart,tech,data_science
2,God Created the Integers,tech,mathematics
3,Superfreakonomics,science,economics
4,Orientalism,nonfiction,history
...,...,...,...
205,Structure and Randomness,science,mathematics
206,Image Processing with MATLAB,tech,signal_processing
207,Animal Farm,fiction,classic
208,The Idiot,fiction,classic


In [80]:
def convert(obj):
    if '_' in obj:
        sep=obj.split("_")
        obj = sep[0]+sep[1]
    return obj

In [81]:
# Updating subGenre to remove underscore(_)
books_genre['SubGenre'] = books_genre['SubGenre'].apply(convert)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_genre['SubGenre'] = books_genre['SubGenre'].apply(convert)


# Creating model 

In [82]:
from sklearn.feature_extraction.text import CountVectorizer

In [83]:
cv = CountVectorizer()

In [84]:
books_genre['tags'] = books_genre['Genre']+" "+books_genre['SubGenre']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_genre['tags'] = books_genre['Genre']+" "+books_genre['SubGenre']


In [85]:
books_genre[books_genre['Title']=='Oxford book of Modern Science Writing']

Unnamed: 0,Title,Genre,SubGenre,tags
183,Oxford book of Modern Science Writing,science,science,science science


In [86]:
vector = cv.fit_transform(books_genre['tags']).toarray()

In [87]:
cv.get_feature_names()



['anthology',
 'autobiography',
 'classic',
 'comic',
 'computerscience',
 'datascience',
 'economics',
 'education',
 'fiction',
 'history',
 'legal',
 'mathematics',
 'misc',
 'nonfiction',
 'novel',
 'objectivism',
 'philosophy',
 'physics',
 'poetry',
 'politics',
 'psychology',
 'science',
 'signalprocessing',
 'tech',
 'trivia']

In [88]:
cv.vocabulary_

{'tech': 23,
 'signalprocessing': 22,
 'datascience': 5,
 'mathematics': 11,
 'science': 21,
 'economics': 6,
 'nonfiction': 13,
 'history': 9,
 'psychology': 20,
 'fiction': 8,
 'classic': 2,
 'computerscience': 4,
 'novel': 14,
 'philosophy': 16,
 'autobiography': 1,
 'physics': 17,
 'objectivism': 15,
 'trivia': 24,
 'misc': 12,
 'poetry': 18,
 'education': 7,
 'anthology': 0,
 'politics': 19,
 'comic': 3,
 'legal': 10}

In [89]:
books_genre['SubGenre'].value_counts()

novel               37
history             34
classic             26
datascience         14
comic               13
mathematics         11
economics           11
autobiography       11
computerscience     10
signalprocessing     7
misc                 7
physics              7
philosophy           6
objectivism          3
psychology           3
science              2
education            2
legal                2
trivia               1
poetry               1
anthology            1
politics             1
Name: SubGenre, dtype: int64

In [90]:
books_genre['Genre'].value_counts()

fiction       76
nonfiction    58
tech          36
science       23
philosophy    17
Name: Genre, dtype: int64

In [91]:
vector

array([[0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int64)

In [92]:
from sklearn.metrics.pairwise import cosine_similarity

In [93]:
similarity = cosine_similarity(vector)

In [94]:
similarity

array([[1. , 0.5, 0.5, ..., 0. , 0. , 0. ],
       [0.5, 1. , 0.5, ..., 0. , 0. , 0. ],
       [0.5, 0.5, 1. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 1. , 1. , 1. ],
       [0. , 0. , 0. , ..., 1. , 1. , 1. ],
       [0. , 0. , 0. , ..., 1. , 1. , 1. ]])

In [128]:
def recommend(book):
    index = books_genre[books_genre['Title'] == book].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances:
        if(i[1]>0.9):
            print(books_genre.iloc[i[0]].Title)
        else:
            break

In [130]:
recommend('Oxford book of Modern Science Writing')

Oxford book of Modern Science Writing


In [97]:
def convert2(obj):
    if '_' in obj:
        sep=obj.split("_")
        obj = sep[0]+" "+sep[1]
    return obj

In [98]:
books['SubGenre'] = books['SubGenre'].apply(convert2)

In [99]:
dataGS = books.groupby(['Genre'])['SubGenre'].apply(list)

In [100]:
data_genres=[]
for genre in dataGS.keys():
    data_genres.append(genre)
    lst = []
    for i in dataGS[genre]:
        if i not in lst:
            lst.append(i)
    dataGS[genre]= lst

In [101]:
dataGS

Genre
fiction                                 [classic, novel, comic]
nonfiction    [history, psychology, autobiography, trivia, m...
philosophy    [science, objectivism, history, economics, edu...
science              [economics, mathematics, physics, science]
tech          [signal processing, data science, mathematics,...
Name: SubGenre, dtype: object

In [102]:
dataGTitle = books_genre.groupby(['Genre'])['Title'].apply(list)

In [103]:
dataGTitle = dataGTitle.to_dict()

In [104]:
dataGTitle

{'fiction': ['Slaughterhouse Five',
  'The Trial',
  'The New Machiavelli',
  'The Outsider',
  'The - Vol I Complete Sherlock Holmes',
  'The - Vol II Complete Sherlock Holmes',
  'The Pillars of the Earth',
  'A Farewell to Arms',
  'The Veteran',
  'False Impressions',
  'Jurassic Park',
  'Tales of Mystery and Imagination',
  'Asami Asami',
  'Journal of a Novel',
  'The Moon is Down',
  'The Brethren',
  'In a Free State',
  'Catch 22',
  'The Amulet of Samarkand',
  'Crime and Punishment',
  'Angels & Demons',
  'Sea of Poppies',
  'A Raisin in the Sun',
  'A Prisoner of Birth',
  'The Great Indian Novel',
  'The City of Joy',
  'The Winter of Our Discontent',
  'The Case of the Lame Canary',
  'The Hunchback of Notre Dame',
  'Burning Bright',
  'Doctor in the Nude',
  "The World's Greatest Short Stories",
  "Vol 3 Maugham's Collected Short Stories",
  'The Phantom of Manhattan',
  'Ashenden of The British Agent',
  'We the Living',
  'Selected Short Stories',
  'To Sir With Lov

In [105]:
dataGS = dataGS.to_dict()

In [106]:
dataGS

{'fiction': ['classic', 'novel', 'comic'],
 'nonfiction': ['history',
  'psychology',
  'autobiography',
  'trivia',
  'misc',
  'poetry',
  'economics',
  'anthology',
  'legal'],
 'philosophy': ['science',
  'objectivism',
  'history',
  'economics',
  'education',
  'philosophy',
  'autobiography',
  'politics',
  'psychology'],
 'science': ['economics', 'mathematics', 'physics', 'science'],
 'tech': ['signal processing',
  'data science',
  'mathematics',
  'computer science',
  'economics']}

In [107]:
fiction = pd.Series(dataGS['fiction'])

In [108]:
dataGTitle.keys()

dict_keys(['fiction', 'nonfiction', 'philosophy', 'science', 'tech'])

In [109]:
pickle.dump(dataGS,open('dataGS.pkl','wb'))

In [110]:
option = "signal processing"

In [111]:
if " " in option:
        option = option.split()
        option = option[0]+option[1]

In [112]:
option

'signalprocessing'

In [113]:
pickle.dump(dataGTitle,open('dataGTitle.pkl','wb'))

In [124]:
bf = books.loc[dataGTitle['fiction']]

In [125]:
bf

Unnamed: 0_level_0,Author,Genre,SubGenre,Height,Publisher
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Slaughterhouse Five,Kurt Vonnegut,fiction,classic,198,Random House
The Trial,Frank Kafka,fiction,classic,198,Random House
The New Machiavelli,H. G. Wells,fiction,novel,180,Penguin
The Outsider,Albert Camus,fiction,classic,198,Penguin
The - Vol I Complete Sherlock Holmes,Arthur Conan Doyle,fiction,classic,176,Random House
...,...,...,...,...,...
Girl who played with Fire,Steig Larsson,fiction,novel,179,Not available
Batman Handbook,Not available,fiction,comic,270,Not available
Animal Farm,George Orwell,fiction,classic,180,Not available
The Idiot,Fyodor Dostoevsky,fiction,classic,197,Not available


In [144]:
dataGTitle['tech']

['Fundamentals of Wavelets',
 'Data Smart',
 'God Created the Integers',
 'The Nature of Statistical Learning Theory',
 'Image Processing & Mathematical Morphology',
 'Data Scientists at Work',
 'Structure & Interpretation of Computer Programs',
 "Statistical Decision Theory'",
 'Data Mining Handbook',
 'Making Software',
 'Vol I Analysis',
 'Machine Learning for Hackers',
 'The Signal and the Noise',
 'Python for Data Analysis',
 'Introduction to Algorithms',
 'Soft Computing & Intelligent Systems',
 'Textbook of Economic Theory',
 'Econometric Analysis',
 'Learning OpenCV',
 'Data Structures Using C & C++',
 'A Modern Approach Computer Vision',
 'Principles of Communication Systems',
 'Let Us C',
 'Vol 39 No. 1 Social Choice & Welfare',
 'Pattern Classification',
 'Elements of Information Theory',
 'Power Electronics - Rashid',
 'Power Electronics - Mohan',
 'Neural Networks',
 'Statistical Learning Theory',
 'Pointers in C',
 'The Cathedral and the Bazaar',
 'Design with OpAmps',
 '

In [145]:
bf = books.loc[dataGTitle['tech']]

In [146]:
bf

Unnamed: 0_level_0,Author,Genre,SubGenre,Height,Publisher
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fundamentals of Wavelets,Jaideva Goswami,tech,signal processing,228,Wiley
Data Smart,John Foreman,tech,data science,235,Wiley
God Created the Integers,Stephen Hawking,tech,mathematics,197,Penguin
The Nature of Statistical Learning Theory,Vladimir Vapnik,tech,data science,230,Springer
Image Processing & Mathematical Morphology,Frank Shih,tech,signal processing,241,CRC
Data Scientists at Work,Sebastian Gutierrez,tech,data science,230,Apress
Structure & Interpretation of Computer Programs,Gerald Sussman,tech,computer science,240,MIT Press
Statistical Decision Theory',John Pratt,tech,data science,236,MIT Press
Data Mining Handbook,Robert Nisbet,tech,data science,242,Apress
Making Software,Andy Oram,tech,computer science,232,O'Reilly


In [142]:
bf.loc[bf['SubGenre']=='comic']

Unnamed: 0_level_0,Author,Genre,SubGenre,Height,Publisher
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Final Crisis,Not available,fiction,comic,257,Not available
The Killing Joke,Not available,fiction,comic,283,Not available
Flashpoint,Not available,fiction,comic,265,Not available
Batman Earth One,Not available,fiction,comic,265,Not available
Crisis on Infinite Earths,Not available,fiction,comic,258,Not available
Superman Earth One - 1,Not available,fiction,comic,259,Not available
Superman Earth One - 2,Not available,fiction,comic,258,Not available
Justice League: Throne of Atlantis,Not available,fiction,comic,258,Not available
Justice League: The Villain's Journey,Not available,fiction,comic,258,Not available
The Death of Superman,Not available,fiction,comic,258,Not available


In [153]:
pickle.dump(books_genre,open('books_title.pkl','wb'))

In [154]:
pickle.dump(similarity,open('similarity.pkl','wb'))