In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os

In [2]:
# Reading out all the files to be examined ----------------------------------------------------------------
Ratings = pd.read_csv('users-books-dataset/BX-Book-Ratings.csv', sep=';',encoding='latin-1')
Users = pd.read_csv('users-books-dataset/BX-Users.csv', sep=';',encoding='latin-1')
Books = pd.read_csv('users-books-dataset/BX-Books.csv', sep=';',encoding='latin-1',escapechar='\\')

In [3]:
# Combining dataframes to get the new as a whole data sets
# here we'll merge based on ISBN and User id and the type will be inner join.
Result=(Ratings.merge(Books, on=['ISBN'])).merge(Users, on=['User-ID'])
Result.set_index('ISBN', inplace=True)

In [4]:
# Checking for duplicate values for all column values matching
# if any duplicate found, then sum it.
(Result.duplicated()==True).sum()

0

In [33]:
# Same for false value that is not duplicate
# Or just confirming wheather the false count is same as the size of series

# Comment either of below statement to get results either in boolean or in integer
# ---------------------------------------------------------

# (Result.duplicated()==False).sum()
(Result.duplicated()==False).sum()==Result.count()

User-ID                 True
Book-Rating             True
Book-Title              True
Book-Author            False
Year-Of-Publication     True
Publisher              False
Image-URL-S             True
Image-URL-M             True
Image-URL-L             True
Location                True
Age                     True
dtype: bool

In [6]:
# check for inconsistency
Result['Year-Of-Publication'].max()

2050

In [7]:
# check for inconsistency
Result['Age'].max()

244.0

In [8]:
# Now we'll check for inconsistent datas 
# For that we need sample of some 10 items to interpret our assumptions
Result.sample(10)

Unnamed: 0_level_0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Location,Age
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
380784548,153662,0,Valley of the Shadow,Charlotte Hughes,1998,Avon,http://images.amazon.com/images/P/0380784548.0...,http://images.amazon.com/images/P/0380784548.0...,http://images.amazon.com/images/P/0380784548.0...,"ft. stewart, georgia, usa",44.0
679723692,127129,0,Cathedral (Vintage Contemporaries (Paperback)),Raymond Carver,1989,Vintage Books USA,http://images.amazon.com/images/P/0679723692.0...,http://images.amazon.com/images/P/0679723692.0...,http://images.amazon.com/images/P/0679723692.0...,"langley, british columbia, canada",
786887435,178950,0,The Life Strategies Self-Discovery Journal: Fi...,Phillip C. McGraw,2001,Hyperion Press,http://images.amazon.com/images/P/0786887435.0...,http://images.amazon.com/images/P/0786887435.0...,http://images.amazon.com/images/P/0786887435.0...,"orlando, florida, usa",
671456105,129074,0,"Star Trek II The Wrath of Kahn (Star Trek, No 7)",Vonda N. McIntyre (Adapter),1982,Star Trek,http://images.amazon.com/images/P/0671456105.0...,http://images.amazon.com/images/P/0671456105.0...,http://images.amazon.com/images/P/0671456105.0...,"marietta, georgia, usa",51.0
1550821733,44297,0,Prosody at the Cafe Du Coin,Jeff Bien,1996,Quarry Press,http://images.amazon.com/images/P/1550821733.0...,http://images.amazon.com/images/P/1550821733.0...,http://images.amazon.com/images/P/1550821733.0...,"christchurch, canterbury, new zealand",22.0
843943491,212965,5,Savage Heat (Savage),Cassie Edwards,1998,Leisure Books,http://images.amazon.com/images/P/0843943491.0...,http://images.amazon.com/images/P/0843943491.0...,http://images.amazon.com/images/P/0843943491.0...,"akron,, ohio, usa",43.0
802430481,208099,5,How to Get Kids to Help at Home,Elva Anson,1989,Moody Pr,http://images.amazon.com/images/P/0802430481.0...,http://images.amazon.com/images/P/0802430481.0...,http://images.amazon.com/images/P/0802430481.0...,"skamokawa, washington, usa",36.0
373512120,172030,0,Wolf In Waiting (Reader's Choice),Rebecca Flanders,2002,Silhouette,http://images.amazon.com/images/P/0373512120.0...,http://images.amazon.com/images/P/0373512120.0...,http://images.amazon.com/images/P/0373512120.0...,"rainsville, alabama, usa",30.0
2207303012,201365,0,Jusqu'Ã?Â la quatriÃ?Â¨me gÃ?Â©nÃ?Â©ration,Isaac Asimov,1986,DenoÃ?Â«l,http://images.amazon.com/images/P/2207303012.0...,http://images.amazon.com/images/P/2207303012.0...,http://images.amazon.com/images/P/2207303012.0...,"n/a, n/a, france",50.0
874060079,216683,0,Six Months to Live,McDaniel,1985,Pages Publishing Group,http://images.amazon.com/images/P/0874060079.0...,http://images.amazon.com/images/P/0874060079.0...,http://images.amazon.com/images/P/0874060079.0...,"san jose, california, usa",


In [24]:
# Limit values by providing constraints
age_Limit = [18, 80]
year = [1900, 2020]
Result.drop(Result[Result['Age'] > age_Limit[1]].index, inplace=True)
Result.drop(Result[Result['Age'] < age_Limit[0]].index, inplace=True)
Result.drop(Result[Result['Year-Of-Publication'] < year[0]].index, inplace=True)
Result.drop(Result[Result['Year-Of-Publication'] > year[1]].index, inplace=True)

In [34]:
# check value to assure its legit
# comment either of them to get results
# -------------------------------------------------------

Result['Year-Of-Publication'].min()
Result['Year-Of-Publication'].max()


1900

In [11]:
# check value to assure its legit
# comment either of them to get results
# -------------------------------------------------------

Result['Age'].max()
Result['Age'].min()

80.0

In [12]:
# Finding missing values and identifying outliers

In [26]:
(Result[:].isnull()==True).sum()

User-ID                0
Book-Rating            0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            0
Location               0
Age                    0
dtype: int64

In [21]:
Result['Age'].interpolate(inplace=True)

In [27]:
(Result[:].isnull()==True).sum()

User-ID                0
Book-Rating            0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            0
Location               0
Age                    0
dtype: int64