In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read the book CSV (path is relative to the current working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='dataset/BL-Flickr-Images-Book.csv')

In [3]:
# Preview the leading rows of the freshly loaded data to see which columns it has
print("Original DataFrame:", df.head(), sep="\n")

Original DataFrame:
   Identifier             Edition Statement      Place of Publication  \
0         206                           NaN                    London   
1         216                           NaN  London; Virtue & Yorston   
2         218                           NaN                    London   
3         472                           NaN                    London   
4         480  A new edition, revised, etc.                    London   

  Date of Publication              Publisher  \
0         1879 [1878]       S. Tinsley & Co.   
1                1868           Virtue & Co.   
2                1869  Bradbury, Evans & Co.   
3                1851          James Darling   
4                1857   Wertheim & Macintosh   

                                               Title     Author  \
0                  Walter Forbes. [A novel.] By A. A      A. A.   
1  All for Greed. [A novel. The dedication signed...  A., A. A.   
2  Love the Avenger. By the author of “All for Gr..

In [4]:
# Remove the columns this analysis treats as irrelevant for book information.
# The drop mutates df in place, so it raises KeyError if any listed column is absent
# (for example when this cell is executed a second time).
columns_to_drop = [
    'Edition Statement',
    'Corporate Author',
    'Corporate Contributors',
    'Former owner',
    'Engraver',
    'Contributors',
    'Issuance type',
    'Shelfmarks',
]
df.drop(labels=columns_to_drop, axis=1, inplace=True)

In [5]:
# Preview the leading rows again now that the unwanted columns are gone
print("\nDataFrame after dropping irrelevant columns:", df.head(), sep="\n")


DataFrame after dropping irrelevant columns:
   Identifier      Place of Publication Date of Publication  \
0         206                    London         1879 [1878]   
1         216  London; Virtue & Yorston                1868   
2         218                    London                1869   
3         472                    London                1851   
4         480                    London                1857   

               Publisher                                              Title  \
0       S. Tinsley & Co.                  Walter Forbes. [A novel.] By A. A   
1           Virtue & Co.  All for Greed. [A novel. The dedication signed...   
2  Bradbury, Evans & Co.  Love the Avenger. By the author of “All for Gr...   
3          James Darling  Welsh Sketches, chiefly ecclesiastical, to the...   
4   Wertheim & Macintosh  [The World in which I live, and my place in it...   

      Author                                         Flickr URL  
0      A. A.  http://www.flickr.co

In [6]:
# Promote the 'Identifier' column to the row index (df is modified in place)
df.set_index(keys='Identifier', inplace=True)

In [7]:
# Preview the leading rows with 'Identifier' now acting as the index
print("\nDataFrame with 'Identifier' as index:", df.head(), sep="\n")


DataFrame with 'Identifier' as index:
                Place of Publication Date of Publication  \
Identifier                                                 
206                           London         1879 [1878]   
216         London; Virtue & Yorston                1868   
218                           London                1869   
472                           London                1851   
480                           London                1857   

                        Publisher  \
Identifier                          
206              S. Tinsley & Co.   
216                  Virtue & Co.   
218         Bradbury, Evans & Co.   
472                 James Darling   
480          Wertheim & Macintosh   

                                                        Title     Author  \
Identifier                                                                 
206                         Walter Forbes. [A novel.] By A. A      A. A.   
216         All for Greed. [A novel. The dedication 

In [8]:
# Clean the 'Date of Publication' field using regular expressions.
# Only a leading four-digit year is kept (e.g. '1879 [1878]' -> 1879); entries that do
# not start with four digits, and missing entries, become <NA>.
# astype(str) lets the cell be re-run after the column has already been converted
# (the .str accessor would otherwise raise on a non-text column).
publication_year = df['Date of Publication'].astype(str).str.extract(r'^(\d{4})', expand=False)
# Store the year as a nullable integer instead of leaving it as a string, so it can be
# compared, sorted and aggregated numerically while still allowing missing values.
df['Date of Publication'] = pd.to_numeric(publication_year).astype('Int64')

In [9]:
# Preview the leading rows to check the cleaned 'Date of Publication' values
print("\nDataFrame with cleaned 'Date of Publication':", df.head(), sep="\n")


DataFrame with cleaned 'Date of Publication':
                Place of Publication Date of Publication  \
Identifier                                                 
206                           London                1879   
216         London; Virtue & Yorston                1868   
218                           London                1869   
472                           London                1851   
480                           London                1857   

                        Publisher  \
Identifier                          
206              S. Tinsley & Co.   
216                  Virtue & Co.   
218         Bradbury, Evans & Co.   
472                 James Darling   
480          Wertheim & Macintosh   

                                                        Title     Author  \
Identifier                                                                 
206                         Walter Forbes. [A novel.] By A. A      A. A.   
216         All for Greed. [A novel. The ded

In [10]:
# Tidy up 'Place of Publication' by combining string methods with NumPy.
# Any value mentioning 'London' collapses to 'London'; otherwise any value mentioning
# 'Oxford' collapses to 'Oxford'; everything else is left untouched.
place = df['Place of Publication']
# na=False is required: without it str.contains yields NaN for missing places, and
# np.where treats NaN as truthy, silently relabelling missing places as 'London'.
is_london = place.str.contains('London', na=False)
is_oxford = place.str.contains('Oxford', na=False)
df['Place of Publication'] = np.where(is_london, 'London',
                                      np.where(is_oxford, 'Oxford', place))

In [11]:
# Preview the leading rows of the fully cleaned data
print("\nFinal cleaned DataFrame:", df.head(), sep="\n")


Final cleaned DataFrame:
           Place of Publication Date of Publication              Publisher  \
Identifier                                                                   
206                      London                1879       S. Tinsley & Co.   
216                      London                1868           Virtue & Co.   
218                      London                1869  Bradbury, Evans & Co.   
472                      London                1851          James Darling   
480                      London                1857   Wertheim & Macintosh   

                                                        Title     Author  \
Identifier                                                                 
206                         Walter Forbes. [A novel.] By A. A      A. A.   
216         All for Greed. [A novel. The dedication signed...  A., A. A.   
218         Love the Avenger. By the author of “All for Gr...  A., A. A.   
472         Welsh Sketches, chiefly ecclesiasti

In [12]:
# Display the cleaned DataFrame as the cell's output (the rendering elides the middle rows)
df

Unnamed: 0_level_0,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
206,London,1879,S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,http://www.flickr.com/photos/britishlibrary/ta...
216,London,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
...,...,...,...,...,...,...
4158088,London,1838,,"The Parochial History of Cornwall, founded on,...","GIDDY, afterwards GILBERT, Davies.",http://www.flickr.com/photos/britishlibrary/ta...
4158128,Derby,1831,M. Mozley & Son,The History and Gazetteer of the County of Der...,"GLOVER, Stephen - of Derby",http://www.flickr.com/photos/britishlibrary/ta...
4159563,London,,T. Cadell and W. Davies,Magna Britannia; being a concise topographical...,"LYSONS, Daniel - M.A., F.R.S., and LYSONS (Sam...",http://www.flickr.com/photos/britishlibrary/ta...
4159587,Newcastle upon Tyne,1834,Mackenzie & Dent,"An historical, topographical and descriptive v...","Mackenzie, E. (Eneas)",http://www.flickr.com/photos/britishlibrary/ta...


In [1]:
import pandas as pd
import numpy as np
import re
# Load the dataset into a DataFrame (path is relative to the current working directory)
df = pd.read_csv('dataset/BL-Flickr-Images-Book.csv')
# Display the first few rows of the DataFrame to understand its structure
print("Original DataFrame:")
print(df.head())
# Drop irrelevant columns
# Assuming 'Edition Statement', 'Corporate Author', 'Corporate Contributors', 'Former owner', 'Engraver',
# 'Contributors', 'Issuance type', 'Shelfmarks' are irrelevant for book information
columns_to_drop = ['Edition Statement', 'Corporate Author', 'Corporate Contributors', 'Former owner',
                   'Engraver', 'Contributors', 'Issuance type', 'Shelfmarks']
df.drop(columns=columns_to_drop, inplace=True)
# Display the DataFrame after dropping columns
print("\nDataFrame after dropping irrelevant columns:")
print(df.head())
# Clean the 'Date of Publication' field using regular expressions.
# Only a leading four-digit year is kept (e.g. '1879 [1878]' -> 1879); entries that do
# not start with four digits, and missing entries, become <NA>.
# astype(str) makes the extraction tolerate entries that are not already text.
publication_year = df['Date of Publication'].astype(str).str.extract(r'^(\d{4})', expand=False)
# Store the year as a nullable integer instead of leaving it as a string, so it can be
# compared, sorted and aggregated numerically while still allowing missing values.
df['Date of Publication'] = pd.to_numeric(publication_year).astype('Int64')
# Display the DataFrame after cleaning 'Date of Publication'
print("\nDataFrame with cleaned 'Date of Publication':")
print(df.head())
# Tidy up 'Place of Publication' by combining string methods with NumPy.
# Any value mentioning 'London' collapses to 'London'; otherwise any value mentioning
# 'Oxford' collapses to 'Oxford'; everything else is left untouched.
place = df['Place of Publication']
# na=False is required: without it str.contains yields NaN for missing places, and
# np.where treats NaN as truthy, silently relabelling missing places as 'London'.
is_london = place.str.contains('London', na=False)
is_oxford = place.str.contains('Oxford', na=False)
df['Place of Publication'] = np.where(is_london, 'London',
                                      np.where(is_oxford, 'Oxford', place))
# Display the final cleaned DataFrame
print("\nFinal cleaned DataFrame:")
print(df.head())

Original DataFrame:
   Identifier             Edition Statement      Place of Publication  \
0         206                           NaN                    London   
1         216                           NaN  London; Virtue & Yorston   
2         218                           NaN                    London   
3         472                           NaN                    London   
4         480  A new edition, revised, etc.                    London   

  Date of Publication              Publisher  \
0         1879 [1878]       S. Tinsley & Co.   
1                1868           Virtue & Co.   
2                1869  Bradbury, Evans & Co.   
3                1851          James Darling   
4                1857   Wertheim & Macintosh   

                                               Title     Author  \
0                  Walter Forbes. [A novel.] By A. A      A. A.   
1  All for Greed. [A novel. The dedication signed...  A., A. A.   
2  Love the Avenger. By the author of “All for Gr..