In [14]:
import pandas as pd
import numpy as np
import re

file_path = 'BL-Flickr-Images-Book.csv'
df = pd.read_csv(file_path)

print("Original DataFrame:")
print(df.head())

Original DataFrame:
   Identifier             Edition Statement      Place of Publication  \
0         206                           NaN                    London   
1         216                           NaN  London; Virtue & Yorston   
2         218                           NaN                    London   
3         472                           NaN                    London   
4         480  A new edition, revised, etc.                    London   

  Date of Publication              Publisher  \
0         1879 [1878]       S. Tinsley & Co.   
1                1868           Virtue & Co.   
2                1869  Bradbury, Evans & Co.   
3                1851          James Darling   
4                1857   Wertheim & Macintosh   

                                               Title     Author  \
0                  Walter Forbes. [A novel.] By A. A      A. A.   
1  All for Greed. [A novel. The dedication signed...  A., A. A.   
2  Love the Avenger. By the author of “All for Gr..

In [15]:
irrelevant_columns = ['Edition Statement', 'Corporate Author', 'Corporate Contributors', 'Former owner', 'Engraver', 'Contributors', 'Shelfmarks']
df.drop(columns=irrelevant_columns, inplace=True)

print("\nDataFrame after dropping irrelevant columns:")
print(df.head())


DataFrame after dropping irrelevant columns:
   Identifier      Place of Publication Date of Publication  \
0         206                    London         1879 [1878]   
1         216  London; Virtue & Yorston                1868   
2         218                    London                1869   
3         472                    London                1851   
4         480                    London                1857   

               Publisher                                              Title  \
0       S. Tinsley & Co.                  Walter Forbes. [A novel.] By A. A   
1           Virtue & Co.  All for Greed. [A novel. The dedication signed...   
2  Bradbury, Evans & Co.  Love the Avenger. By the author of “All for Gr...   
3          James Darling  Welsh Sketches, chiefly ecclesiastical, to the...   
4   Wertheim & Macintosh  [The World in which I live, and my place in it...   

      Author Issuance type                                         Flickr URL  
0      A. A.   monog

In [16]:
df.set_index('Identifier', inplace=True)

print("\nDataFrame after setting new index (Identifier):")
print(df.head())


DataFrame after setting new index (Identifier):
                Place of Publication Date of Publication  \
Identifier                                                 
206                           London         1879 [1878]   
216         London; Virtue & Yorston                1868   
218                           London                1869   
472                           London                1851   
480                           London                1857   

                        Publisher  \
Identifier                          
206              S. Tinsley & Co.   
216                  Virtue & Co.   
218         Bradbury, Evans & Co.   
472                 James Darling   
480          Wertheim & Macintosh   

                                                        Title     Author  \
Identifier                                                                 
206                         Walter Forbes. [A novel.] By A. A      A. A.   
216         All for Greed. [A novel. The d

In [17]:
def clean_date(date):
    if pd.isna(date):
        return np.nan
    match = re.search(r'\d{4}', str(date))
    if match:
        return int(match.group(0))
    return np.nan

df['Date of Publication'] = df['Date of Publication'].apply(clean_date)

print("\nDataFrame after cleaning 'Date of Publication':")
print(df[['Date of Publication']].head())


DataFrame after cleaning 'Date of Publication':
            Date of Publication
Identifier                     
206                      1879.0
216                      1868.0
218                      1869.0
472                      1851.0
480                      1857.0


In [18]:
df['Place of Publication'] = np.where(df['Place of Publication'].str.contains('London'), 'London', df['Place of Publication'])
df['Place of Publication'] = np.where(df['Place of Publication'].str.contains('Oxford'), 'Oxford', df['Place of Publication'])

print("\nDataFrame after cleaning 'Place of Publication':")
print(df[['Place of Publication']].head())


DataFrame after cleaning 'Place of Publication':
           Place of Publication
Identifier                     
206                      London
216                      London
218                      London
472                      London
480                      London


In [19]:
print("\nFinal Cleaned DataFrame:")
print(df.head())


Final Cleaned DataFrame:
           Place of Publication  Date of Publication              Publisher  \
Identifier                                                                    
206                      London               1879.0       S. Tinsley & Co.   
216                      London               1868.0           Virtue & Co.   
218                      London               1869.0  Bradbury, Evans & Co.   
472                      London               1851.0          James Darling   
480                      London               1857.0   Wertheim & Macintosh   

                                                        Title     Author  \
Identifier                                                                 
206                         Walter Forbes. [A novel.] By A. A      A. A.   
216         All for Greed. [A novel. The dedication signed...  A., A. A.   
218         Love the Avenger. By the author of “All for Gr...  A., A. A.   
472         Welsh Sketches, chiefly eccl