In [199]:
import sys
import numpy as np
import pandas as pd

In [200]:
def invalidEntries(data, regex):
    """Return the entries of ``data`` whose string form does not match ``regex``.

    Every value is converted to ``str`` and run through
    ``Series.str.extract(regex, expand=False)``; an entry counts as invalid
    when nothing could be extracted for it. A one-line summary (number of
    invalid entries, how many of those are NA in ``data``, and the series
    name) is written to stderr.

    Parameters
    ----------
    data : pandas.Series
        Series to validate; its ``name`` is used in the summary line.
    regex : str
        Pattern passed to ``Series.str.extract``. The result is used directly
        as a boolean row mask, so the pattern is expected to contain a single
        capture group.

    Returns
    -------
    pandas.Series
        The subset of ``data`` (original values and index) flagged as invalid.
    """
    as_text = data.astype(str)
    captured = as_text.str.extract(regex, expand=False)
    # No capture for a row means the pattern did not match that row's text.
    rejected = data.loc[captured.isna()]
    missing_count = rejected.isna().sum()

    summary = f'{len(rejected)} invalid entries ({missing_count} NA) in {data.name}.'
    print(summary, file=sys.stderr)
    return rejected

In [201]:
# Columns removed from the frame immediately after loading.
to_drop = [
    'Edition Statement',
    'Corporate Author',
    'Corporate Contributors',
    'Former owner',
    'Engraver',
    'Contributors',
    'Issuance type',
    'Shelfmarks',
]
# Read the CSV and drop the unwanted columns in one chained expression.
df = pd.read_csv('BL-Flickr-Images-Book.csv').drop(columns=to_drop)

In [202]:
# Fail fast if 'Identifier' holds duplicate values, then make it the row index.
assert df['Identifier'].is_unique
df = df.set_index(keys='Identifier')

In [203]:
# A valid publication date consists of exactly four digits.
yearRegex = r'^(\d{4})$'
# Display ten of the entries that fail the pattern. The seed is fixed so the
# same rows are shown on every run of the notebook.
invalidEntries(df['Date of Publication'], yearRegex).sample(10, random_state=0)

1759 invalid entries (181 NA) in Date of Publication.


Identifier
595033            1870, 75
1281331            [1808?]
990977              [1894]
3583974             [1890]
1571190            [1832?]
3100705       1899[-1902?]
1663276           1843, 42
2179454        1748 [1768]
3489130    1855, [1849]-71
3505234             [1876]
Name: Date of Publication, dtype: object

In [204]:
# Keep the leading four-digit year of each entry; entries that do not start
# with four digits yield NA.
# The column is converted to str first so this cell can be re-run: the
# assignment below replaces the column with a nullable integer dtype, and the
# `.str` accessor is only available on string data.
validYears = df['Date of Publication'].astype(str).str.extract(r'^(\d{4})', expand=False)
# Convert years to numeric and cast to Int64 (which allows NA for integer)
df['Date of Publication'] = pd.to_numeric(validYears).astype(pd.Int64Dtype())
# Ensure all non-NA entries match the expected regex
assert invalidEntries(validYears, r'^(\d{4})$').dropna().empty

971 invalid entries (971 NA) in Date of Publication.


Place of Publication    object
Date of Publication      Int32
Publisher               object
Title                   object
Author                  object
Flickr URL              object
dtype: object

In [51]:
# Re-check the column: True means every entry flagged as invalid is NA.
(
    invalidEntries(df['Date of Publication'], r'^(\d{4})$')
    .dropna()
    .empty
)

True

In [118]:
# `validYears` contains NA wherever no year could be extracted, so
# `pd.to_numeric` yields float64 and `downcast='integer'` cannot be applied;
# the column would silently become float. Cast explicitly to the nullable
# Int32 dtype instead so the years stay integers and NA is preserved.
df['Date of Publication'] = pd.to_numeric(validYears).astype(pd.Int32Dtype())

In [121]:
# Preview the column as nullable 32-bit integers (display only; the result is
# not assigned back to `df`).
df['Date of Publication'].astype('Int32')

Identifier
206        1879
216        1868
218        1869
472        1851
480        1857
           ... 
4158088    1838
4158128    1831
4159563    <NA>
4159587    1834
4160339    1834
Name: Date of Publication, Length: 8287, dtype: Int32

In [161]:
# Scratch check: casting an integer array with astype(str) yields its elements as strings.
[*np.array([1, 2, 3]).astype(str)]

['1', '2', '3']