In [4]:
import pandas as pd
import numpy as np
import re


In [2]:
data = pd.read_csv('oau-books.csv')




In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2954 entries, 0 to 2953
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   No               2954 non-null   int64  
 1   Title            2953 non-null   object 
 2   Author           2380 non-null   object 
 3   Link             2953 non-null   object 
 4   Publisher        2926 non-null   object 
 5   Year             2920 non-null   object 
 6   Pages            1635 non-null   float64
 7   Format           2945 non-null   object 
 8   Literary Form    2945 non-null   object 
 9   Library          2839 non-null   object 
 10  Items Available  2839 non-null   float64
 11  Call Number      2839 non-null   object 
 12  Edition          793 non-null    object 
 13  ISBN             0 non-null      float64
 14  Material Type    2954 non-null   object 
 15  Image URL        2954 non-null   object 
dtypes: float64(3), int64(1), object(12)
memory usage: 369.4+ KB


In [6]:
data = pd.read_csv('oau-books.csv')
def clean_pages(p):
    if pd.isna(p):
        return np.nan
    match = re.search(r'(\d+)', str(p))
    if match:
        return int(match.group(1))
    return np.nan
data['Pages'] = data['Pages'].apply(clean_pages)

# Fill missing Author, Publisher, Year with "Unknown"
data['Author'] = data['Author'].fillna("Unknown")
data['Publisher'] = data['Publisher'].fillna("Unknown")
data['Year'] = data['Year'].fillna("Unknown")

# Droping ISBN column for all NaN
if 'ISBN' in data.columns and data['ISBN'].isnull().all():
    data.drop(columns=['ISBN'], inplace=True)

# Strip whitespace and title-case Title and Author
data['Title'] = data['Title'].str.strip().str.title()
data['Author'] = data['Author'].str.strip().str.title()

# Remove duplicates by Title + Author
data = data.drop_duplicates(subset=['Title', 'Author'])

# Save cleaned file (optional)
data.to_csv('oau-books-cleaned.csv', index=False)

print("Data cleaned! Sample rows:")
print(data.head())


Data cleaned! Sample rows:
   No                                              Title  \
0   1  International Migration In And From Africa :  ...   
1   2  Autering Nollywood :Critical Perspective On Th...   
2   3  In Person--Achebe, Awoonor, And Soyinka At The...   
3   4  Linguistics: An Introduction To Language And C...   
4   5  Africa’S Future, Africa’S Challenge :Early Chi...   

                                              Author  \
0         Edited By Adepoju, Aderanti  Hammar, Tomas   
1                                            Unknown   
2  Achebe, Chinuaawoonor, Kofi, 1935-2013Soyinka,...   
3                                   Akmajian, Adrian   
4  (Edited By) Garcia   Marito (Edited By) Pence ...   

                                                Link  \
0  https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...   
1  https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...   
2  https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...   
3  https://opac.oauife.edu.ng/cgi-bin/koha/opac-d..

In [7]:
data.head ()

Unnamed: 0,No,Title,Author,Link,Publisher,Year,Pages,Format,Literary Form,Library,Items Available,Call Number,Edition,Material Type,Image URL
0,1,International Migration In And From Africa : ...,"Edited By Adepoju, Aderanti Hammar, Tomas",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,"Population, Human Resources and Development i...",1996,198.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(1),1.0,JV8790 .In8.,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
1,2,Autering Nollywood :Critical Perspective On Th...,Unknown,https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,University Press,2014,457.0,; print,; Not fiction,Arts Library(5),5.0,"PN1998. Au8, ...",,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
2,3,"In Person--Achebe, Awoonor, And Soyinka At The...","Achebe, Chinuaawoonor, Kofi, 1935-2013Soyinka,...",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,"African Studies Program, Institute for Compara...",[1975],,; print,; Not fiction,,,,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
3,4,Linguistics: An Introduction To Language And C...,"Akmajian, Adrian",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,"PHI Learning,",2010,630.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(3),3.0,"P121 .Ak5, ...",6th ed.,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
4,5,"Africa’S Future, Africa’S Challenge :Early Chi...",(Edited By) Garcia Marito (Edited By) Pence ...,https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,The World Bank,2008,525.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(1),1.0,HQ778.7 .Af8.,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...


In [8]:
data.tail(100)

Unnamed: 0,No,Title,Author,Link,Publisher,Year,Pages,Format,Literary Form,Library,Items Available,Call Number,Edition,Material Type,Image URL
2854,2855,"Architecture, You And Me : The Diary Of A Deve...","Giedion, S",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,Harvard Unversity Press,2013,221.0,; print,; Not fiction,EDM Library(1),1.0,N7445.G36.,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
2855,2856,A History Of Spanish Painting/Chandler Rathfon...,"Post, Chandler Rathfon",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,Harvard University Press,2013,354.0,; print,; Not fiction,EDM Library(1),1.0,ND801.P84.,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
2856,2857,A History Of Spanish Painting : Volumevi-Part ...,"Post, Chandler Rathfon",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,Harvard University Press,2013,334.0,; print,; Not fiction,EDM Library(1),1.0,ND801.P84.,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
2857,2858,Musical Iconography :A Manaual For Cataloguing...,"Brown,Howard Mayer",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,Harvard University Press,2013,220.0,; print,; Not fiction,Arts Library(1),1.0,ML111.B78.,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
2858,2859,American Architecure And Other Writings/Montgo...,"Schuyler, Montgomery",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,Harvard University Press,2013,664.0,; print,; Not fiction,EDM Library(1),1.0,NA710.Sch1.,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2949,2950,Professional Mathematics For Polytechnics For ...,"Dass, H.K",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,CBS Publishers,2017,348.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(1),1.0,QA39.D26.,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
2950,2951,Biotechnology And Insect Pest Managementdaxton...,"Ware, Daxton",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,ED-Tech Press,2022,292.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(1),1.0,SB933.W22.,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
2951,2952,"Building Constructionxvi, 938P.: Illit Include...","Jha, Janardan",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,Khanna Publishers,2017,938.0,; print,; Not fiction,EDM Library(3),3.0,"TH145.J55, ...",,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
2952,2953,Basic Of Biotechnology In Agricultureethan Dic...,"Dickinson, Ethan",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,White Press Academic,2022,291.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(1),1.0,S494.5.D56.,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2917 entries, 0 to 2953
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   No               2917 non-null   int64  
 1   Title            2916 non-null   object 
 2   Author           2917 non-null   object 
 3   Link             2916 non-null   object 
 4   Publisher        2917 non-null   object 
 5   Year             2917 non-null   object 
 6   Pages            1613 non-null   float64
 7   Format           2908 non-null   object 
 8   Literary Form    2908 non-null   object 
 9   Library          2816 non-null   object 
 10  Items Available  2816 non-null   float64
 11  Call Number      2816 non-null   object 
 12  Edition          778 non-null    object 
 13  Material Type    2917 non-null   object 
 14  Image URL        2917 non-null   object 
dtypes: float64(2), int64(1), object(12)
memory usage: 364.6+ KB


In [12]:
data = pd.read_csv("oau-books-cleaned.csv")

In [13]:
data.head()

Unnamed: 0,No,Title,Author,Link,Publisher,Year,Pages,Format,Literary Form,Library,Items Available,Call Number,Edition,Material Type,Image URL
0,1,International Migration In And From Africa : ...,"Edited By Adepoju, Aderanti Hammar, Tomas",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,"Population, Human Resources and Development i...",1996,198.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(1),1.0,JV8790 .In8.,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
1,2,Autering Nollywood :Critical Perspective On Th...,Unknown,https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,University Press,2014,457.0,; print,; Not fiction,Arts Library(5),5.0,"PN1998. Au8, ...",,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
2,3,"In Person--Achebe, Awoonor, And Soyinka At The...","Achebe, Chinuaawoonor, Kofi, 1935-2013Soyinka,...",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,"African Studies Program, Institute for Compara...",[1975],,; print,; Not fiction,,,,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
3,4,Linguistics: An Introduction To Language And C...,"Akmajian, Adrian",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,"PHI Learning,",2010,630.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(3),3.0,"P121 .Ak5, ...",6th ed.,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
4,5,"Africa’S Future, Africa’S Challenge :Early Chi...",(Edited By) Garcia Marito (Edited By) Pence ...,https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,The World Bank,2008,525.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(1),1.0,HQ778.7 .Af8.,,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2917 entries, 0 to 2916
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   No               2917 non-null   int64  
 1   Title            2916 non-null   object 
 2   Author           2917 non-null   object 
 3   Link             2916 non-null   object 
 4   Publisher        2917 non-null   object 
 5   Year             2917 non-null   object 
 6   Pages            1613 non-null   float64
 7   Format           2908 non-null   object 
 8   Literary Form    2908 non-null   object 
 9   Library          2816 non-null   object 
 10  Items Available  2816 non-null   float64
 11  Call Number      2816 non-null   object 
 12  Edition          778 non-null    object 
 13  Material Type    2917 non-null   object 
 14  Image URL        2917 non-null   object 
dtypes: float64(2), int64(1), object(12)
memory usage: 342.0+ KB


In [17]:
# Drop rows with missing Title or Author
data = data.dropna(subset=['Title', 'Author'])


In [18]:
# Remove exact duplicate books based on Title and Author
data = data.drop_duplicates(subset=['Title', 'Author'])


In [19]:
# Fill missing categorical columns with "Unknown"
for col in ['Format', 'Literary Form', 'Library', 'Edition']:
    data[col] = data[col].fillna('Unknown')


In [21]:
data = data.reset_index(drop=True)

# Print info and head to confirm cleaning
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916 entries, 0 to 2915
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   No               2916 non-null   int64  
 1   Title            2916 non-null   object 
 2   Author           2916 non-null   object 
 3   Link             2916 non-null   object 
 4   Publisher        2916 non-null   object 
 5   Year             2916 non-null   object 
 6   Pages            1613 non-null   float64
 7   Format           2916 non-null   object 
 8   Literary Form    2916 non-null   object 
 9   Library          2916 non-null   object 
 10  Items Available  2816 non-null   float64
 11  Call Number      2816 non-null   object 
 12  Edition          2916 non-null   object 
 13  Material Type    2916 non-null   object 
 14  Image URL        2916 non-null   object 
dtypes: float64(2), int64(1), object(12)
memory usage: 341.8+ KB
None
   No                                    

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916 entries, 0 to 2915
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   No               2916 non-null   int64  
 1   Title            2916 non-null   object 
 2   Author           2916 non-null   object 
 3   Link             2916 non-null   object 
 4   Publisher        2916 non-null   object 
 5   Year             2916 non-null   object 
 6   Pages            1613 non-null   float64
 7   Format           2916 non-null   object 
 8   Literary Form    2916 non-null   object 
 9   Library          2916 non-null   object 
 10  Items Available  2816 non-null   float64
 11  Call Number      2816 non-null   object 
 12  Edition          2916 non-null   object 
 13  Material Type    2916 non-null   object 
 14  Image URL        2916 non-null   object 
dtypes: float64(2), int64(1), object(12)
memory usage: 341.8+ KB


In [23]:
print(data['Pages'].isnull().sum())  
data['Pages'] = data['Pages'].fillna(0)
print(data['Pages'].isnull().sum()) 


1303
0


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916 entries, 0 to 2915
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   No               2916 non-null   int64  
 1   Title            2916 non-null   object 
 2   Author           2916 non-null   object 
 3   Link             2916 non-null   object 
 4   Publisher        2916 non-null   object 
 5   Year             2916 non-null   object 
 6   Pages            2916 non-null   float64
 7   Format           2916 non-null   object 
 8   Literary Form    2916 non-null   object 
 9   Library          2916 non-null   object 
 10  Items Available  2816 non-null   float64
 11  Call Number      2816 non-null   object 
 12  Edition          2916 non-null   object 
 13  Material Type    2916 non-null   object 
 14  Image URL        2916 non-null   object 
dtypes: float64(2), int64(1), object(12)
memory usage: 341.8+ KB


In [25]:
data['Items Available'] = data['Items Available'].fillna(0)
data['Call Number'] = data['Call Number'].fillna('Unknown')


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916 entries, 0 to 2915
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   No               2916 non-null   int64  
 1   Title            2916 non-null   object 
 2   Author           2916 non-null   object 
 3   Link             2916 non-null   object 
 4   Publisher        2916 non-null   object 
 5   Year             2916 non-null   object 
 6   Pages            2916 non-null   float64
 7   Format           2916 non-null   object 
 8   Literary Form    2916 non-null   object 
 9   Library          2916 non-null   object 
 10  Items Available  2916 non-null   float64
 11  Call Number      2916 non-null   object 
 12  Edition          2916 non-null   object 
 13  Material Type    2916 non-null   object 
 14  Image URL        2916 non-null   object 
dtypes: float64(2), int64(1), object(12)
memory usage: 341.8+ KB


In [28]:
data.to_csv('oau_books_cleaned2.csv', index=False, encoding='utf-8')


In [29]:
data.head(20)

Unnamed: 0,No,Title,Author,Link,Publisher,Year,Pages,Format,Literary Form,Library,Items Available,Call Number,Edition,Material Type,Image URL
0,1,International Migration In And From Africa : ...,"Edited By Adepoju, Aderanti Hammar, Tomas",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,"Population, Human Resources and Development i...",1996,198.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(1),1.0,JV8790 .In8.,Unknown,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
1,2,Autering Nollywood :Critical Perspective On Th...,Unknown,https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,University Press,2014,457.0,; print,; Not fiction,Arts Library(5),5.0,"PN1998. Au8, ...",Unknown,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
2,3,"In Person--Achebe, Awoonor, And Soyinka At The...","Achebe, Chinuaawoonor, Kofi, 1935-2013Soyinka,...",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,"African Studies Program, Institute for Compara...",[1975],0.0,; print,; Not fiction,Unknown,0.0,Unknown,Unknown,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
3,4,Linguistics: An Introduction To Language And C...,"Akmajian, Adrian",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,"PHI Learning,",2010,630.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(3),3.0,"P121 .Ak5, ...",6th ed.,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
4,5,"Africa’S Future, Africa’S Challenge :Early Chi...",(Edited By) Garcia Marito (Edited By) Pence ...,https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,The World Bank,2008,525.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(1),1.0,HQ778.7 .Af8.,Unknown,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
5,6,"Shakespeare In & Out Of Africavol. Editor, Jan...","Plastow, Jane Banham, Martin",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,James Curry,2013,194.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(2),2.0,"PR3109.A35Sh53, ...",Unknown,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
6,7,The Encylopedia Americana: Complete In Thirty...,Unknown,https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,Americana Corporation,1952,806.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(45),45.0,"AE5 .En1, ...",Unknown,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
7,8,Settled Strangers Asian Business Elites In Eas...,"Oonk, Gijsbert",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,SAGE,2013,270.0,; print,; Not fiction,Education Library(1),1.0,DT16 .E17On4.,Unknown,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
8,9,New Architecture On Indigenous Landsjoy Monice...,"Malnar, Joy Monice",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,University of Minnesota Press,2013,0.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(2),2.0,"NA 2543 .A58M31, ...",Unknown,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
9,10,Achieving Your Assessor And Quality Assurance ...,"Gravells, Ann",https://opac.oauife.edu.ng/cgi-bin/koha/opac-d...,"Sage Publications,",2014,274.0,; print,; Not fiction,Hezekiah Oluwasanmi Library(2),2.0,"LC5225.A75G78, ...",2nd ed.,Text,https://opac.oauife.edu.ng/opac-tmpl/lib/famfa...
