In [2]:
import pandas as pd
import numpy as np
import requests
import spacy
import re
from collections import Counter

In [3]:
import nltk
from nltk.stem import PorterStemmer
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk import bigrams, word_tokenize

In [29]:
books = pd.read_csv('../data/cleaned_books.zip')

In [4]:
books.head()

Unnamed: 0,Title,image,authors,categories,publisher,description
0,Dr. Seuss: American Icon,http://books.google.com/books/content?id=IjvHQ...,['Philip Nel'],['Biography & Autobiography'],A&C Black,Philip Nel takes a fascinating look into the k...
1,Wonderful Worship in Smaller Churches,http://books.google.com/books/content?id=2tsDA...,['David R. Ray'],['Religion'],,This resource includes twelve principles in un...
2,Whispers of the Wicked Saints,http://books.google.com/books/content?id=aRSIg...,['Veronica Haddon'],['Fiction'],iUniverse,Julia Thomas finds her life spinning out of co...
3,The Church of Christ: A Biblical Ecclesiology ...,http://books.google.com/books/content?id=kVqRa...,['Everett Ferguson'],['Religion'],Wm. B. Eerdmans Publishing,In The Church of Christ: A Biblical Ecclesiolo...
4,Saint Hyacinth of Poland,http://books.google.com/books/content?id=lmLqA...,['Mary Fabyan Windeatt'],['Biography & Autobiography'],Tan Books & Pub,The story for children 10 and up of St. Hyacin...


In [33]:
books['tags'] = books['description']

In [34]:
#CONVERT ALL TO ONE LINER
books['tags'] = books['tags'].replace('\n', ' ')

In [35]:
#REMOVE SPECIAL CHARACTERS, PUNCTUATIONS, CONVERT TO LOWER, SPACE REDUCTING
books["tags"] = books["tags"].str.replace("[^a-zA-Z0-9 ]", " ", regex=True)
books["tags"] = books["tags"].str.lower()
books["tags"] = books["tags"].str.replace("\s+", " ", regex=True)

In [36]:
books.head()

Unnamed: 0,Title,image,authors,categories,publisher,description,tags
0,Dr. Seuss: American Icon,http://books.google.com/books/content?id=IjvHQ...,['Philip Nel'],['Biography & Autobiography'],A&C Black,Philip Nel takes a fascinating look into the k...,philip nel takes a fascinating look into the k...
1,Wonderful Worship in Smaller Churches,http://books.google.com/books/content?id=2tsDA...,['David R. Ray'],['Religion'],,This resource includes twelve principles in un...,this resource includes twelve principles in un...
2,Whispers of the Wicked Saints,http://books.google.com/books/content?id=aRSIg...,['Veronica Haddon'],['Fiction'],iUniverse,Julia Thomas finds her life spinning out of co...,julia thomas finds her life spinning out of co...
3,The Church of Christ: A Biblical Ecclesiology ...,http://books.google.com/books/content?id=kVqRa...,['Everett Ferguson'],['Religion'],Wm. B. Eerdmans Publishing,In The Church of Christ: A Biblical Ecclesiolo...,in the church of christ a biblical ecclesiolog...
4,Saint Hyacinth of Poland,http://books.google.com/books/content?id=lmLqA...,['Mary Fabyan Windeatt'],['Biography & Autobiography'],Tan Books & Pub,The story for children 10 and up of St. Hyacin...,the story for children 10 and up of st hyacint...


In [37]:
books = books.drop(columns=['image','publisher','description'],axis =1)

In [38]:
#TOKENIZE the tags column
books['tags'] = books['tags'].apply(lambda x: x.split())

In [39]:
books.head()

Unnamed: 0,Title,authors,categories,tags
0,Dr. Seuss: American Icon,['Philip Nel'],['Biography & Autobiography'],"[philip, nel, takes, a, fascinating, look, int..."
1,Wonderful Worship in Smaller Churches,['David R. Ray'],['Religion'],"[this, resource, includes, twelve, principles,..."
2,Whispers of the Wicked Saints,['Veronica Haddon'],['Fiction'],"[julia, thomas, finds, her, life, spinning, ou..."
3,The Church of Christ: A Biblical Ecclesiology ...,['Everett Ferguson'],['Religion'],"[in, the, church, of, christ, a, biblical, ecc..."
4,Saint Hyacinth of Poland,['Mary Fabyan Windeatt'],['Biography & Autobiography'],"[the, story, for, children, 10, and, up, of, s..."


In [40]:
#REMOVE STOPWORDS
#stopwords
stopwords_list = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt").content
stopwords = list(stopwords_list.decode().splitlines()) 


In [41]:
def remove_stopwords(tokens):
    return [token for token in tokens if token not in stopwords]

# Apply the function to the 'tokens' column
books['tags'] = books['tags'].apply(remove_stopwords)

In [42]:
books.head()

Unnamed: 0,Title,authors,categories,tags
0,Dr. Seuss: American Icon,['Philip Nel'],['Biography & Autobiography'],"[philip, nel, takes, fascinating, key, aspects..."
1,Wonderful Worship in Smaller Churches,['David R. Ray'],['Religion'],"[resource, includes, principles, understanding..."
2,Whispers of the Wicked Saints,['Veronica Haddon'],['Fiction'],"[julia, thomas, finds, life, spinning, control..."
3,The Church of Christ: A Biblical Ecclesiology ...,['Everett Ferguson'],['Religion'],"[church, christ, biblical, ecclesiology, today..."
4,Saint Hyacinth of Poland,['Mary Fabyan Windeatt'],['Biography & Autobiography'],"[story, children, 10, hyacinth, dominican, pla..."


In [43]:
#REMOVE NUMBERIC TOKENS
books['tags'] = books['tags'].apply(lambda tokens: [token for token in tokens if not token.isdigit()])

In [29]:
#REMOVE ENGLISH FIRST NAMES
nlp = spacy.load("en_core_web_sm")
def remove_names(tokens):
    doc = nlp(' '.join(tokens))
    return [token.text for token in doc if token.ent_type_ != 'PERSON']

# Apply the function to the 'tags' column
books['tags_without_names'] = books['tags'].apply(remove_names)

In [6]:
books.head()

Unnamed: 0,Title,authors,categories,tags,tags_without_names
0,Dr. Seuss: American Icon,['Philip Nel'],['Biography & Autobiography'],"['philip', 'nel', 'takes', 'fascinating', 'key...","['takes', 'fascinating', 'key', 'aspects', 'se..."
1,Wonderful Worship in Smaller Churches,['David R. Ray'],['Religion'],"['resource', 'includes', 'principles', 'unders...","['resource', 'includes', 'principles', 'unders..."
2,Whispers of the Wicked Saints,['Veronica Haddon'],['Fiction'],"['julia', 'thomas', 'finds', 'life', 'spinning...","['finds', 'life', 'spinning', 'control', 'deat..."
3,The Church of Christ: A Biblical Ecclesiology ...,['Everett Ferguson'],['Religion'],"['church', 'christ', 'biblical', 'ecclesiology...","['church', 'christ', 'biblical', 'ecclesiology..."
4,Saint Hyacinth of Poland,['Mary Fabyan Windeatt'],['Biography & Autobiography'],"['story', 'children', 'hyacinth', 'dominican',...","['story', 'children', 'hyacinth', 'dominican',..."


In [7]:
books['tags'] = books['tags_without_names']

In [8]:
books = books.drop('tags_without_names', axis =1)

In [9]:
books.head()

Unnamed: 0,Title,authors,categories,tags
0,Dr. Seuss: American Icon,['Philip Nel'],['Biography & Autobiography'],"['takes', 'fascinating', 'key', 'aspects', 'se..."
1,Wonderful Worship in Smaller Churches,['David R. Ray'],['Religion'],"['resource', 'includes', 'principles', 'unders..."
2,Whispers of the Wicked Saints,['Veronica Haddon'],['Fiction'],"['finds', 'life', 'spinning', 'control', 'deat..."
3,The Church of Christ: A Biblical Ecclesiology ...,['Everett Ferguson'],['Religion'],"['church', 'christ', 'biblical', 'ecclesiology..."
4,Saint Hyacinth of Poland,['Mary Fabyan Windeatt'],['Biography & Autobiography'],"['story', 'children', 'hyacinth', 'dominican',..."


In [10]:
# Function to convert to string, handling empty strings
import ast
def tokens_string(s):
    try:
        lst = ast.literal_eval(s)
        if isinstance(lst, list) and len(lst) > 0:
            return ' '.join(lst)
        else:
            return ''
    except (SyntaxError, ValueError):
        return ''

In [11]:
books['tags'] = books['tags'].apply(lambda x: tokens_string(x))

In [12]:
books.head()

Unnamed: 0,Title,authors,categories,tags
0,Dr. Seuss: American Icon,['Philip Nel'],['Biography & Autobiography'],takes fascinating key aspects seuss career poe...
1,Wonderful Worship in Smaller Churches,['David R. Ray'],['Religion'],resource includes principles understanding sma...
2,Whispers of the Wicked Saints,['Veronica Haddon'],['Fiction'],finds life spinning control death husband turn...
3,The Church of Christ: A Biblical Ecclesiology ...,['Everett Ferguson'],['Religion'],church christ biblical ecclesiology today resp...
4,Saint Hyacinth of Poland,['Mary Fabyan Windeatt'],['Biography & Autobiography'],story children hyacinth dominican planted fait...


In [26]:
books['stemmed_tags'] = books['tags']

In [29]:
#STEMMING
# Initialize the Porter Stemmer
stemmer = PorterStemmer()
# Function to apply stemming to a single text
def stem_text(text):
    return ' '.join(stemmer.stem(word) for word in text.split())
# Create a tqdm progress bar to track progress
tqdm.pandas()
# Apply stemming to a batch of 1000 rows
start_index = 0
batch_size = 10000
while start_index < len(books):
    end_index = start_index + batch_size
    books['stemmed_tags'].iloc[start_index:end_index] = books['tags'].iloc[start_index:end_index].progress_apply(stem_text)
    start_index = end_index

100%|██████████| 10000/10000 [00:11<00:00, 835.19it/s]
100%|██████████| 10000/10000 [00:11<00:00, 856.29it/s]
100%|██████████| 10000/10000 [00:11<00:00, 866.59it/s]
100%|██████████| 10000/10000 [00:11<00:00, 869.35it/s]
100%|██████████| 10000/10000 [00:13<00:00, 751.17it/s]
100%|██████████| 10000/10000 [00:13<00:00, 751.39it/s]
100%|██████████| 10000/10000 [00:12<00:00, 832.11it/s]
100%|██████████| 10000/10000 [00:11<00:00, 838.61it/s]
100%|██████████| 10000/10000 [00:12<00:00, 798.25it/s]
100%|██████████| 10000/10000 [00:12<00:00, 807.66it/s]
100%|██████████| 10000/10000 [00:12<00:00, 801.70it/s]
100%|██████████| 10000/10000 [00:12<00:00, 780.95it/s]
100%|██████████| 10000/10000 [00:12<00:00, 796.41it/s]
100%|██████████| 10000/10000 [00:20<00:00, 495.72it/s]
100%|██████████| 3505/3505 [00:06<00:00, 573.29it/s]


In [15]:
books.head()

Unnamed: 0,Title,authors,categories,tags,stemmed_tags
0,Dr. Seuss: American Icon,['Philip Nel'],['Biography & Autobiography'],takes fascinating key aspects seuss career poe...,take fascin key aspect seuss career poetri pol...
1,Wonderful Worship in Smaller Churches,['David R. Ray'],['Religion'],resource includes principles understanding sma...,resourc includ principl understand small churc...
2,Whispers of the Wicked Saints,['Veronica Haddon'],['Fiction'],finds life spinning control death husband turn...,find life spin control death husband turn mini...
3,The Church of Christ: A Biblical Ecclesiology ...,['Everett Ferguson'],['Religion'],church christ biblical ecclesiology today resp...,church christ biblic ecclesiolog today respect...
4,Saint Hyacinth of Poland,['Mary Fabyan Windeatt'],['Biography & Autobiography'],story children hyacinth dominican planted fait...,stori children hyacinth dominican plant faith ...


In [6]:
#LEMMATIZE
# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

books['text_lemma'] = np.nan
lemma_text_list = []

batch_size = 100

num_batches = (len(books) + batch_size - 1) // batch_size

#usin tqdm to create a progress bar
for i in tqdm(range(num_batches), desc="Processing batches"):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_docs = list(nlp.pipe(books["tags"].iloc[start_idx:end_idx],n_process=-1))

    for doc in tqdm(batch_docs, desc="Lemmatizing", leave=False):
        lemma_text_list.append(" ".join(token.lemma_ for token in doc))

books["tags_lemma"] = lemma_text_list

Processing batches: 100%|██████████| 1436/1436 [36:26<00:00,  1.52s/it]


In [19]:
books.head()

Unnamed: 0,Title,authors,categories,tags,stemmed_tags,tags_lemma
0,Dr. Seuss: American Icon,['Philip Nel'],['Biography & Autobiography'],takes fascinating key aspects seuss career poe...,take fascin key aspect seuss career poetri pol...,take fascinating key aspect seuss career poetr...
1,Wonderful Worship in Smaller Churches,['David R. Ray'],['Religion'],resource includes principles understanding sma...,resourc includ principl understand small churc...,resource include principle understand small ch...
2,Whispers of the Wicked Saints,['Veronica Haddon'],['Fiction'],finds life spinning control death husband turn...,find life spin control death husband turn mini...,find life spin control death husband turn mini...
3,The Church of Christ: A Biblical Ecclesiology ...,['Everett Ferguson'],['Religion'],church christ biblical ecclesiology today resp...,church christ biblic ecclesiolog today respect...,church christ biblical ecclesiology today resp...
4,Saint Hyacinth of Poland,['Mary Fabyan Windeatt'],['Biography & Autobiography'],story children hyacinth dominican planted fait...,stori children hyacinth dominican plant faith ...,story child hyacinth dominican plant faith pol...


In [20]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143505 entries, 0 to 143504
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Title         143505 non-null  object
 1   authors       141755 non-null  object
 2   categories    137888 non-null  object
 3   tags          143505 non-null  object
 4   stemmed_tags  143390 non-null  object
 5   tags_lemma    143390 non-null  object
dtypes: object(6)
memory usage: 6.6+ MB


In [21]:
books=books[~books['tags_lemma'].isnull()]

In [22]:
books['tokenized'] = books['tags_lemma'].apply(lambda x: x.split())

In [23]:
#REMOVE RARE TOKENS

# Calculate word frequencies
all_tokens = [token for tokens_list in books['tokenized'] for token in tokens_list]
word_counts = Counter(all_tokens)

# Print rare words
threshold = 5
rare_tokens = {token for token, count in word_counts.items() if count < threshold}
print("Rare Words:", rare_tokens)

# Remove rare words (e.g., occurring less than 10 times)
common_tokens = {token for token, count in word_counts.items() if count >= threshold}

# Update the 'tokens' column with only common tokens
books['tokenized'] = books['tokenized'].apply(lambda tokens_list: [token for token in tokens_list if token in common_tokens])




In [93]:
len(rare_tokens)

117022

In [95]:
len(all_tokens)

7130770

In [24]:
#REMOVE SOME OF THE TOO FREQUENT TOKENS
#Finding the frequency of the tokens in the corpus
token_frequency = books['tokenized'].explode().value_counts()

In [25]:
token_frequency[1:100]

tokenized
life       41687
include    36712
work       32874
story      28222
year       27238
           ...  
turn        7330
process     7323
english     7308
play        7217
form        7143
Name: count, Length: 99, dtype: int64

In [26]:
#selecting the tokens (these are the words common across book descriptions, indiscriminate of the genre of the book)
unwanted_tokens = ['book','include','author','year','time','reader','edition','offer','great','day','text','cover','read','understand','feature','chapter','volume','bring','provide','publish']

In [27]:
#Removing those tokens from the corpus
books['tokenized'] = books['tokenized'].apply(lambda tokens: [token for token in tokens if token not in unwanted_tokens])

In [47]:
#Processing authors and category
books['categories'].fillna(value=' ', inplace=True)
books['authors'].fillna(value = ' ',inplace=True)

In [48]:

def collapse(L):
    L1 = ''
    for i in L:
        L1 = L1+ i.replace(" ","")
    return L1


In [49]:
books['authors'] = books['authors'].apply(lambda x: collapse(x))
books['categories'] = books['categories'].apply(lambda x: collapse(x))

In [131]:
# Function to convert to string, handling empty strings
import ast
def tokens_string(s):
    try:
        lst = ast.literal_eval(s)
        if isinstance(lst, list) and len(lst) > 0:
            return ' '.join(lst)
        else:
            return ''
    except (SyntaxError, ValueError):
        return ''

In [137]:
books['cat'] = books['categories'].apply(lambda x: tokens_string(x))
books['auth'] = books['authors'].apply(lambda x: tokens_string(x))
books['tok'] = books['tokenized'].apply(lambda tokens: ' '.join(tokens))

In [139]:
books['tags'] = books['cat'] +' ' +books['auth'] + ' ' + books['tok']

In [61]:
books['tags_tokens'] = books['tags'].apply(lambda x: x.split())

In [69]:
books.head()

Unnamed: 0,Title,authors,categories,tags,stemmed_tags,tags_lemma,tokenized,auth,cat,tok,tags_tokens
0,Dr. Seuss: American Icon,[Philip Nel],[Biography & Autobiography],Biography&Autobiography PhilipNel take fascina...,take fascin key aspect seuss career poetri pol...,take fascinating key aspect seuss career poetr...,"[take, fascinating, key, aspect, seuss, career...",PhilipNel,Biography&Autobiography,take fascinating key aspect seuss career poetr...,"[Biography&Autobiography, PhilipNel, take, fas..."
1,Wonderful Worship in Smaller Churches,[David R. Ray],[Religion],Religion DavidR.Ray resource principle small c...,resourc includ principl understand small churc...,resource include principle understand small ch...,"[resource, principle, small, church, worship, ...",DavidR.Ray,Religion,resource principle small church worship practi...,"[Religion, DavidR.Ray, resource, principle, sm..."
2,Whispers of the Wicked Saints,[Veronica Haddon],[Fiction],Fiction VeronicaHaddon find life spin control ...,find life spin control death husband turn mini...,find life spin control death husband turn mini...,"[find, life, spin, control, death, husband, tu...",VeronicaHaddon,Fiction,find life spin control death husband turn mini...,"[Fiction, VeronicaHaddon, find, life, spin, co..."
3,The Church of Christ: A Biblical Ecclesiology ...,[Everett Ferguson],[Religion],Religion EverettFerguson church christ biblica...,church christ biblic ecclesiolog today respect...,church christ biblical ecclesiology today resp...,"[church, christ, biblical, ecclesiology, today...",EverettFerguson,Religion,church christ biblical ecclesiology today resp...,"[Religion, EverettFerguson, church, christ, bi..."
4,Saint Hyacinth of Poland,[Mary Fabyan Windeatt],[Biography & Autobiography],Biography&Autobiography MaryFabyanWindeatt sto...,stori children hyacinth dominican plant faith ...,story child hyacinth dominican plant faith pol...,"[story, child, hyacinth, dominican, plant, fai...",MaryFabyanWindeatt,Biography&Autobiography,story child hyacinth dominican plant faith pol...,"[Biography&Autobiography, MaryFabyanWindeatt, ..."


In [63]:
features = ['Title','tags','tags_tokens']

In [64]:
new = books[features]

In [65]:
new.head()

Unnamed: 0,Title,tags,tags_tokens
0,Dr. Seuss: American Icon,Biography&Autobiography PhilipNel take fascina...,"[Biography&Autobiography, PhilipNel, take, fas..."
1,Wonderful Worship in Smaller Churches,Religion DavidR.Ray resource principle small c...,"[Religion, DavidR.Ray, resource, principle, sm..."
2,Whispers of the Wicked Saints,Fiction VeronicaHaddon find life spin control ...,"[Fiction, VeronicaHaddon, find, life, spin, co..."
3,The Church of Christ: A Biblical Ecclesiology ...,Religion EverettFerguson church christ biblica...,"[Religion, EverettFerguson, church, christ, bi..."
4,Saint Hyacinth of Poland,Biography&Autobiography MaryFabyanWindeatt sto...,"[Biography&Autobiography, MaryFabyanWindeatt, ..."


In [66]:
new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 143390 entries, 0 to 143504
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Title        143390 non-null  object
 1   tags         143390 non-null  object
 2   tags_tokens  143390 non-null  object
dtypes: object(3)
memory usage: 4.4+ MB


In [None]:
# new.to_csv('../data/lemma_preprocessed.csv',index=False)

*END*