### Data Preprocessing

In [1]:
import os 
import pandas as pd

# Directory (relative to the notebook's working directory) that the dataset
# CSV is read from and the cleaned CSV is written to.
DATASET_DIR = 'dataset'

### 1. Loading Dataset

In [2]:
# Build the path once, then read the raw books table from it.
books_csv = os.path.join(DATASET_DIR, 'books.csv')
try:
    df = pd.read_csv(books_csv)
except FileNotFoundError:
    # Tell the reader what to fix, then re-raise so later cells do not run on a missing frame.
    print("Dataset not found. Please ensure the dataset is in the correct directory.")
    raise
else:
    print("Dataset loaded successfully.")

# Quick size check of what was loaded.
print(f"\nTotal Number of books (rows): {len(df)}")
print("-" * 40)

Dataset loaded successfully.

Total Number of books (rows): 6810
----------------------------------------


### 2. Selecting Features

In [3]:
# The text columns kept for preprocessing.
features = ['title', 'authors', 'categories', 'description']

# Work on an independent copy so edits below never touch `df`.
df_features = df.loc[:, features].copy()

# Header, a three-row preview, and a separator rule, one per line.
print("\nSelected Features:", df_features.head(3), "-" * 170, sep="\n")


Selected Features:
          title                          authors  \
0        Gilead               Marilynne Robinson   
1  Spider's Web  Charles Osborne;Agatha Christie   
2  The One Tree             Stephen R. Donaldson   

                      categories  \
0                        Fiction   
1  Detective and mystery stories   
2               American fiction   

                                         description  
0  A NOVEL THAT READERS and critics have been eag...  
1  A new 'Christie for Christmas' -- a full-lengt...  
2  Volume Two of Stephen Donaldson's acclaimed se...  
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------


### 3. Handling Missing Data 

In [4]:
# Null count per column before any cleaning.
print("\nMissing Data in Each Feature:")
print(df_features.isnull().sum())
print("-" * 50)

# Replace missing values with empty strings in every selected column at once.
df_features[features] = df_features[features].fillna('')

# Null count per column after filling (expected to be all zeros).
print("\nHandled Data:")
print(df_features.isnull().sum())
print("-" * 170)


Missing Data in Each Feature:
title            0
authors         72
categories      99
description    262
dtype: int64
--------------------------------------------------

Handled Data:
title          0
authors        0
categories     0
description    0
dtype: int64
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------


### 4. Creating a Combined Text Feature

In [5]:
def combine_text_weighted(row):
    """
    Build one lowercase text string from a book's fields, weighting fields by repetition.

    The title and authors are each written twice, the categories four times,
    and the description once, in that order. Repeating a field raises its
    term counts, which gives it more influence in a term-frequency based
    score such as TF-IDF.

    Args:
        row: Any object indexable by the keys 'title', 'authors',
            'categories' and 'description' (for example a DataFrame row).
            Each value is converted with str().

    Returns:
        The concatenated text, lowercased, with leading and trailing
        whitespace removed.
    """

    def repeated(column, times):
        # "<text> <text> ... <text> " - every copy is followed by one space
        return ' '.join([str(row[column])] * times) + ' '

    pieces = [
        repeated('title', 2),        # title counts twice
        repeated('authors', 2),      # authors count twice
        repeated('categories', 4),   # categories count four times
        str(row['description']),     # description counts once
    ]
    return ''.join(pieces).lower().strip()

# Build the weighted text for every book; axis=1 passes each row to the function.
df_features['combined_features'] = df_features.apply(combine_text_weighted, axis=1)

# Report success only after the column has actually been created, so a failure
# in the step above is never preceded by a misleading success message.
print("\nCombined Text Features Successfully.")

print("\nExample of the combined text feature (only first book): ")
print(df_features['combined_features'].iloc[0][:500] + "....")  # print first 500 characters
print("-" * 170)


Combined Text Features Successfully.

Example of the combined text feature (only first book): 
gilead gilead marilynne robinson marilynne robinson fiction fiction fiction fiction a novel that readers and critics have been eagerly anticipating for over a decade, gilead is an astonishingly imagined story of remarkable lives. john ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. it’s 1956 in gilead, iowa, towards the end of the reverend ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will neve....
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------


### 5. Saving the Cleaned Data

In [6]:
# Destination of the cleaned export, next to the source dataset.
output_filename = os.path.join(DATASET_DIR, 'books_cleaned.csv')

if 'isbn13' not in df.columns:
    # Without the identifier column nothing is written.
    print("Error: 'isbn13' column not found in the original dataset.")
else:
    # Carry the identifier over from the raw frame (aligned on the shared index).
    df_features['isbn13'] = df['isbn13']
    export_columns = ['isbn13', 'title', 'combined_features']
    df_features[export_columns].to_csv(output_filename, index=False)
    print(f"\nCleaned data saved to {output_filename}")


Cleaned data saved to dataset/books_cleaned.csv
