# Book Recommender System 
## Data Processing

In [53]:
import pandas as pd
import os
import numpy as np
import yaml


## Read Data from Data folder

In [54]:
def load_config(config_path):
    """
    Load the configuration from a YAML file.

    Args:
        config_path (str): Path to the YAML configuration file.

    Returns:
        dict: Configuration parameters as a dictionary. An empty file
        yields an empty dictionary.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding.
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)
    # safe_load returns None for an empty document; normalise that to the
    # documented dict so callers can index the result safely.
    return {} if config is None else config
def read_data(file_path, config):
    """
    Load every dataset listed in the configuration into a dictionary.

    Args:
        file_path (str): Directory that contains the data files.
        config (dict): Configuration with the keys
            'filename_dictionary' (dataset name -> file name),
            'encoding' (text encoding used for CSV files) and
            'seperator' (CSV field delimiter; the key is spelled this way
            in the configuration).

    Returns:
        dict | None: Mapping of dataset name to DataFrame for every '.csv'
        and '.xlsx' file (extension matched case-insensitively). Files with
        any other extension are reported and skipped. If anything fails,
        the error is printed and None is returned.
    """
    # Tracks the file being processed so a failure is reported against the
    # actual file rather than the containing directory.
    current_path = file_path
    try:
        data_dict = {}
        for key, value in config['filename_dictionary'].items():
            lowered_name = value.lower()
            current_path = os.path.join(file_path, value)
            if lowered_name.endswith('.csv'):
                csv_options = {
                    'encoding': config['encoding'],
                    'sep': str(config['seperator']),
                }
                try:
                    # pandas >= 1.3.0 spelling for skipping malformed rows
                    data_dict[key] = pd.read_csv(
                        current_path, on_bad_lines='skip', **csv_options
                    )
                except TypeError as exc:
                    # Only fall back when the keyword itself was rejected;
                    # any other TypeError is a genuine problem and must not
                    # be masked by a retry with a different keyword.
                    if 'on_bad_lines' not in str(exc):
                        raise
                    # Older pandas versions (keyword deprecated in newer ones)
                    data_dict[key] = pd.read_csv(
                        current_path, error_bad_lines=False, **csv_options
                    )
            elif lowered_name.endswith('.xlsx'):
                data_dict[key] = pd.read_excel(current_path)
            else:
                print(f"Unsupported file format for {value}")
        return data_dict
    except Exception as e:
        print(f"Error reading file {current_path}: {e}")
        return None

In [55]:
# Locate the config file and the data folder relative to the notebook's
# working directory.
config = load_config(os.path.join(os.getcwd(), "..", "config", "config.yaml"))
data_dict = read_data(os.path.join(os.getcwd(), "..", "Data"), config)

# read_data prints the error and returns None on failure; stop here with a
# clear message instead of failing with an AttributeError in a later cell.
if data_dict is None:
    raise RuntimeError("Data could not be loaded; see the error printed above.")

  data_dict[key] = pd.read_csv(


In [56]:
# Display the names of the datasets that were loaded.
data_dict.keys()

dict_keys(['ratings', 'users', 'books'])

In [58]:
# Column renames per dataset: source column name -> snake_case name.
# The outer keys match the dataset names used in data_dict.
rename_dictionary = {
    "ratings": {"User-ID": "user_id", "ISBN": "isbn", "Book-Rating": "rating"},
    "users": {"User-ID": "user_id", "Location": "location", "Age": "age"},
    "books": {
        # bibliographic columns
        "ISBN": "isbn",
        "Book-Title": "title",
        "Book-Author": "author",
        "Year-Of-Publication": "year_of_publication",
        "Publisher": "publisher",
        # cover image links (small / medium / large)
        "Image-URL-S": "image_url_s",
        "Image-URL-M": "image_url_m",
        "Image-URL-L": "image_url_l",
    },
}


## Rename columns using the renaming dictionary

In [59]:
# Give every loaded DataFrame the column names defined in rename_dictionary.
for key, frame in data_dict.items():
    if isinstance(frame, pd.DataFrame):
        print(f"DataFrame for {key} has shape: {frame.shape}")
        # .get(key, {}) leaves a dataset without a mapping unchanged instead
        # of raising KeyError. The renamed frame is stored back under the
        # same key (no keys are added or removed, so this is safe while
        # iterating) rather than mutating the original in place.
        data_dict[key] = frame.rename(columns=rename_dictionary.get(key, {}))
    else:
        print(f"{key} is not a DataFrame, it is of type {type(frame)}")

DataFrame for ratings has shape: (1149780, 3)
DataFrame for users has shape: (278858, 3)
DataFrame for books has shape: (271360, 8)
