In [6]:
import pandas as pd
import os

# Define file paths
DATASET_FOLDER = r"C:\Users\putva\OneDrive\Desktop\Book Recommader\Dataset"
RAW_DATA_FOLDER = os.path.join(DATASET_FOLDER, "raw")
PROCESSED_DATA_FOLDER = os.path.join(DATASET_FOLDER, "processed")

# Ensure processed folder exists
os.makedirs(PROCESSED_DATA_FOLDER, exist_ok=True)

# Load datasets
books = pd.read_csv(os.path.join(RAW_DATA_FOLDER, "Books.csv"))
users = pd.read_csv(os.path.join(RAW_DATA_FOLDER, "Users.csv"))
ratings = pd.read_csv(os.path.join(RAW_DATA_FOLDER, "Ratings.csv"))

# --- DATA CLEANING ---

# Handling missing values
books.dropna(subset=["Book-Title", "Book-Author"], inplace=True)
users["Age"].fillna(users["Age"].median(), inplace=True)
ratings.drop_duplicates(inplace=True)

# Convert 'Year-Of-Publication' to numeric
books["Year-Of-Publication"] = pd.to_numeric(books["Year-Of-Publication"], errors='coerce')
books.dropna(subset=["Year-Of-Publication"], inplace=True)
books["Year-Of-Publication"] = books["Year-Of-Publication"].astype(int)

# Remove outliers in Age (assuming valid ages are between 10 and 100)
users = users[(users["Age"] >= 10) & (users["Age"] <= 100)]

# Create new feature: Age of the book
books["Age-of-Book"] = 2025 - books["Year-Of-Publication"]

# Merge ratings count with books
df_ratings_count = ratings.groupby("ISBN").size().reset_index(name="Num-Ratings")
books = books.merge(df_ratings_count, on="ISBN", how="left")

# --- SAVE CLEANED DATA ---
books.to_csv(os.path.join(PROCESSED_DATA_FOLDER, "books_cleaned.csv"), index=False)
users.to_csv(os.path.join(PROCESSED_DATA_FOLDER, "users_cleaned.csv"), index=False)
ratings.to_csv(os.path.join(PROCESSED_DATA_FOLDER, "ratings_cleaned.csv"), index=False)

print("Data preprocessing completed. Cleaned files are saved in Dataset/processed folder.")


  books = pd.read_csv(os.path.join(RAW_DATA_FOLDER, "Books.csv"))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  users["Age"].fillna(users["Age"].median(), inplace=True)


Data preprocessing completed. Cleaned files are saved in Dataset/processed folder.
