In [7]:
import pandas as pd
import os

In [8]:
# Define file paths
RAW_DATA_PATH = "../data/raw"
CLEANED_DATA_PATH = "../data/cleaned"

# Ensure cleaned directory exists
os.makedirs(CLEANED_DATA_PATH, exist_ok=True)

# Load datasets
df_basic = pd.read_csv(f"{RAW_DATA_PATH}/Audible_Catlog.csv")
df_advanced = pd.read_csv(f"{RAW_DATA_PATH}/Audible_Catlog_Advanced_Features.csv")
df_processed = pd.read_csv("../data/processed/audible_catalog_processed.csv")

In [9]:
# Debugging: Print column names
print("Basic Dataset Columns:", df_basic.columns)
print("Advanced Dataset Columns:", df_advanced.columns)
print("Processed Dataset Columns:", df_processed.columns)


Basic Dataset Columns: Index(['Book Name', 'Author', 'Rating', 'Number of Reviews', 'Price'], dtype='object')
Advanced Dataset Columns: Index(['Book Name', 'Author', 'Rating', 'Number of Reviews', 'Price',
       'Description', 'Listening Time', 'Ranks and Genre'],
      dtype='object')
Processed Dataset Columns: Index(['Book Name', 'Author', 'Rating_x', 'Number of Reviews_x', 'Price_x',
       'Popularity Score', 'Rating_y', 'Number of Reviews_y', 'Price_y',
       'Description', 'Listening Time (mins)', 'Genres'],
      dtype='object')


In [7]:
# Standardize column names (remove spaces, lowercase)
df_basic.columns = df_basic.columns.str.strip()

In [6]:
print(df_basic.columns)


Index(['Book Name', 'Author', 'Rating', 'Number of Reviews', 'Price'], dtype='object')


In [7]:
print("Listening Time" in df_basic.columns)


False


In [8]:
# Check if 'Listening Time' exists
if "Listening Time" not in df_basic.columns:
    raise KeyError("The column 'Listening Time' is missing. Check dataset formatting!")

# Convert 'Listening Time' to minutes
def convert_listening_time(time_str):
    if pd.isna(time_str):
        return None
    time_parts = time_str.split(" ")
    hours = int(time_parts[0]) if "hour" in time_parts else 0
    minutes = int(time_parts[3]) if "minute" in time_parts else 0
    return hours * 60 + minutes

df_basic["Listening Time (mins)"] = df_basic["Listening Time"].apply(convert_listening_time)

# Save cleaned file
df_basic.to_csv(f"{CLEANED_DATA_PATH}/audible_catalog_cleaned.csv", index=False)
print("Data preprocessing completed successfully! 🚀")

KeyError: "The column 'Listening Time' is missing. Check dataset formatting!"

In [4]:
df_basic.head()

Unnamed: 0,Book Name,Author,Rating,Number of Reviews,Price
0,Think Like a Monk: The Secret of How to Harnes...,Jay Shetty,4.9,313.0,10080.0
1,Ikigai: The Japanese Secret to a Long and Happ...,Héctor García,4.6,3658.0,615.0
2,The Subtle Art of Not Giving a F*ck: A Counter...,Mark Manson,4.4,20174.0,10378.0
3,Atomic Habits: An Easy and Proven Way to Build...,James Clear,4.6,4614.0,888.0
4,Life's Amazing Secrets: How to Find Balance an...,Gaur Gopal Das,4.6,4302.0,1005.0


In [5]:
df_basic.columns

Index(['Book Name', 'Author', 'Rating', 'Number of Reviews', 'Price'], dtype='object')

In [6]:
df_advanced.columns

Index(['Book Name', 'Author', 'Rating', 'Number of Reviews', 'Price',
       'Description', 'Listening Time', 'Ranks and Genre'],
      dtype='object')

In [8]:
print("\nUnique Ratings:", df_advanced["Rating"].unique())


Unique Ratings: [ 4.9  4.6  4.4  4.1  4.5  3.6  5.   4.   4.2  4.8  4.7  3.4 -1.   3.9
  4.3  3.   3.8  2.5  2.   3.5  2.4  3.1  1.   3.2  3.7  3.3  2.9  1.9
  2.6  2.7]


In [9]:
# Count occurrences of each rating
rating_counts = df_advanced["Rating"].value_counts().sort_index()
print(rating_counts)


Rating
-1.0    421
 1.0      7
 1.9      1
 2.0      3
 2.4      2
 2.5      4
 2.6      4
 2.7      3
 2.9      3
 3.0     12
 3.1      7
 3.2      6
 3.3      6
 3.4     15
 3.5     14
 3.6     15
 3.7     35
 3.8     35
 3.9     56
 4.0    108
 4.1    143
 4.2    232
 4.3    357
 4.4    507
 4.5    665
 4.6    760
 4.7    626
 4.8    269
 4.9     49
 5.0     99
Name: count, dtype: int64


In [11]:
df.advanced["Rating"].value_counts().sort_index()

NameError: name 'df' is not defined

In [4]:
# Count occurrences of each rating
rating_counts = df_advanced["Number of Reviews"].value_counts().sort_index()
print(rating_counts)


Number of Reviews
1.0        61
2.0        43
3.0        46
4.0        30
5.0        24
           ..
33293.0     1
38310.0     1
40958.0     1
43869.0     1
70122.0     1
Name: count, Length: 1544, dtype: int64


In [8]:


if "Ranks and Genre" in df_advanced.columns:
    df_advanced[["Ranks", "Genres"]] = df_advanced["Ranks and Genre"].apply(lambda x: pd.Series(split_ranks_genre(x)))
    df_advanced.drop(columns=["Ranks and Genre"], inplace=True)

# Display the first few rows after splitting
print("\nAfter Splitting:")
display(df_advanced.head())



def split_ranks_genre(value):
    """Splits 'Ranks and Genre' column into 'Ranks' and 'Genres'."""
    if pd.isna(value):
        return pd.NA, pd.NA

    # Remove unnecessary text
    value = re.sub(r"in Audible Audiobooks & Originals \(See Top 100 in Audible Audiobooks & Originals\)", "", value)

    # Extract "Rank in Genre"
    rank_genre_pairs = re.findall(r"(#\d+) in ([^,]+)", value)

    ranks = []
    genres = []

    for rank, genre in rank_genre_pairs:
        ranks.append(rank)
        genres.append(f"{rank} in {genre.strip()}")  

    return ", ".join(ranks) if ranks else pd.NA, ", ".join(genres) if genres else pd.NA


NameError: name 'split_ranks_genre' is not defined