In [3]:
# If don't have, install only once
# !pip install kagglehub

import kagglehub
import pandas as pd
import os

path = kagglehub.dataset_download("snehangsude/audible-dataset")

df = pd.read_csv(os.path.join(path, 'audible_uncleaned.csv'))

# Check first views:
# df.head()
# df.describe()
# df.info()

In [4]:
# Clean text data in Author and Narrator columns
# Remove Writtenby: from the author column
df['author'] = df['author'].str.replace('Writtenby:', '')
# Remove Narratedby: from the narrator column
df['narrator'] = df['narrator'].str.replace('Narratedby:', '')
# df.head(2)

In [5]:
# Clean stars table
# Create separate column for ratings
df['ratings'] = df['stars'].str.extract(r'stars(\d+)\s')
df['ratings'] = df['ratings'].fillna(0).astype('int')
# Create separate column for stars
df['stars out of five'] = df['stars'].str.extract(r'(\d+)\sout')
df['stars out of five'] = df['stars out of five'].fillna(0).astype('float')
df.drop('stars', axis=1, inplace=True)
# df.head(2)
# df['stars out of five'].unique()
# df['ratings'].sample(n=30)

In [6]:
# Replace the comma with ''
df['price'] = df['price'].str.replace(',', '')
# Replace 'Free' with 0
df['price'] = df['price'].str.replace('Free', '0')
# Turn price to float
df['price'] = df['price'].astype(float)
# df['price'].sample(n=30)

In [7]:
# Convert releasedate to datetime
df['releasedate'] = pd.to_datetime(df['releasedate'])
# Inspect the dataframe 
# df.info()
# Search the entries in the time column for different spellings of min. Let' try min, mins, minutes
# df['time'].str.contains(r'\b(min|mins|minutes)\b', case=False, na=False)

# Replace hrs, mins, and 'Less than 1 minute'
df['time'] = df['time'].str.replace(r'\b(mins?|minutes?)\b', 'min', case=False, regex=True)
df['time'] = df['time'].str.replace(r'\b(hr?|hrs?)\b', 'hr', case=False, regex=True)
df['time'] = df['time'].str.replace('Less than 1 minute', '1 min')
# set(type(x) for x in df['time'])
# df['time'].isna().any()

# Extract the number of hours, turn to integer
df['hours'] = df['time'].str.extract(r'(\d+)\s*hr')
df['hours'] = df['hours'].fillna("0").astype('str')
# Extract the number of minutes, turn to integer
df['mins'] = df['time'].str.extract(r'and\s(\d+)\s*min')
df['mins'] = df['mins'].fillna("0").astype('int')
df['mins'] = df['mins'].apply(lambda x: f"{x:02}")
# Combine hours and minutes into the duration column
df['duration'] = df['hours'].astype('str') + ':' + df['mins'].astype('str')

df.drop('mins', axis=1, inplace=True)
df.drop('hours', axis=1, inplace=True)
# Check the results
df.head()

  df['releasedate'] = pd.to_datetime(df['releasedate'])


Unnamed: 0,name,author,narrator,time,releasedate,language,price,ratings,stars out of five,duration
0,Geronimo Stilton #11 & #12,GeronimoStilton,BillLobely,2 hr and 20 min,2008-04-08,English,468.0,34,5.0,2:20
1,The Burning Maze,RickRiordan,RobbieDaymond,13 hr and 8 min,2018-01-05,English,820.0,41,5.0,13:08
2,The Deep End,JeffKinney,DanRussell,2 hr and 3 min,2020-06-11,English,410.0,38,5.0,2:03
3,Daughter of the Deep,RickRiordan,SoneelaNankani,11 hr and 16 min,2021-05-10,English,615.0,12,5.0,11:16
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiordan,JesseBernstein,10 hr,2010-01-13,English,820.0,181,5.0,10:00


In [8]:
# Transform prices to USD (multiply times 0.012)
df['price'] = df['price'] * 0.012
# Check the results
# df['price'].sample(n=30)

In [9]:
# Update capitalization in the language column
df['language'] = df['language'].str.capitalize()
# Check the results
# df['language'].sample(n=40)

In [None]:
# Checking the duplicates
# duplicates = df.duplicated().sum()
# print(duplicates)
subset_cols = ['name', 'author', 'narrator', 'duration', 'price']
duplicates = df.duplicated(subset_cols, keep=False)
print(duplicates.sample(n=40))
# drop duplicates
unduped_df = df.drop_duplicates()

1183     False
31728    False
30336    False
18977    False
49609    False
9585     False
2587     False
22585    False
31531    False
53013    False
43377    False
81450    False
1080     False
4734     False
73785    False
6547     False
74115    False
83314    False
48616    False
11731    False
53071    False
59766    False
61584    False
77504    False
87375    False
19870    False
48920    False
40482    False
60059    False
16731    False
59108    False
2417     False
68204    False
65545    False
73185    False
70491    False
17423    False
7752     False
9137     False
9741     False
dtype: bool


In [None]:
#  Nan values
# print(df.isna().sum())

name                 0
author               0
narrator             0
time                 0
releasedate          0
language             0
price                0
ratings              0
stars out of five    0
duration             0
dtype: int64


In [None]:
# Save to csv
df.to_csv('audible.csv', index=False)