# Data cleaning

---

In [176]:
import pandas as pd
import re

# Load the raw Audible export; the cleaning cells below all operate on `df`.
RAW_DATA_PATH = "audible_uncleaned.csv"
df = pd.read_csv(RAW_DATA_PATH)

# df.info(verbose=True, show_counts=True)

# 1. Remove the "Writtenby:" and "Narratedby:" prefixes from the author and narrator columns [data messiness]
# 2. Insert spaces into the run-together author and narrator names (e.g. "RickRiordan" -> "Rick Riordan") [data consistency]
# 3. Remove the rows whose author is "Various authors" [data accuracy]
# 4. Convert the releasedate column to a datetime object [data validity]
# 5. Convert the time column to minutes [data validity]
# 6. Drop the rows with "Not rated yet" in the stars column [data completeness, data accuracy]
# 7. Split the stars column into a stars column and a ratings (total count) column [data accuracy]
# 8. Remove the 'ratings' text from the ratings column [data messiness]
# 9. Convert the language column to a categorical column [data validity]

In [177]:
# Strip the fixed label that precedes the value in each people column.
for column, prefix in (("author", "Writtenby:"), ("narrator", "Narratedby:")):
    df[column] = df[column].str.removeprefix(prefix)

In [178]:
def titlecase_to_spaced(x):
    """Insert a space wherever a lowercase letter is directly followed by an uppercase one.

    Example: "RickRiordan" -> "Rick Riordan". Strings without such a
    lowercase/uppercase boundary are returned unchanged.
    """
    boundary = re.compile(r"([a-z])([A-Z])")
    return boundary.sub(r"\1 \2", x)

# Re-space the run-together names in both people columns.
for people_column in ("narrator", "author"):
    df[people_column] = df[people_column].apply(titlecase_to_spaced)

In [179]:
# Drop the placeholder "various authors" rows.
# The "Writtenby:" prefix has already been stripped from `author` by the
# earlier prefix-removal cell, so comparing against the prefixed literal alone
# never matches. Check the stripped form too; the prefixed form is kept so the
# cell still works if it is run before the prefix removal.
is_various = (df["author"] == "Writtenby:Variousauthors") | (
    df["author"] == "Variousauthors"
)
df = df[~is_various]

In [180]:
def datetime(x, pivot=69):
    """Rewrite every ``D-M-Y`` date found in ``x`` as ``Y-M-D`` with a full year.

    Two-digit years are expanded with the POSIX ``%y`` convention: values
    from ``pivot`` (default 69) to 99 become 19xx, lower values become 20xx.
    Unconditionally prefixing "20" would turn a year such as 98 into 2098.
    Years that already have four or more digits are kept verbatim.

    The name is kept as ``datetime`` because the release-date conversion cell
    calls it under that name.

    Parameters
    ----------
    x : str
        Text containing dates written as digits-digits-digits.
    pivot : int, optional
        First two-digit year that is mapped to the 1900s.

    Returns
    -------
    str
        ``x`` with each matched date rewritten; text that does not match the
        pattern is left as is.
    """

    def _to_iso(match):
        day, month, year = match.groups()
        if len(year) < 4:
            in_1900s = len(year) == 2 and int(year) >= pivot
            year = ("19" if in_1900s else "20") + year
        return f"{year}-{month}-{day}"

    return re.sub(r"(\d+)-(\d+)-(\d+)", _to_iso, x)

# Rewrite each raw date string with the `datetime` helper defined in this cell,
# then let pandas parse the rewritten strings into datetime values.
rewritten_dates = df["releasedate"].apply(datetime)
df["releasedate"] = pd.to_datetime(rewritten_dates)

In [None]:
# Step 5 of the cleaning plan: convert the textual `time` column to minutes.
# Optional "<n> hr(s)" part, optional "and", optional "<n> min..." part.
TIME_PATTERN = re.compile(r"(?:(\d+)\s*hrs?)?\s*(?:and)?\s*(?:(\d+)\s*min)?")


def time_to_minutes(x):
    """Return the duration described by ``x`` as a total number of minutes.

    Example: "2 hrs and 20 mins" -> 140, "45 mins" -> 45.

    Every part of the pattern is optional, so it always matches at the start
    of the string; text that carries neither an hours nor a minutes figure in
    the expected position therefore yields 0.

    Non-string values (for example a column that was already converted, or a
    missing value) are returned unchanged so the cell can be re-run safely.
    """
    if not isinstance(x, str):
        return x
    match = TIME_PATTERN.match(x)
    hours = int(match.group(1) or 0)
    minutes = int(match.group(2) or 0)
    return hours * 60 + minutes


df["time"] = df["time"].apply(time_to_minutes)

In [182]:
# Keep only the rows whose `stars` text is something other than the
# "Not rated yet" placeholder.
is_rated = df["stars"] != "Not rated yet"
df = df[is_rated]

In [None]:
# Split the combined `stars` text around "out of 5 stars": the part before it
# is the score, the part after it holds the number of ratings.
stars_parts = df["stars"].str.split("out of 5 stars", expand=True)

# `assign` builds a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning. `ratings` is appended as the last
# column, exactly as the original column assignment did.
df = df.assign(
    stars=stars_parts[0].astype(float),
    ratings=(
        stars_parts[1]
        # Drop thousands separators first; otherwise "1,234" would be read
        # as 1 because the digit pattern stops at the comma.
        .str.replace(",", "", regex=False)
        .str.extract(r"(\d+)", expand=False)
        .astype(int)
    ),
)

In [184]:
# Show the cleaned frame as this cell's output (pandas elides the middle rows).
df

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,ratings
0,Geronimo Stilton #11 & #12,Geronimo Stilton,Bill Lobely,140,2008-08-04,English,5.0,468.00,34
1,The Burning Maze,Rick Riordan,Robbie Daymond,788,2018-05-01,English,4.5,820.00,41
2,The Deep End,Jeff Kinney,Dan Russell,123,2020-11-06,English,4.5,410.00,38
3,Daughter of the Deep,Rick Riordan,Soneela Nankani,676,2021-10-05,English,4.5,615.00,12
4,"The Lightning Thief: Percy Jackson, Book 1",Rick Riordan,Jesse Bernstein,600,2010-01-13,English,4.5,820.00,181
...,...,...,...,...,...,...,...,...,...
87228,Why We Fly,Evan Rail,Evan Rail,56,2014-06-23,English,5.0,100.00,1
87337,River Town,Peter Hessler,Peter Berkrot,876,2010-05-04,English,5.0,836.00,1
87340,Figures in a Landscape,Paul Theroux,Edoardo Ballerini,1004,2018-06-07,English,4.0,949.00,1
87362,Tower of London,i Minds,Abbey Holmes,8,2009-12-23,English,5.0,33.00,1
