## Feature Engineering & Data Preprocessing Demo

In [515]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

# load CHN data
df= pd.read_csv('AUC.csv')

df.head(10)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
0,Geronimo Stilton #11 & #12,Writtenby:GeronimoStilton,Narratedby:BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,Writtenby:RickRiordan,Narratedby:RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0
2,The Deep End,Writtenby:JeffKinney,Narratedby:DanRussell,2 hrs and 3 mins,06-11-20,English,4.5 out of 5 stars38 ratings,410.0
3,Daughter of the Deep,Writtenby:RickRiordan,Narratedby:SoneelaNankani,11 hrs and 16 mins,05-10-21,English,4.5 out of 5 stars12 ratings,615.0
4,"The Lightning Thief: Percy Jackson, Book 1",Writtenby:RickRiordan,Narratedby:JesseBernstein,10 hrs,13-01-10,English,4.5 out of 5 stars181 ratings,820.0
5,The Hunger Games: Special Edition,Writtenby:SuzanneCollins,Narratedby:TatianaMaslany,10 hrs and 35 mins,30-10-18,English,5 out of 5 stars72 ratings,656.0
6,Quest for the Diamond Sword,Writtenby:WinterMorgan,Narratedby:LukeDaniels,2 hrs and 23 mins,25-11-14,English,5 out of 5 stars11 ratings,233.0
7,The Dark Prophecy,Writtenby:RickRiordan,Narratedby:RobbieDaymond,12 hrs and 32 mins,02-05-17,English,5 out of 5 stars50 ratings,820.0
8,Merlin Mission Collection,Writtenby:MaryPopeOsborne,Narratedby:MaryPopeOsborne,10 hrs and 56 mins,02-05-17,English,5 out of 5 stars5 ratings,1256.0
9,The Tyrant’s Tomb,Writtenby:RickRiordan,Narratedby:RobbieDaymond,13 hrs and 22 mins,24-09-19,English,5 out of 5 stars58 ratings,820.0


In [516]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87489 entries, 0 to 87488
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         87489 non-null  object
 1   author       87489 non-null  object
 2   narrator     87489 non-null  object
 3   time         87489 non-null  object
 4   releasedate  87489 non-null  object
 5   language     87489 non-null  object
 6   stars        87489 non-null  object
 7   price        87489 non-null  object
dtypes: object(8)
memory usage: 5.3+ MB


In [517]:
# Clean 'author' and 'narrator' columns
df['author'] = df['author'].str.replace(r'Writtenby:', '', regex=True)
df['narrator'] = df['narrator'].str.replace(r'Narratedby:', '', regex=True)

df.head()

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
0,Geronimo Stilton #11 & #12,GeronimoStilton,BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,RickRiordan,RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0
2,The Deep End,JeffKinney,DanRussell,2 hrs and 3 mins,06-11-20,English,4.5 out of 5 stars38 ratings,410.0
3,Daughter of the Deep,RickRiordan,SoneelaNankani,11 hrs and 16 mins,05-10-21,English,4.5 out of 5 stars12 ratings,615.0
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiordan,JesseBernstein,10 hrs,13-01-10,English,4.5 out of 5 stars181 ratings,820.0


In [518]:
# Extract numeric rating from 'stars'
df['rating'] = df['stars'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)

df.head()

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,rating
0,Geronimo Stilton #11 & #12,GeronimoStilton,BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0,5.0
1,The Burning Maze,RickRiordan,RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0,4.5
2,The Deep End,JeffKinney,DanRussell,2 hrs and 3 mins,06-11-20,English,4.5 out of 5 stars38 ratings,410.0,4.5
3,Daughter of the Deep,RickRiordan,SoneelaNankani,11 hrs and 16 mins,05-10-21,English,4.5 out of 5 stars12 ratings,615.0,4.5
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiordan,JesseBernstein,10 hrs,13-01-10,English,4.5 out of 5 stars181 ratings,820.0,4.5


In [519]:
# Extract number of people who rated
df['num_ratings'] = df['stars'].str.extract(r'\d+ out of \d+ stars(\d+) ratings')[0].astype(float)

df.head()

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,rating,num_ratings
0,Geronimo Stilton #11 & #12,GeronimoStilton,BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0,5.0,34.0
1,The Burning Maze,RickRiordan,RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0,4.5,41.0
2,The Deep End,JeffKinney,DanRussell,2 hrs and 3 mins,06-11-20,English,4.5 out of 5 stars38 ratings,410.0,4.5,38.0
3,Daughter of the Deep,RickRiordan,SoneelaNankani,11 hrs and 16 mins,05-10-21,English,4.5 out of 5 stars12 ratings,615.0,4.5,12.0
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiordan,JesseBernstein,10 hrs,13-01-10,English,4.5 out of 5 stars181 ratings,820.0,4.5,181.0


In [520]:
# Convert 'price' to numeric
df['price'] = pd.to_numeric(df['price'], errors='coerce')

df.head()

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,rating,num_ratings
0,Geronimo Stilton #11 & #12,GeronimoStilton,BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0,5.0,34.0
1,The Burning Maze,RickRiordan,RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0,4.5,41.0
2,The Deep End,JeffKinney,DanRussell,2 hrs and 3 mins,06-11-20,English,4.5 out of 5 stars38 ratings,410.0,4.5,38.0
3,Daughter of the Deep,RickRiordan,SoneelaNankani,11 hrs and 16 mins,05-10-21,English,4.5 out of 5 stars12 ratings,615.0,4.5,12.0
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiordan,JesseBernstein,10 hrs,13-01-10,English,4.5 out of 5 stars181 ratings,820.0,4.5,181.0


In [521]:
cols_to_remove = ['time', 'stars']
df.drop(cols_to_remove, axis=1, inplace=True)

df.head()

Unnamed: 0,name,author,narrator,releasedate,language,price,rating,num_ratings
0,Geronimo Stilton #11 & #12,GeronimoStilton,BillLobely,04-08-08,English,468.0,5.0,34.0
1,The Burning Maze,RickRiordan,RobbieDaymond,01-05-18,English,820.0,4.5,41.0
2,The Deep End,JeffKinney,DanRussell,06-11-20,English,410.0,4.5,38.0
3,Daughter of the Deep,RickRiordan,SoneelaNankani,05-10-21,English,615.0,4.5,12.0
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiordan,JesseBernstein,13-01-10,English,820.0,4.5,181.0


In [522]:
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
print("Kolom Numerik:", numerical_cols)

categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()
print("Kolom Kategorikal:", categorical_cols)

Kolom Numerik: ['price', 'rating', 'num_ratings']
Kolom Kategorikal: ['name', 'author', 'narrator', 'releasedate', 'language']


In [523]:
df.duplicated().sum()

np.int64(49)

In [524]:
df.drop_duplicates(inplace=True)

df.duplicated().sum()

np.int64(0)

In [525]:
df.isna().sum()

name               0
author             0
narrator           0
releasedate        0
language           0
price           9473
rating         72374
num_ratings    78230
dtype: int64

In [526]:
# Menggantikan nilai missing value pada kolom 'rating' dengan rata-rata kolom tersebut
df['rating'].fillna(df['rating'].mean(), inplace=True)
df.head()

Unnamed: 0,name,author,narrator,releasedate,language,price,rating,num_ratings
0,Geronimo Stilton #11 & #12,GeronimoStilton,BillLobely,04-08-08,English,468.0,5.0,34.0
1,The Burning Maze,RickRiordan,RobbieDaymond,01-05-18,English,820.0,4.5,41.0
2,The Deep End,JeffKinney,DanRussell,06-11-20,English,410.0,4.5,38.0
3,Daughter of the Deep,RickRiordan,SoneelaNankani,05-10-21,English,615.0,4.5,12.0
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiordan,JesseBernstein,13-01-10,English,820.0,4.5,181.0


In [527]:
# Encode categorical variable 'language'
df = pd.get_dummies(df, columns=['language'], drop_first=True)

df.head()

Unnamed: 0,name,author,narrator,releasedate,price,rating,num_ratings,language_Hindi,language_afrikaans,language_arabic,...,language_romanian,language_russian,language_slovene,language_spanish,language_swedish,language_tamil,language_telugu,language_turkish,language_ukrainian,language_urdu
0,Geronimo Stilton #11 & #12,GeronimoStilton,BillLobely,04-08-08,468.0,5.0,34.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,The Burning Maze,RickRiordan,RobbieDaymond,01-05-18,820.0,4.5,41.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,The Deep End,JeffKinney,DanRussell,06-11-20,410.0,4.5,38.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Daughter of the Deep,RickRiordan,SoneelaNankani,05-10-21,615.0,4.5,12.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiordan,JesseBernstein,13-01-10,820.0,4.5,181.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
