# Transforming Data

## Import

In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd

## Load

In [2]:
# Read the cleaned shows table and preview its first rows.
cleaned_csv = '../../data/shows_cleaned.csv'
df = pd.read_csv(cleaned_csv)
df.head()

Unnamed: 0,title,url,img,score,rank,popularity,watching,completed,on_hold,dropped,...,premiered,broadcast,producers,licensors,studios,Source,genres,demographic,duration,rating
0,Sousou no Frieren,https://myanimelist.net/anime/52991/Sousou_no_...,https://cdn.myanimelist.net/images/anime/1015/...,9.291,1,120,223506,816061,25840,17421,...,Fall 2023,Fridays at 23,"Aniplex, Dentsu, Shogakukan-Shueisha Productio...",Crunchyroll,Madhouse,Manga,"Adventure, Drama, Fantasy",Shounen,24 min. per ep.,PG-13 - Teens 13 or older
1,Chainsaw Man Movie: Reze-hen,https://myanimelist.net/anime/57555/Chainsaw_M...,https://cdn.myanimelist.net/images/anime/1763/...,9.141,2,836,8276,201182,1257,293,...,Unknown,Unknown,"None found, add some","None found, add some",MAPPA,Manga,"Action, Fantasy",Shounen,1 hr. 39 min.,R - 17+ (violence & profanity)
2,Fullmetal Alchemist: Brotherhood,https://myanimelist.net/anime/5114/Fullmetal_A...,https://cdn.myanimelist.net/images/anime/1208/...,9.101,3,3,281402,2610786,119454,64087,...,Spring 2009,Sundays at 17,"Aniplex, Square Enix, Mainichi Broadcasting Sy...","Funimation, Aniplex of America",Bones,Manga,"Action, Adventure, Drama, Fantasy",Shounen,24 min. per ep.,R - 17+ (violence & profanity)
3,Steins;Gate,https://myanimelist.net/anime/9253/Steins_Gate,https://cdn.myanimelist.net/images/anime/1935/...,9.071,4,14,182045,1730415,94228,60909,...,Spring 2011,Wednesdays at 02,"Frontier Works, Media Factory, Kadokawa Shoten...",Funimation,White Fox,Visual novel,"Drama, Sci-Fi, Suspense",Unknown,24 min. per ep.,PG-13 - Teens 13 or older
4,Shingeki no Kyojin Season 3 Part 2,https://myanimelist.net/anime/38524/Shingeki_n...,https://cdn.myanimelist.net/images/anime/1517/...,9.051,5,21,86877,2286973,10071,8663,...,Spring 2019,Mondays at 00,"Production I.G, Dentsu, Mainichi Broadcasting ...",Funimation,Wit Studio,Manga,"Action, Drama, Suspense",Shounen,23 min. per ep.,R - 17+ (violence & profanity)


## Parse Duration to Minutes

In [45]:
def transform_duration(dur):
    """Convert a duration label such as '1 hr. 39 min.' into whole minutes.

    Hour and minute components are summed. Everything else in the label
    (for example a trailing 'per ep.' or a seconds component) is ignored.

    Args:
        dur: Duration text, e.g. '24 min. per ep.' or '1 hr. 39 min.'.

    Returns:
        int: Total minutes, or 0 when `dur` is not a string or has no
        hour/minute component (e.g. 'Unknown').
    """
    # Missing cells reach `.apply` as float NaN; calling `.split` on them
    # used to raise AttributeError.
    if not isinstance(dur, str):
        return 0

    # Named `total_minutes` so the built-in `min` is not shadowed.
    total_minutes = 0
    # Match "<number> hr" / "<number> min" wherever they occur, so the result
    # does not depend on the exact ". " separator between components.
    for amount, unit in re.findall(r'(\d+)\s*(hr|min)', dur):
        total_minutes += int(amount) * (60 if unit == 'hr' else 1)
    return total_minutes

# Derive an integer runtime-in-minutes column from the textual `duration` column.
parsed_minutes = df['duration'].apply(transform_duration)
df['duration_minutes'] = parsed_minutes.astype(int)

## Extract Year from Aired
Extract a single year per show from the `aired` date string into the `year` Series (the result is not added to `df`).

In [None]:
def extract_year(aired):
    """Return the first (starting) year found in an `aired` string.

    The previous comma/'to'/space splitting returned the *end* year for a
    closed range ('Sep 29, 2023 to Mar 22, 2024' -> '2024'), an empty string
    for an open range ('Sep 29, 2023 to ?'), and arbitrary trailing words for
    text without a year. Searching for the first four-digit number gives the
    starting year in all of those cases.

    Args:
        aired: Air-date text. Assumed to look like 'Sep 29, 2023 to
            Mar 22, 2024', 'Oct 2, 2011' or '2009'; the column's actual
            values are not shown in this notebook, so confirm the format.

    Returns:
        str: The first four-digit year as a string, or '' when `aired` is
        not a string or contains no four-digit year.
    """
    # Missing cells arrive as float NaN, which has no string methods.
    if not isinstance(aired, str):
        return ''
    # Word boundaries keep day numbers and longer digit runs from matching.
    found = re.search(r'\b(\d{4})\b', aired)
    return found.group(1) if found else ''


# Run extract_year over every `aired` value.
# NOTE(review): the result is held only in the `year` variable; nothing in this
# notebook assigns it to a column of `df`, so it is not part of the saved output.
aired_values = df['aired']
year = aired_values.apply(extract_year)

## Split Multi-Value Columns
Split comma-separated columns into lists.

In [78]:
# Placeholder text found in the loaded data (e.g. in `producers` and
# `licensors`) when a show has no entries. It contains ', ' itself, so a plain
# split would turn it into the two bogus entries 'None found' and 'add some'.
EMPTY_PLACEHOLDER = 'None found, add some'


def _split_values(cell):
    """Split one comma-separated cell into a list of its entries.

    Returns an empty list for the "no entries" placeholder and passes
    non-string cells (such as NaN) through unchanged.
    """
    if not isinstance(cell, str):
        return cell
    if cell == EMPTY_PLACEHOLDER:
        return []
    return cell.split(', ')


# Same four source columns and `<name>_list` targets as before, in the same order.
for column in ('genres', 'studios', 'producers', 'licensors'):
    df[column + '_list'] = df[column].apply(_split_values)

df[['genres', 'genres_list']].head()

Unnamed: 0,genres,genres_list
0,"Adventure, Drama, Fantasy","[Adventure, Drama, Fantasy]"
1,"Action, Fantasy","[Action, Fantasy]"
2,"Action, Adventure, Drama, Fantasy","[Action, Adventure, Drama, Fantasy]"
3,"Drama, Sci-Fi, Suspense","[Drama, Sci-Fi, Suspense]"
4,"Action, Drama, Suspense","[Action, Drama, Suspense]"


## Calculate Popularity Metrics
Create derived metrics for engagement and popularity.

In [80]:
# Completion rate - percentage of members who completed the show
# Each rate is the share of a show's members in one engagement bucket,
# expressed as a percentage rounded to two decimals:
#   completion_rate - members who completed the show
#   drop_rate       - members who dropped the show
#   favorites_rate  - members who favorited the show
rate_definitions = [
    ('completion_rate', 'completed'),
    ('drop_rate', 'dropped'),
    ('favorites_rate', 'favorites'),
]
for rate_column, count_column in rate_definitions:
    df[rate_column] = (df[count_column] / df['members'] * 100).round(2)

df[['title', 'members', 'completion_rate', 'drop_rate', 'favorites_rate']].head(10)

Unnamed: 0,title,members,completion_rate,drop_rate,favorites_rate
0,Sousou no Frieren,1283443,63.58,1.36,6.27
1,Chainsaw Man Movie: Reze-hen,325571,61.79,0.09,2.84
2,Fullmetal Alchemist: Brotherhood,3610503,72.31,1.78,6.61
3,Steins;Gate,2761133,62.67,2.21,7.23
4,Shingeki no Kyojin Season 3 Part 2,2527577,90.48,0.34,2.47
5,Gintama: The Final,177742,55.83,1.01,2.58
6,Gintama°,681128,42.29,3.0,2.55
7,Kingdom 6th Season,28030,0.04,0.76,1.36
8,Hunter x Hunter (2011),3115772,68.58,2.17,7.25
9,Ginga Eiyuu Densetsu,354680,22.68,2.67,4.94


## Encode Categorical Variables
Create numeric encodings for key categorical columns.

In [81]:
# Label encoding for type
def _label_codes(values):
    """Map each distinct value to an integer code, in order of first appearance."""
    distinct = values.unique()
    return dict(zip(distinct, range(len(distinct))))


# Build one label -> code lookup per categorical column.
type_mapping = _label_codes(df['type'])
status_mapping = _label_codes(df['status'])
rating_mapping = _label_codes(df['rating'])

# Attach the integer codes next to the original text columns.
df['type_encoded'] = df['type'].map(type_mapping)
df['status_encoded'] = df['status'].map(status_mapping)
df['rating_encoded'] = df['rating'].map(rating_mapping)

print("Type mapping:", type_mapping)
print("\nStatus mapping:", status_mapping)
print("\nRating mapping:", rating_mapping)

Type mapping: {'TV': 0, 'Movie': 1, 'OVA': 2, 'TV Special': 3, 'ONA': 4, 'Special': 5}

Status mapping: {'Finished Airing': 0, 'Currently Airing': 1}

Rating mapping: {'PG-13 - Teens 13 or older': 0, 'R - 17+ (violence & profanity)': 1, 'R+ - Mild Nudity': 2, 'PG - Children': 3, 'G - All Ages': 4}


## Create Binary Features
Create a binary flag marking whether a show is currently airing.

In [83]:
# Is it currently airing?
# 1 when the show's status is 'Currently Airing', 0 for any other status.
currently_airing = df['status'].eq('Currently Airing')
df['is_airing'] = currently_airing.astype(int)

df[['title', 'is_airing']].head(10)

Unnamed: 0,title,is_airing
0,Sousou no Frieren,0
1,Chainsaw Man Movie: Reze-hen,0
2,Fullmetal Alchemist: Brotherhood,0
3,Steins;Gate,0
4,Shingeki no Kyojin Season 3 Part 2,0
5,Gintama: The Final,0
6,Gintama°,0
7,Kingdom 6th Season,1
8,Hunter x Hunter (2011),0
9,Ginga Eiyuu Densetsu,0


## Save

In [84]:
# Write the transformed frame to CSV without the index column, then confirm.
transformed_csv = '../../data/shows_transformed.csv'
df.to_csv(transformed_csv, index=False)
print("Transformed data saved to 'shows_transformed.csv'")

Transformed data saved to 'shows_transformed.csv'
