# Data Cleaning and Preprocessing for anime dataset

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
# Apply seaborn's default theme to all subsequent plots.
sb.set() 

In [4]:
# Paths of the input CSV files; index 0 is the anime dataset.
file_path = ['./Dataset/anime.csv']

---

# Exploring and Cleaning `anime.csv` first

In [7]:
# Show every column whenever a frame is displayed, then load the raw
# dataset and preview its first three rows.
pd.set_option('display.max_columns', None)
data = pd.read_csv(file_path[0])
data.head(3)

Unnamed: 0,id,title,main_picture,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,start_season,broadcast,source,average_episode_duration,rating,background,studios,statistics
0,95,Turn A Gundam,{'medium': 'https://api-cdn.myanimelist.net/im...,1999-04-09,2000-04-14,"It is the Correct Century, two millennia after...",7.71,1049,2892,40743,13338,white,2005-11-11T13:54:05+00:00,2022-03-22T03:37:48+00:00,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",50,"{'year': 1999, 'season': 'spring'}","{'day_of_the_week': 'friday', 'start_time': '1...",original,1445,pg_13,,"[{'id': 14, 'name': 'Sunrise'}, {'id': 1260, '...","{'status': {'watching': '2735', 'completed': '..."
1,3665,Ginga Eiyuu Densetsu Gaiden (1999),{'medium': 'https://api-cdn.myanimelist.net/im...,1999-12-24,2000-07-21,Ginga Eiyuu Densetsu Gaiden (1999) is the seco...,8.07,472,4347,17849,6478,white,2008-01-02T21:05:12+00:00,2022-03-18T19:39:11+00:00,ova,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 8, 'name'...",28,"{'year': 1999, 'season': 'fall'}",,novel,1560,r,,"[{'id': 8, 'name': 'Artland'}, {'id': 207, 'na...","{'status': {'watching': '814', 'completed': '8..."
2,2471,Doraemon (1979),{'medium': 'https://api-cdn.myanimelist.net/im...,1979-04-02,2005-03-18,Nobita Nobi is a normal fourth grade student. ...,7.74,976,2553,51255,23826,white,2007-05-11T22:02:20+00:00,2022-03-23T06:24:35+00:00,tv,finished_airing,"[{'id': 2, 'name': 'Adventure'}, {'id': 4, 'na...",1787,"{'year': 1979, 'season': 'spring'}",,manga,660,pg,,"[{'id': 247, 'name': 'Shin-Ei Animation'}]","{'status': {'watching': '4637', 'completed': '..."


---

# 1. Checking missing values & Unused Features

In [8]:
# Report the number of missing values per column, followed by the
# (rows, columns) shape of the raw frame.
print("Missing values in dataframe:", data.isna().sum(), sep="\n")
print("data shape: ", data.shape)

Missing values in dataframe:
id                             0
title                          0
main_picture                   0
start_date                     0
end_date                     876
synopsis                     128
mean                         116
rank                           0
popularity                     0
num_list_users                 0
num_scoring_users              0
nsfw                           0
created_at                     0
updated_at                     0
media_type                     0
status                         0
genres                         8
num_episodes                   0
start_season                   0
broadcast                   5372
source                       901
average_episode_duration       0
rating                        61
background                  7067
studios                        0
statistics                     0
dtype: int64
data shape:  (8777, 26)


## 1a. Dropping Unused Features

In [9]:
# Build the working frame by removing the columns that later steps do
# not use; `data` itself is left untouched.
data_clean = data.drop(
    labels=['main_picture', 'created_at', 'updated_at', 'background'],
    axis='columns',
)
data_clean.head(3)

Unnamed: 0,id,title,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,media_type,status,genres,num_episodes,start_season,broadcast,source,average_episode_duration,rating,studios,statistics
0,95,Turn A Gundam,1999-04-09,2000-04-14,"It is the Correct Century, two millennia after...",7.71,1049,2892,40743,13338,white,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",50,"{'year': 1999, 'season': 'spring'}","{'day_of_the_week': 'friday', 'start_time': '1...",original,1445,pg_13,"[{'id': 14, 'name': 'Sunrise'}, {'id': 1260, '...","{'status': {'watching': '2735', 'completed': '..."
1,3665,Ginga Eiyuu Densetsu Gaiden (1999),1999-12-24,2000-07-21,Ginga Eiyuu Densetsu Gaiden (1999) is the seco...,8.07,472,4347,17849,6478,white,ova,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 8, 'name'...",28,"{'year': 1999, 'season': 'fall'}",,novel,1560,r,"[{'id': 8, 'name': 'Artland'}, {'id': 207, 'na...","{'status': {'watching': '814', 'completed': '8..."
2,2471,Doraemon (1979),1979-04-02,2005-03-18,Nobita Nobi is a normal fourth grade student. ...,7.74,976,2553,51255,23826,white,tv,finished_airing,"[{'id': 2, 'name': 'Adventure'}, {'id': 4, 'na...",1787,"{'year': 1979, 'season': 'spring'}",,manga,660,pg,"[{'id': 247, 'name': 'Shin-Ei Animation'}]","{'status': {'watching': '4637', 'completed': '..."


In [10]:
# Sanity check: the 26 original columns minus the 4 dropped ones should leave 22.
data_clean.shape

(8777, 22)

## 1b. Filling in NaN values 

In [11]:

# Placeholder used for each column's missing entries.
# `mean` is a numeric column, so its sentinel is the number -1.0 rather than
# the string "-1"; a string would mix str and float values in one column.
fill_values = {
    "synopsis": "no_Synopsis",
    "end_date": "airing",
    "broadcast": "{'day_of_the_week': 'NIL', 'start_time': 'NIL'}",
    "source": "unknown",
    "rating": "no_rating",
    "genres": "[{'id': -1, 'name': 'no_genre'}]",
    "mean": -1.0,
}

# Fill everything with one DataFrame-level call. Calling
# `data_clean[col].fillna(..., inplace=True)` works on a temporary Series and
# does not update the frame when pandas copy-on-write is active.
data_clean = data_clean.fillna(value=fill_values)

# Every column should now report zero missing values.
data_clean.isnull().sum()

id                          0
title                       0
start_date                  0
end_date                    0
synopsis                    0
mean                        0
rank                        0
popularity                  0
num_list_users              0
num_scoring_users           0
nsfw                        0
media_type                  0
status                      0
genres                      0
num_episodes                0
start_season                0
broadcast                   0
source                      0
average_episode_duration    0
rating                      0
studios                     0
statistics                  0
dtype: int64

In [12]:
# Preview the first rows to confirm the placeholder values were filled in.
data_clean.head()

Unnamed: 0,id,title,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,media_type,status,genres,num_episodes,start_season,broadcast,source,average_episode_duration,rating,studios,statistics
0,95,Turn A Gundam,1999-04-09,2000-04-14,"It is the Correct Century, two millennia after...",7.71,1049,2892,40743,13338,white,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",50,"{'year': 1999, 'season': 'spring'}","{'day_of_the_week': 'friday', 'start_time': '1...",original,1445,pg_13,"[{'id': 14, 'name': 'Sunrise'}, {'id': 1260, '...","{'status': {'watching': '2735', 'completed': '..."
1,3665,Ginga Eiyuu Densetsu Gaiden (1999),1999-12-24,2000-07-21,Ginga Eiyuu Densetsu Gaiden (1999) is the seco...,8.07,472,4347,17849,6478,white,ova,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 8, 'name'...",28,"{'year': 1999, 'season': 'fall'}","{'day_of_the_week': 'NIL', 'start_time': 'NIL'}",novel,1560,r,"[{'id': 8, 'name': 'Artland'}, {'id': 207, 'na...","{'status': {'watching': '814', 'completed': '8..."
2,2471,Doraemon (1979),1979-04-02,2005-03-18,Nobita Nobi is a normal fourth grade student. ...,7.74,976,2553,51255,23826,white,tv,finished_airing,"[{'id': 2, 'name': 'Adventure'}, {'id': 4, 'na...",1787,"{'year': 1979, 'season': 'spring'}","{'day_of_the_week': 'NIL', 'start_time': 'NIL'}",manga,660,pg,"[{'id': 247, 'name': 'Shin-Ei Animation'}]","{'status': {'watching': '4637', 'completed': '..."
3,21,One Piece,1999-10-20,airing,"Gol D. Roger was known as the ""Pirate King,"" t...",8.63,66,26,1812581,1020274,white,tv,currently_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",0,"{'year': 1999, 'season': 'fall'}","{'day_of_the_week': 'sunday', 'start_time': '0...",manga,1440,pg_13,"[{'id': 18, 'name': 'Toei Animation'}]","{'status': {'watching': '1227452', 'completed'..."
4,2397,Digimon Adventure: Bokura no War Game!,2000-03-04,2000-03-04,This movie takes place after the Adventure ser...,7.77,924,2135,70125,43599,white,movie,finished_airing,"[{'id': 2, 'name': 'Adventure'}, {'id': 4, 'na...",1,"{'year': 2000, 'season': 'winter'}","{'day_of_the_week': 'NIL', 'start_time': 'NIL'}",original,2460,pg,"[{'id': 18, 'name': 'Toei Animation'}]","{'status': {'watching': '653', 'completed': '6..."


---


# 2. Cleaning features by parsing JSON-like string columns and splitting them into separate columns

In [13]:
import json

In [14]:
def split_start_season(data_clean):
    """Split the ``start_season`` column into separate year and season columns.

    Each non-missing ``start_season`` value is parsed as a dict-like string
    such as ``"{'year': 1999, 'season': 'spring'}"``. Two new columns are
    added, ``start_season_year`` and ``start_season_season``, and the
    original ``start_season`` column is dropped.

    Parameters
    ----------
    data_clean : pandas.DataFrame
        Frame containing a ``start_season`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data_clean`` object, for convenience.

    Raises
    ------
    KeyError
        If ``start_season`` is not a column, or a parsed value lacks the
        ``year`` or ``season`` key.
    json.JSONDecodeError
        If a value cannot be parsed after the quote substitution.
    """
    years = []
    seasons = []

    # Iterate over the values themselves instead of using range(len(...)) as
    # index labels, so the function works for any index (e.g. after filtering).
    for raw in data_clean['start_season']:
        # NaN never compares equal to anything (including itself), so a
        # missing value has to be detected with pd.isna rather than `==`.
        if pd.isna(raw):
            years.append(np.nan)
            seasons.append(np.nan)
            continue

        # The stored text uses single quotes (Python dict repr); swap them so
        # the json parser accepts it.
        start_season = json.loads(raw.replace("'", "\""))
        years.append(start_season['year'])
        seasons.append(start_season['season'])

    # Assign whole columns at once; element-wise `df[col][row] = value` is
    # chained assignment and may write to a temporary copy.
    data_clean['start_season_year'] = years
    data_clean['start_season_season'] = seasons

    data_clean.drop(columns=['start_season'], inplace=True)

    return data_clean

In [15]:
def split_broadcast(data_clean):
    """Split the ``broadcast`` column into day-of-week and start-time columns.

    Each ``broadcast`` value is parsed as a dict-like string such as
    ``"{'day_of_the_week': 'friday', 'start_time': '17:00'}"``. Two new
    columns are added, ``broadcast_day_of_the_week`` and
    ``broadcast_start_time``, and the original ``broadcast`` column is dropped.

    A parsed value without a ``start_time`` key gets ``'NIL'`` as its start
    time. A missing (NaN/None) ``broadcast`` value gets ``'NIL'`` in both
    new columns.

    Parameters
    ----------
    data_clean : pandas.DataFrame
        Frame containing a ``broadcast`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data_clean`` object, for convenience.

    Raises
    ------
    KeyError
        If ``broadcast`` is not a column, or a parsed value lacks the
        ``day_of_the_week`` key.
    json.JSONDecodeError
        If a value cannot be parsed after the quote substitution.
    """
    days = []
    start_times = []

    # Iterate over the values themselves so the function does not depend on
    # the frame having a default 0..n-1 index.
    for raw in data_clean['broadcast']:
        if pd.isna(raw):
            days.append('NIL')
            start_times.append('NIL')
            continue

        # The stored text uses single quotes (Python dict repr); swap them so
        # the json parser accepts it.
        broadcast = json.loads(raw.replace("'", "\""))

        days.append(broadcast['day_of_the_week'])
        # Only an absent key falls back to 'NIL'; any other error propagates
        # instead of being hidden by a bare `except`.
        start_times.append(broadcast.get('start_time', 'NIL'))

    # Assign whole columns at once; element-wise `df[col][row] = value` is
    # chained assignment and may write to a temporary copy.
    data_clean['broadcast_day_of_the_week'] = days
    data_clean['broadcast_start_time'] = start_times

    data_clean.drop(columns=['broadcast'], inplace=True)

    return data_clean

In [17]:
def split_statistics(data_clean):
    """Split the ``statistics`` column into one numeric column per counter.

    Each ``statistics`` value is parsed as a dict-like string of the form
    ``"{'status': {'watching': ..., 'completed': ..., 'on_hold': ...,
    'dropped': ..., 'plan_to_watch': ...}, 'num_list_users': ...}"``.
    Six columns are added, in this order: ``statistics_watching``,
    ``statistics_completed``, ``statistics_on_hold``, ``statistics_dropped``,
    ``statistics_plan_to_watch`` and ``statistics_num_list_users``. The
    original ``statistics`` column is dropped.

    Counters are converted with ``pd.to_numeric``, so values stored as digit
    strings (e.g. ``'2735'``) become numbers. A missing (NaN/None)
    ``statistics`` value yields NaN in all six columns.

    Parameters
    ----------
    data_clean : pandas.DataFrame
        Frame containing a ``statistics`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data_clean`` object, for convenience.

    Raises
    ------
    KeyError
        If ``statistics`` is not a column, or a parsed value lacks one of
        the expected keys.
    json.JSONDecodeError
        If a value cannot be parsed after the quote substitution.
    ValueError
        If a counter cannot be converted to a number.
    """
    status_keys = ('watching', 'completed', 'on_hold', 'dropped', 'plan_to_watch')

    # One list per output column; dict insertion order fixes the column order.
    columns = {'statistics_' + key: [] for key in status_keys}
    columns['statistics_num_list_users'] = []

    # Iterate over the values themselves so the function does not depend on
    # the frame having a default 0..n-1 index.
    for raw in data_clean['statistics']:
        if pd.isna(raw):
            for values in columns.values():
                values.append(np.nan)
            continue

        # The stored text uses single quotes (Python dict repr); swap them so
        # the json parser accepts it.
        statistics = json.loads(raw.replace("'", "\""))
        status = statistics['status']

        for key in status_keys:
            columns['statistics_' + key].append(status[key])
        columns['statistics_num_list_users'].append(statistics['num_list_users'])

    # Assign whole columns at once (no chained `df[col][row] = value`), and
    # convert explicitly so the dtype does not depend on implicit casting.
    for name, values in columns.items():
        data_clean[name] = pd.to_numeric(values)

    data_clean.drop(columns=['statistics'], inplace=True)

    return data_clean