# This file is intended for data preprocessing and filtering

We suggest you don't run this code unless you want to reproduce the whole dataset

In [51]:
import re

import numpy as np
import pandas as pd

One-hot encoding anime sources and genres (TV series only)

In [52]:
# Read the raw anime catalogue that the following cells filter and encode.
data_anime = pd.read_csv(filepath_or_buffer='Data/Animelist.csv')

In [53]:
# Keep only the rows whose 'type' is exactly 'TV'.
tv = data_anime.loc[data_anime['type'].eq('TV')]

In [54]:
# Split the ', '-separated 'genre' string into one 0/1 indicator column per
# genre, then append those columns to the TV rows.
genres = tv['genre'].str.get_dummies(sep=', ')
genre_encoded = pd.concat((tv, genres), axis='columns')

In [55]:
# One integer indicator column per distinct 'source' value, named 'source_<value>',
# appended to the genre-encoded frame.
source_encoded = pd.get_dummies(genre_encoded['source'], prefix='source', dtype=int)
data_filtered = pd.concat((genre_encoded, source_encoded), axis='columns')

In [56]:
# Display the encoded frame (rendered below as a truncated preview).
data_filtered

Unnamed: 0,anime_id,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,...,source_Manga,source_Music,source_Novel,source_Original,source_Other,source_Picture book,source_Radio,source_Unknown,source_Visual novel,source_Web manga
0,11013,Inu x Boku SS,Inu X Boku Secret Service,妖狐×僕SS,Youko x Boku SS,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,12,Finished Airing,...,1,0,0,0,0,0,0,0,0,0
1,2104,Seto no Hanayome,My Bride is a Mermaid,瀬戸の花嫁,The Inland Sea Bride,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,26,Finished Airing,...,1,0,0,0,0,0,0,0,0,0
2,5262,Shugo Chara!! Doki,Shugo Chara!! Doki,しゅごキャラ！！どきっ,"Shugo Chara Ninenme, Shugo Chara! Second Year",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,51,Finished Airing,...,1,0,0,0,0,0,0,0,0,0
3,721,Princess Tutu,Princess Tutu,プリンセスチュチュ,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Original,38,Finished Airing,...,0,0,0,1,0,0,0,0,0,0
4,12365,Bakuman. 3rd Season,Bakuman.,バクマン。,Bakuman Season 3,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14414,37662,Shinya! Tensai Bakabon,,深夜! 天才バカボン,Late Night! Genius Bakabon,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,0,Not yet aired,...,1,0,0,0,0,0,0,0,0,0
14431,37300,Xiao Li Yu Li Xian Ji,The Adventures of Little Carp,小鲤鱼历险记,"小鲤鱼历险记,",https://myanimelist.cdn-dena.com/images/anime/...,TV,Other,52,Finished Airing,...,0,0,0,0,1,0,0,0,0,0
14442,37428,Chara to Otamajakushi Shima,,キャラとおたまじゃくし島,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Original,0,Currently Airing,...,0,0,0,1,0,0,0,0,0,0
14463,37787,Space Bug,,スペースバグ,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Original,0,Not yet aired,...,0,0,0,1,0,0,0,0,0,0


Filtering users to keep only those with a valid age, gender and location

In [57]:
# Persist the encoded TV catalogue (the DataFrame index is written as well).
data_filtered.to_csv(path_or_buf='Data/animelist_filt.csv')

In [58]:
# Load the user table and keep only rows where birth date, location and gender
# are all present.
users = (
    pd.read_csv('Data/UserList.csv')
    .dropna(subset=['birth_date', 'location', 'gender'])
)
# Birth dates that cannot be parsed become NaT instead of raising.
users['birth_date'] = pd.to_datetime(users['birth_date'], errors='coerce')
# Age in years as of 2018-01-01, rounded to the nearest whole number.
# NaT birth dates yield NaN ages, which the filter below removes.
# NOTE(review): 2018-01-01 is presumably the dataset snapshot date — confirm.
reference_offset = pd.to_datetime('2018-01-01') - users['birth_date']
users['age'] = (reference_offset / pd.Timedelta(days=365.25)).round()
del reference_offset
# While there is no minimum age, it's unlikely that kids will have their own anime lists
users = users.loc[users['age'].gt(12)]

In [59]:
# Load only the columns needed to map a city name or ISO-2 code to a country.
cities = pd.read_csv('Data/worldcities.csv', usecols=['city', 'country', 'iso2', 'population'])
# Rows without a city or a country cannot contribute to the lookup tables, and a
# missing country name would break the string matching done on country names
# later, so they are dropped. The result has to be assigned back: a bare
# `cities.dropna()` returns a new frame and leaves `cities` unchanged.
# Rows that only lack 'population' or 'iso2' are kept on purpose; missing
# populations simply sort last below.
cities = cities.dropna(subset=['city', 'country'])
# In cases when multiple cities share the same name, we keep the one with the higher population,
# since the probability of being in Paris, France is more than 100 times higher than in Paris, Texas
cities = cities.sort_values(by='population', ascending=False)
cities = cities.drop_duplicates(subset=['city'], keep='first')

In [60]:
# Lookup tables consumed by infer_country:
#   lower-cased city name  -> country name
#   upper-cased ISO-2 code -> country name
city_to_country = {
    city: country
    for city, country in zip(cities['city'].str.lower(), cities['country'])
}
country_abbr_to_country = {
    abbr: country
    for abbr, country in zip(cities['iso2'].str.upper(), cities['country'])
}
def infer_country(location, city_to_country, country_abbr_to_country):
    """Map a free-text user location to a country name.

    Matching is attempted in this order:
      1. a country name occurring as a whole word/phrase inside the location
         (the longest matching name wins),
      2. the entire location being a key of ``country_abbr_to_country``,
      3. the entire location being a key of ``city_to_country``.

    Parameters
    ----------
    location : str
        Free-text location; it is stripped and compared case-insensitively.
    city_to_country : dict
        Lower-cased city name -> country name.
    country_abbr_to_country : dict
        Upper-cased country code -> country name.

    Returns
    -------
    str
        The matched country name as stored in the dictionaries, or
        'Unknown' when nothing matches or ``location`` is not a string.
    """
    if not isinstance(location, str):
        return 'Unknown'
    location = location.lower().strip()

    # Check for country name in the location string.
    # A plain substring test mislabels locations ("indiana" contains "india",
    # "romania" contains "oman"), so the name must not be glued to other word
    # characters. Among several hits the longest name is kept, so that
    # "papua new guinea" is not reported as "guinea".
    best_country = None
    best_length = 0
    for country in country_abbr_to_country.values():
        # Missing values (e.g. NaN) or empty names cannot be matched.
        if not isinstance(country, str) or not country:
            continue
        name = country.lower()
        if len(name) <= best_length:
            continue
        # Cheap substring test first; the regex only runs on candidates.
        if name in location and re.search(
            r'(?<!\w)' + re.escape(name) + r'(?!\w)', location
        ):
            best_country = country
            best_length = len(name)
    if best_country is not None:
        return best_country

    # Check for country in iso format
    code = location.upper()
    if code in country_abbr_to_country:
        return country_abbr_to_country[code]

    # Check for exact city match
    if location in city_to_country:
        return city_to_country[location]

    # If no match found
    return 'Unknown'

In [61]:
# Resolve every free-text location to a country name, then discard the users
# for whom no country could be determined.
users = users.assign(
    country=users['location'].apply(
        infer_country, args=(city_to_country, country_abbr_to_country)
    )
)
users = users.loc[users['country'].ne('Unknown')]

In [62]:
# Save the filtered users, then display them as the cell output.
users.to_csv(path_or_buf='Data/userlist_filt.csv')
users

Unnamed: 0,username,user_id,user_watching,user_completed,user_onhold,user_dropped,user_plantowatch,user_days_spent_watching,gender,location,birth_date,access_rank,join_date,last_online,stats_mean_score,stats_rewatched,stats_episodes,age,country
0,karthiga,2255153,3,49,1,0,0,55.31,Female,"Chennai, India",1990-04-29,,2013-03-03,2014-02-04 01:32:00,7.43,0.0,3391.0,28.0,India
1,RedvelvetDaisuki,1897606,61,396,39,0,206,118.07,Female,Manila,1995-01-01,,2012-12-13,1900-05-13 02:47:00,6.78,80.0,7094.0,23.0,Philippines
3,bskai,228342,25,414,2,5,11,167.16,Male,"Nayarit, Mexico",1990-12-14,,2009-08-31,2014-05-12 16:35:00,8.27,1.0,10081.0,27.0,Mexico
5,terune_uzumaki,327311,5,5,0,0,0,15.20,Female,"Malaysia, Kuantan",1998-08-24,,2010-05-10,2012-10-18 19:06:00,9.70,6.0,920.0,19.0,Malaysia
12,HimeAria,3129315,2,87,2,0,28,20.12,Female,Poland,1996-09-26,,2013-09-08,1900-04-27 10:52:00,8.23,0.0,1314.0,21.0,Poland
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302642,yoyoman9,6454558,2,32,6,7,2,24.42,Male,Boston,1999-06-27,,2017-08-03,2017-12-17 11:27:00,7.50,0.0,1483.0,19.0,United States of America
302648,isoann,797785,14,213,11,20,136,65.22,Male,Poland:,1997-01-13,,2011-10-23,1900-04-02 13:21:00,8.03,2.0,3944.0,21.0,Poland
302655,ammaretto,381078,0,1,1,0,0,0.49,Male,"Russia, Novokuznetsk",1990-08-02,,2010-09-09,2015-06-03 10:59:00,8.00,0.0,27.0,27.0,Russia
302657,Scarlet95,2478991,6,103,10,8,54,47.94,Female,Belgium,1995-10-17,,2013-04-24,2016-12-18 08:41:00,7.40,1.0,2906.0,22.0,Belgium


Keeping only the anime lists of users who passed the filters above (the lists of filtered-out users are dropped)
###### This code may take a few minutes to run

In [63]:
# Usernames that remain after the user filtering.
unique_users = users['username'].unique()
# Load every user's anime list and keep only the rows belonging to those users.
data_user_anime = pd.read_csv(filepath_or_buffer='Data/UserAnimeList.csv')
data_user_anime_filtered = data_user_anime.loc[
    lambda frame: frame['username'].isin(unique_users)
]

In [64]:
# Persist the anime lists of the retained users (the DataFrame index is written as well).
data_user_anime_filtered.to_csv(path_or_buf='Data/userAnimeList_filt.csv')