# This file is intended for data preprocessing and filtering

We suggest you don't run this code unless you want to reproduce the whole dataset

In [1]:
import numpy as np
import pandas as pd

One-hot encoding anime sources and genres

In [2]:
data_anime = pd.read_csv('Data/Animelist.csv')

In [3]:
tv = data_anime[data_anime['type'] == 'TV']

In [4]:
genres = tv['genre'].str.get_dummies(sep=', ', prefix='genre')
genre_encoded = pd.concat([tv, genres], axis=1)

TypeError: get_dummies() got an unexpected keyword argument 'prefix'

In [None]:
source_encoded = pd.get_dummies(genre_encoded['source'], prefix='source').astype(int)
data_filtered = pd.concat([genre_encoded, source_encoded], axis=1)

In [None]:
pd.set_option('display.max_columns', None)
data_filtered

Filtering users to preserve ones with a valid age and location

In [None]:
data_filtered.to_csv('Data/animelist_filt.csv')

In [None]:
users = pd.read_csv('Data/UserList.csv')
users.dropna(subset = ['birth_date', 'location', 'gender'], inplace=True)
users['birth_date'] = pd.to_datetime(users['birth_date'],errors='coerce')
users['age'] = ((pd.to_datetime('2018-01-01') - users['birth_date'])/pd.Timedelta(days=365.25)).dropna().round()
users = users[users['age'] > 12] # While there is no minimum age, its unlikely that kids will have their own anime lists

In [None]:
cities = pd.read_csv('Data/worldcities.csv', usecols=['city','country','iso2','population'])
# In cases when multiple cities share the same name, we keep the ones with higher population
# Since the probability of being in Paris, France is more than 100 times higher than in Paris, Texas
cities.dropna()
cities = cities.sort_values(by='population', ascending=False)
cities = cities.drop_duplicates(subset=['city'], keep='first')

In [None]:
city_to_country = dict(zip(cities['city'].str.lower(), cities['country'].values.tolist()))
country_abbr_to_country = dict(zip(cities['iso2'].str.upper(), cities['country']))
def infer_country(location, city_to_country, country_abbr_to_country):
    location = location.lower().strip()
    
    # Check for country name in the location string
    for country in country_abbr_to_country.values():
        if country.lower() in location:
            return country
    
    # Check for country in iso format
    if location.upper() in country_abbr_to_country:
        return country_abbr_to_country[location.upper()]
    
     # Check for exact city match
    if location in city_to_country:
        return city_to_country[location]
    
    # If no match found
    return 'Unknown'

In [None]:
users['country'] = users['location'].apply(lambda loc: infer_country(loc, city_to_country, country_abbr_to_country))
users = users[users['country'] != 'Unknown']

In [None]:
users.to_csv('Data/userlist_filt.csv')
users

Making sure to drop the lists of filtered users
###### this code may take a few minutes to run

In [None]:
unique_users = users['username'].unique()
data_user_anime = pd.read_csv('Data/UserAnimeList.csv')
data_user_anime_filtered = data_user_anime[data_user_anime['username'].isin(unique_users)]

In [None]:
data_user_anime_filtered.to_csv('Data/userAnimeList_filt.csv')