# Sanime ETL - Scraping
## Scrape data from MyAnimeList website
Before running this notebook, sign up at [MyAnimeList](https://myanimelist.net), submit a request for API access to obtain your `client_id`, and fill in the `config.json` file.

In [19]:
# Relative path of the JSON configuration file; it is read below for the
# MyAnimeList client id and the MongoDB credentials.
config_file = '../config.json'

Import the required modules and read the config file

In [20]:
import os
import json
import requests
import pandas as pd
import pymongo
import bson.json_util as json_util

# Load the configuration once. The encoding is stated explicitly so the file
# is decoded as UTF-8 regardless of the platform's default text encoding.
with open(config_file, encoding='utf-8') as f:
    config = json.load(f)

# Client id sent in the X-MAL-CLIENT-ID header of every MyAnimeList API request.
client_id = config['MyAnimeList']['client_id']
# MongoDB settings; the username, password, database and cluster keys are
# read when the connection string is built.
mongo_cred = config['MongoDB']

Create a function that downloads the anime ids using the `requests` module

In [26]:
def get_all_anime_ids(client_id:str ) -> list:
    """Return the ids of all anime listed by the MyAnimeList ranking endpoint.

    The endpoint is paged: pages of ``limit`` entries are requested with an
    increasing ``offset`` until the ``paging`` object of a response no longer
    contains a ``next`` link.

    The download is best-effort: on a non-200 response or a network failure
    the error is printed and the ids collected so far are returned.

    Args:
        client_id (str): The client id for the MyAnimeList API
    Returns:
        list: The ``node.id`` value of every ranking entry, in the order the
            API returned them
    """
    url = 'https://api.myanimelist.net/v2/anime/ranking'
    headers = {'X-MAL-CLIENT-ID': client_id}
    anime_ids = []
    limit = 100
    offset = 0
    # NOTE(review): no ranking_type query parameter is sent, so whatever the
    # server uses as its default applies — confirm that is intended.
    while True:
        params = {'limit': limit, 'offset': offset}
        try:
            # Same timeout as get_anime_details, so a stalled connection
            # cannot block the paging loop indefinitely.
            response = requests.get(url, headers=headers, params=params, timeout=5)
        except requests.RequestException as exc:
            print(f"Errore: {exc}")
            break

        if response.status_code != 200:
            print(f"Errore {response.status_code}: {response.text}")
            break

        data = response.json()
        # Each ranking entry wraps the anime as {"node": {"id": ...}, "ranking": {...}};
        # keep only the id so callers really get a list of ids.
        anime_ids.extend(entry['node']['id'] for entry in data['data'])

        # The last page carries no "next" link.
        if 'next' not in data.get('paging', {}):
            break
        offset += limit

    return anime_ids

Download the anime ids and save them to a CSV file as a backup

In [None]:
# Download every id from the ranking endpoint and keep a CSV copy of the list.
anime_ids_list = get_all_anime_ids(client_id)
df = pd.DataFrame(anime_ids_list, columns=['anime_id'])

# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
anime_list_csv = '../data/src/anime_list.csv'
os.makedirs(os.path.dirname(anime_list_csv), exist_ok=True)
df.to_csv(anime_list_csv, index=False)

In [35]:
# Replaces the downloaded id list with a two-element sample, so the cells
# below only fetch the anime with ids 1 and 5.
# NOTE(review): presumably a debugging shortcut — remove to process the full list.
anime_ids_list = [1, 5]

function to download anime details

In [39]:
# Function to fetch the details of a single anime
def get_anime_details(anime_id:str, client_id:str) -> "dict | None":
    """Return the details of one anime from the MyAnimeList API.

    Failures are reported, not raised: on a non-200 response or a network
    error (timeout, connection failure) the error is printed and ``None`` is
    returned, so a caller looping over many ids can simply skip the entry.

    Args:
        anime_id (str): The anime id
        client_id (str): The client id for the MyAnimeList API
    Returns:
        dict | None: The decoded JSON body of the response, or ``None`` when
            the request failed
    """
    # Fields requested from the API; joined with commas into the query string.
    fields = (
        'id', 'title', 'main_picture', 'alternative_titles', 'start_date',
        'end_date', 'synopsis', 'mean', 'rank', 'popularity',
        'num_list_users', 'num_scoring_users', 'nsfw', 'created_at',
        'updated_at', 'media_type', 'status', 'genres', 'my_list_status',
        'num_episodes', 'start_season', 'broadcast', 'source',
        'average_episode_duration', 'rating', 'pictures', 'background',
        'related_anime', 'related_manga', 'recommendations', 'studios',
        'statistics',
    )
    fields_param = ','.join(fields)
    url = f'https://api.myanimelist.net/v2/anime/{anime_id}?fields={fields_param}'
    headers = {'X-MAL-CLIENT-ID': client_id}

    try:
        response = requests.get(url, headers=headers, timeout=5)
    except requests.RequestException as exc:
        # Treat transport errors like HTTP errors so that one bad request
        # does not abort a whole batch download.
        print(f"Errore: {exc}")
        return None

    if response.status_code != 200:
        print(f"Errore {response.status_code}: {response.text}")
        return None
    return response.json()


download anime details

In [46]:
# Fetch the details for every id in anime_ids_list; ids whose request
# produced a falsy result (e.g. None on failure) are left out.
anime_details = []
for anime_id in anime_ids_list:
    anime_detail = get_anime_details(str(anime_id), client_id)
    if not anime_detail:
        continue
    anime_details.append(anime_detail)

create a MongoDB connection to load data into MyAnimeList db

In [48]:
from urllib.parse import quote_plus

# Credentials embedded in a MongoDB URI must be percent-escaped; otherwise a
# username or password containing characters such as '@', ':' or '/' produces
# an invalid connection string.
# NOTE(review): assumes config.json stores the raw (not already escaped) credentials.
mongo_user = quote_plus(str(mongo_cred['username']))
mongo_password = quote_plus(str(mongo_cred['password']))

connection_string = f"mongodb+srv://{mongo_user}:{mongo_password}@{mongo_cred['database']}.{mongo_cred['cluster']}.mongodb.net/"
client = pymongo.MongoClient(connection_string)
# Database that receives the scraped source data.
db = client['MyAnimeList']

save data into src_anime_details collection

In [None]:
# insert_many rejects an empty document list, which is what anime_details is
# when every download failed; skip the write in that case instead of raising.
if anime_details:
    db.src_anime_details.insert_many(anime_details)
else:
    print("No anime details to insert")

save source data locally (data/src/anime_details folder) as anime_[anime_id].json

In [None]:
anime_details_folder = '../data/src/anime_details/'

# Create the output folder (and any missing parents); a no-op when it already
# exists, and free of the check-then-create race of an explicit exists() test.
os.makedirs(anime_details_folder, exist_ok=True)

# One file per anime, named anime_<id>.json. Serialisation goes through
# bson's json_util rather than the plain json module.
for anime in anime_details:
    file_name = os.path.join(anime_details_folder, f'anime_{anime["id"]}.json')
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(json_util.dumps(anime, indent=4) + '\n')