Analysis links:
- https://medium.com/@naviubhi29/data-analysis-and-visualization-on-myanimelist-data-71129f499d7a
- https://id.quora.com/Mengapa-genre-Slice-of-Life-banyak-yg-bilang-puncak-dari-wibu-Bukan-kah-itu-artinya-mulai-bosen-nonton-anime

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from PIL import Image
import uuid  # Import UUID module
import os
import json

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [3]:
# Sidebar rows whose value is plain text:
# (output key, accepted labels, whether inner spaces are stripped).
_TEXT_FIELDS = (
    ('episode', ('Episodes:',), True),
    ('type', ('Type:',), True),
    # Duration keeps its spaces (e.g. "24 min. per ep.").
    ('duration', ('Duration:',), False),
    ('rating', ('Rating:',), True),
    ('aired', ('Aired:',), True),
    ('premiered', ('Premiered:',), True),
    # NOTE(review): the site appears to label this row "Members:"; the
    # original singular "Member:" is kept as a fallback -- confirm on a live page.
    ('member', ('Members:', 'Member:'), True),
    ('favorite', ('Favorites:',), True),
)

# Sidebar rows whose value is a list of links: (output key, accepted labels).
# NOTE(review): plural headings ("Themes:", "Demographics:") are presumably
# used when an entry has several values -- confirm on a live page.
_LINK_FIELDS = (
    ('producer', ('Producers:',)),
    ('studio', ('Studios:',)),
    ('theme', ('Theme:', 'Themes:')),
    ('demographic', ('Demographic:', 'Demographics:')),
)


def _find_label(soup_detail, labels):
    """Return the first ``<span class="dark_text">`` whose text equals one of ``labels``, else None."""
    for label in labels:
        marker = soup_detail.find('span', class_='dark_text', string=label)
        if marker is not None:
            return marker
    return None


def _scrape_detail(driver, anime_data):
    """Load ``anime_data['link']`` in ``driver`` and fill ``anime_data`` in place.

    Fields whose element is missing from the page keep the default value
    already present in ``anime_data``.
    """
    driver.get(anime_data['link'])
    soup_detail = BeautifulSoup(driver.page_source, 'html.parser')

    # Image: prefer the lazy-load attribute, fall back to ``src`` so a missing
    # ``data-src`` no longer raises KeyError.
    image_tag = soup_detail.find('img', itemprop='image')
    if image_tag is not None:
        anime_data['image_url'] = image_tag.get('data-src') or image_tag.get('src', '')

    # Genre
    genre_tags = soup_detail.find_all('span', itemprop='genre')
    if genre_tags:
        anime_data['genre'] = [g.text for g in genre_tags]

    # Synopsis (newlines flattened to spaces)
    synopsis_tag = soup_detail.find('p', itemprop='description')
    if synopsis_tag is not None:
        anime_data['synopsis'] = synopsis_tag.text.replace('\n', ' ')

    # Link-list rows: collect the text of every <a> sibling after the label.
    for key, labels in _LINK_FIELDS:
        marker = _find_label(soup_detail, labels)
        if marker is not None:
            anime_data[key] = [a.text for a in marker.find_next_siblings('a')]

    # Plain-text rows: take everything after the first ':' of the row's text.
    for key, labels, strip_spaces in _TEXT_FIELDS:
        marker = _find_label(soup_detail, labels)
        if marker is None:
            continue
        value = marker.find_parent().text.strip().split(':', 1)[1].replace("\n", "")
        anime_data[key] = value.replace(" ", "") if strip_spaces else value


def scrapData(start_, end_):
    """Scrape MyAnimeList top-anime ranking pages and save one JSON file per page.

    For every offset ``i`` in ``range(start_, end_, 50)`` the page
    ``topanime.php?limit={i}`` is loaded, each ranked entry's detail page is
    visited, and the collected records are written to
    ``./dataset/data/data_{i}.json``. Offsets whose file already exists are
    skipped, so an interrupted run can be resumed by calling again.

    Args:
        start_: First ranking offset to scrape.
        end_: Exclusive upper bound for the offsets.

    Returns:
        None. A completion message is printed.
    """
    for i in range(start_, end_, 50):
        # Skip offsets that were already scraped and saved.
        file_path = f'./dataset/data/data_{i}.json'
        if os.path.exists(file_path):
            continue

        options = webdriver.ChromeOptions()
        options.add_argument("--headless")  # Run Chrome in headless mode
        options.add_argument("--disable-blink-features=AutomationControlled")

        service = Service(ChromeDriverManager().install())
        # One browser serves both the ranking page and the detail pages: the
        # ranking HTML is parsed into ``soup`` before any detail page loads.
        # The original started a second browser here and never quit it.
        driver = webdriver.Chrome(service=service, options=options)
        try:
            driver.get(f"https://myanimelist.net/topanime.php?limit={i}")

            # Wait until the ranking-list element appears.
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "ranking-list"))
                )
            except Exception:
                # Best effort: a page that never shows rankings is skipped.
                # ``Exception`` (not a bare except) lets Ctrl-C still abort.
                continue

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            rows = soup.find_all('tr', class_='ranking-list')
            if not rows:
                continue

            data = []
            total = len(rows)
            for idx, row in enumerate(rows):
                link = row.find('a').get('href')
                anime_data = {
                    'id': str(uuid.uuid4()),
                    # Fifth '/'-separated segment of the detail URL.
                    'mal_id': link.split('/')[4],
                    'title': row.find('div', class_='di-ib').text,
                    'image_url': '',
                    'synopsis': '',
                    'aired': '',
                    'premiered': '',
                    'member': '',
                    'favorite': '',
                    'rank': row.find('span', class_='lightLink').text,
                    'link': link,
                    # Stays the int 0 when the row is missing; scraped values are strings.
                    'episode': 0,
                    'type': '',
                    'genre': [],
                    'producer': [],
                    'studio': [],
                    'theme': [],
                    'demographic': [],
                    'duration': '',
                    'rating': '',
                }

                _scrape_detail(driver, anime_data)
                data.append(anime_data)

                # Redraw a ten-segment progress bar on the same console line.
                progress = (idx + 1) / total * 100
                filled = int(progress // 10)
                print(f"\rScraping Data ke {i} [{'=' * filled}{' ' * (10 - filled)}] {idx + 1}/{total}", end="", flush=True)
            print()

            # Create the output directory on demand so a finished page is not
            # lost to a missing folder.
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=4)

            print(data)
        finally:
            # Always release the browser, including on skip or error.
            driver.quit()

    print(f"Data Scrap {start_} to {end_} Completed")

In [4]:
# Scrape ranking offsets 0, 50, ..., 5000; offsets whose JSON file already exists are skipped.
scrapData(0, 5050)

Data Scrap 0 to 5050 Completed


In [5]:
# Scrape ranking offsets 5050, 5100, ..., 10000; offsets whose JSON file already exists are skipped.
scrapData(5050, 10050)

Data Scrap 5050 to 10050 Completed


In [6]:
# Scrape ranking offsets 10050, 10100, ..., 15000; offsets whose JSON file already exists are skipped.
scrapData(10050, 15050)

Data Scrap 10050 to 15050 Completed


In [7]:
# Scrape ranking offsets 15050, 15100, ..., 20000; offsets whose JSON file already exists are skipped.
scrapData(15050, 20050)

Data Scrap 15050 to 20050 Completed


In [None]:
# Scrape ranking offsets 20050, 20100, ..., 28000; offsets whose JSON file already exists are skipped.
scrapData(20050, 28050)

Data Scrap 20050 to 28000 Completed


In [9]:
# import time

# for idx in range(1, 50):  # Example loop (idx runs from 1 to 49)
#     # Progress bar that is redrawn in place
#     progress = (idx + 1) / 50 * 100
#     print(f"\r Scraping Data ke {i} [{'=' * int(progress // 10)}{' ' * (10 - int(progress // 10))}] {idx + 1}/{50}", end="", flush=True)
#     time.sleep(0.2)  # Simulated delay
# print()  # Move to a new line when finished

In [10]:
# options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Run Chrome in headless mode
# options.add_argument("--disable-blink-features=AutomationControlled")

# service = Service(ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service, options=options)
# driver.get(f"https://myanimelist.net/topanime.php?limit=100")
# page = driver.page_source
# soup = BeautifulSoup(page, 'html.parser')
# list = soup.find_all('tr', class_='ranking-list')
# data = []

# for i in list:
#     uuid_data = str(uuid.uuid4())
#     title = i.find('div', class_='di-ib').text
#     rank = i.find('span', class_='lightLink').text
#     link = i.find('a').get('href')
#     mal_id = link.split('/')[4]
    
#     anime_data = {
#         'id': uuid_data,
#         'mal_id': mal_id,
#         'title': title,
#         'rank': rank,
#         'link': link,
#         'synopsis': '',
#         'episode': 0,
#         'type': '',
#         'genre': [],
#         'producer': [],
#         'studio': [],
#         'theme': [],
#         'demographic': [],
#         'duration': '',
#         'rating': '',
#     }
    
#     # # Detail 
#     driver.get(link)
#     page_detail = driver.page_source
#     soup_detail = BeautifulSoup(page_detail, 'html.parser')
    
#     # Synopsis
#     synopsis_ = soup_detail.find('p', itemprop='description')
#     if synopsis_.text:
#         cleaned_synopsis = synopsis_.text.replace('\n', ' ')
#         anime_data['synopsis'] = cleaned_synopsis
    
#     # Genre
#     genre_list = soup_detail.find_all('span', itemprop='genre')
#     if genre_list:
#         anime_data['genre'] = [g.text for g in genre_list]
    
#     # Producer
#     producer_ = soup_detail.find('span', class_='dark_text', text='Producers:')
#     if producer_:
#         producers = [a.text for a in producer_.find_next_siblings('a')]
#         anime_data['producer'] = producers
    
#     # Studio
#     studio_ = soup_detail.find('span', class_='dark_text', text='Studios:')
#     if studio_:
#         studios = [a.text for a in studio_.find_next_siblings('a')]
#         anime_data['studio'] = studios

#     # Theme
#     theme_ = soup_detail.find('span', class_='dark_text', text='Theme:')
#     if theme_:
#         theme_ = [a.text for a in theme_.find_next_siblings('a')]
#         anime_data['theme'] = theme_

#     # Demographic
#     demographic_ = soup_detail.find('span', class_='dark_text', text='Demographic:')
#     if demographic_:
#         demographic_ = [a.text for a in demographic_.find_next_siblings('a')]
#         anime_data['demographic'] = demographic_

#     # # Episode
#     episode_ = soup_detail.find('span', class_='dark_text', text='Episodes:')
#     if episode_:
#         anime_data['episode'] = episode_.find_parent().text.strip().split(':', 1)[1].replace("\n", "").replace(" ", "")

#     # Type
#     type_ = soup_detail.find('span', class_='dark_text', text='Type:')
#     if type_:
#         anime_data['type'] = type_.find_parent().text.strip().split(':', 1)[1].replace("\n", "").replace(" ", "")

#     # Duration
#     duration_ = soup_detail.find('span', class_='dark_text', text='Duration:')
#     if duration_:
#         anime_data['duration'] = duration_.find_parent().text.strip().split(':', 1)[1].replace("\n", "")

#     # Rating
#     rating_ = soup_detail.find('span', class_='dark_text', text='Rating:')
#     if rating_:
#         anime_data['rating'] = rating_.find_parent().text.strip().split(':', 1)[1].replace("\n", "").replace(" ", "")
    

#     data.append(anime_data)
    
# print(data)