## Scraping Data from Website + Pandas

In [2]:
from bs4 import BeautifulSoup
import requests

In [3]:
# URL de la lista de animes por número de episodios
url = "https://en.wikipedia.org/wiki/List_of_anime_series_by_episode_count"

# Cabecera para evitar el bloqueo del servidor
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/125.0.0.0 Safari/537.36"
}

# Hacer la petición
page = requests.get(url, headers=headers)

# Crear el objeto BeautifulSoup
soup = BeautifulSoup(page.text, "html")

In [3]:
#print(soup)

In [4]:
#soup.find_all('table')[1]

In [5]:
#table = soup.find_all('table')[1]

In [4]:
table = soup.find('table', class_ ='wikitable sortable')

In [5]:
headers = [th.text.strip() for th in table.find_all('th')]

In [6]:
print(headers)

['No.', 'Series title', 'Started broadcasting', 'Finished broadcasting', 'Episode count', 'Runtime']


In [7]:
import pandas as pd

In [13]:
column_data=table.find_all('tr')

In [15]:
all_rows = []
last_rank = None
pending_rowspans = {}  # columna: [valor, filas restantes]

for row in column_data:
    for sup in row.find_all('sup'):
        sup.decompose()

    row_cells = row.find_all('td')
    cells = []
    
    # Primero, aplicar valores pendientes por rowspan de filas anteriores
    for col_idx in range(len(headers)):
        if col_idx in pending_rowspans:
            val, remaining = pending_rowspans[col_idx]
            cells.append(val)
            pending_rowspans[col_idx][1] -= 1
            if pending_rowspans[col_idx][1] == 0:
                del pending_rowspans[col_idx]
        else:
            # Si hay una celda nueva para esta columna
            if row_cells:
                td = row_cells.pop(0)
                text = td.get_text(strip=True)
                cells.append(text)
                
                # Si la celda tiene rowspan, guardamos para futuras filas
                if td.has_attr('rowspan'):
                    pending_rowspans[col_idx] = [text, int(td['rowspan']) - 1]
            else:
                cells.append('')  # si no hay más celdas, rellenar vacío
    
    # Manejo del ranking
    if cells and cells[0] != '' and cells[0].isdigit():
        last_rank = cells[0]
    elif cells:
        cells[0] = last_rank  # reutilizar último ranking si está vacío
    
    # Si la fila tiene menos columnas, rellenar
    while len(cells) < len(headers):
        cells.append('')
    
    all_rows.append(cells)


In [20]:
#for r in all_rows:
    #print(r)

In [17]:
df = pd.DataFrame(all_rows, columns=headers)

df

Unnamed: 0,No.,Series title,Started broadcasting,Finished broadcasting,Episode count,Runtime
0,,,,,,
1,1,Sazae-san,"October 5, 1969",Currently in production,2771,20–26 minutes
2,2,Nintama Rantarō,"April 10, 1993",Currently in production,2504,10 minutes
3,3,Ojarumaru,"October 5, 1998",Currently in production,2127,10 minutes
4,4,Oyako Club,"October 3, 1994","March 30, 2013",1818,5 minutes
...,...,...,...,...,...,...
174,118,Aikatsu Stars!,"April 7, 2016","March 29, 2018",100,20–26 minutes
175,118,Coji-Coji,"October 4, 1997","September 25, 1999",100,20–26 minutes
176,118,Dragon Quest: The Adventure of Dai(2020),"October 3, 2020","October 22, 2022",100,20–26 minutes
177,118,GeGeGe no Kitarō(2007),"April 1, 2007","March 29, 2009",100,20–26 minutes


In [19]:
df.to_csv(r'C:\Users\linkm\Documents\Python\Web_Scraping\Animes.csv',index = False)