In [1]:
import wikipedia
from bs4 import BeautifulSoup, SoupStrainer
import httplib2
from tqdm import tqdm

# Originally, this was required to implement the 'built-in search'.
# It's no longer really required, but leaving it in because WikiMedia.
wikipedia.set_rate_limiting(True)

#### The wikipedia article "Category: Lists of anime episodes" has *exactly* what I need. It contains, in order, all the anime wikipedia has lists of episodes for. Link: https://en.wikipedia.org/wiki/Category:Lists_of_anime_episodes

In [2]:
main_page = wikipedia.page("Category:Lists of anime episodes")

In [3]:
main_page.title, main_page.pageid

('Category:Lists of anime episodes', '20148056')

#### Retrieve the HTML for this page. The HTML will have the links to all the anime sub-sections, i.e., alphabetically organized blocks, as shown below.

In [4]:
html = main_page.html()

In [5]:
alphabet_links = list()
for link in tqdm(BeautifulSoup(html, parse_only=SoupStrainer('a'))):
    if link.has_attr('href'):
        alphabet_links.append(link['href'])

100%|██████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<?, ?it/s]


In [6]:
alphabet_links[0]

'https://en.wikipedia.org/wiki/Category:Lists_of_anime_episodes'

#### The first link retrieved here is actually the link to `main_page`, so we will be discarding it.

In [7]:
alphabet_links = alphabet_links[1:]
alphabet_links

['https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=0',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=A',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=B',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=C',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=D',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=E',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=F',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=G',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=H',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=I',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=J',
 'https://en.wikipedia.org/w/index.php?titl

In [8]:
alphabet_links

['https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=0',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=A',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=B',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=C',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=D',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=E',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=F',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=G',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=H',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=I',
 'https://en.wikipedia.org/w/index.php?title=Category:Lists_of_anime_episodes&from=J',
 'https://en.wikipedia.org/w/index.php?titl

### So now, we have all the links we need. For each of those pages, we will retrieve the links too all the Anime present.

Unfortunately for us, some of the anchors, don't have valid `hrefs`, so we have to write a check for them.

In [9]:
anime_links = list()

for link in tqdm(alphabet_links):
    http = httplib2.Http()
    status, response = http.request(link)
    soup = BeautifulSoup(response, parse_only=SoupStrainer('a'))
    single_anime_links = soup.find_all("a")
    for single_link in single_anime_links:
        if single_link.has_attr('href'):
            anime_links.append(single_link['href'])

100%|██████████████████████████████████████████████████████████████████████████████████| 27/27 [01:05<00:00,  2.41s/it]


In [10]:
anime_links = list(filter(lambda x : x.startswith('/wiki/List_of_'), anime_links))

We have a lot of duplicates too, because some of the letters overlap in the links we got from wikipedia.

In [11]:
sorted(anime_links)[:20]

['/wiki/List_of_%C5%8Cban_Star-Racers_episodes',
 '/wiki/List_of_%C5%8Cban_Star-Racers_episodes',
 '/wiki/List_of_%C5%8Cban_Star-Racers_episodes',
 '/wiki/List_of_%C5%8Cban_Star-Racers_episodes',
 '/wiki/List_of_%C5%8Cban_Star-Racers_episodes',
 '/wiki/List_of_%C5%8Cban_Star-Racers_episodes',
 '/wiki/List_of_%C5%8Cban_Star-Racers_episodes',
 '/wiki/List_of_%C5%8Ckami_Kakushi_episodes',
 '/wiki/List_of_%C5%8Ckami_Kakushi_episodes',
 '/wiki/List_of_%C5%8Ckami_Kakushi_episodes',
 '/wiki/List_of_%C5%8Ckami_Kakushi_episodes',
 '/wiki/List_of_%C5%8Ckami_Kakushi_episodes',
 '/wiki/List_of_%C5%8Ckami_Kakushi_episodes',
 '/wiki/List_of_.hack//Legend_of_the_Twilight_episodes',
 '/wiki/List_of_.hack//Legend_of_the_Twilight_episodes',
 '/wiki/List_of_.hack//Legend_of_the_Twilight_episodes',
 '/wiki/List_of_.hack//Legend_of_the_Twilight_episodes',
 '/wiki/List_of_.hack//Legend_of_the_Twilight_episodes',
 '/wiki/List_of_.hack//Legend_of_the_Twilight_episodes',
 '/wiki/List_of_.hack//Sign_episodes']

#### Get rid of the duplicates

In [12]:
anime_links = list(set(anime_links))

In [13]:
len(anime_links)

842

#### 842 is not much, but realistically speaking, we won't be missing out on the ones we don't have. Anyway, we'll just proceed with the ones we got so far. If something comes up, well, #TODO:

In [14]:
sorted(anime_links)[:10]

['/wiki/List_of_%C5%8Cban_Star-Racers_episodes',
 '/wiki/List_of_%C5%8Ckami_Kakushi_episodes',
 '/wiki/List_of_.hack//Legend_of_the_Twilight_episodes',
 '/wiki/List_of_.hack//Sign_episodes',
 '/wiki/List_of_07-Ghost_episodes',
 '/wiki/List_of_3000_Leagues_in_Search_of_Mother_episodes',
 '/wiki/List_of_AKB0048_episodes',
 '/wiki/List_of_A_Certain_Magical_Index_episodes',
 '/wiki/List_of_A_Certain_Scientific_Railgun_episodes',
 '/wiki/List_of_A_Channel_episodes']

Next, we need to get the `tables` from each of these anime episode lists. These tables contain data, season-wise. Some have episode names for OVAs and specials too, so that will be useful as well.

In [None]:
all_anime_tables = list()
total = len(anime_links)
i = 0
for link in anime_links:
    i = i + 1
    status, response = http.request('https://en.wikipedia.org' + link)
    tables_for_current_link = list()
    
    # Check if each table we find, has the wikitable class. The wikitable class is used for episode tables.
    for table in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('table')):
        try:
            if  "wikitable" in str(table['class']):
                tables_for_current_link.append(table)
        except:
            print("Something went wrong with https://en.wikipedia.org%s" % link)
    print("%d/%d, Found %d tables for https://en.wikipedia.org%s" % (i, total, len(tables_for_current_link), link))
    all_anime_tables.append(tables_for_current_link)

1/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_Basquash!_episodes
2/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Nekopara_episodes
3/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Maria_Holic_episodes
4/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_Tamayura_episodes
5/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Nadia:_The_Secret_of_Blue_Water_episodes
6/842, Found 5 tables for https://en.wikipedia.org/wiki/List_of_The_Devil_Is_a_Part-Timer!_episodes
7/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Mobile_Suit_Gundam_Unicorn_episodes
8/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Tokyo_Majin_episodes
9/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Panty_%26_Stocking_with_Garterbelt_episodes
10/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_Kyo_Kara_Maoh!_episodes
11/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Black_Rock_Shoot

91/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Steins;Gate_episodes
92/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Psychic_Detective_Yakumo_episodes
93/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_The_Earl_and_the_Fairy_episodes
94/842, Found 3 tables for https://en.wikipedia.org/wiki/List_of_A_Channel_episodes
95/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_Steel_Angel_Kurumi_episodes
96/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Linebarrels_of_Iron_episodes
97/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Eureka_Seven_episodes
98/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Crush_Gear_Turbo_episodes
99/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Rewrite_episodes
100/842, Found 9 tables for https://en.wikipedia.org/wiki/List_of_Hidamari_Sketch_episodes
101/842, Found 3 tables for https://en.wikipedia.org/wiki/List_of_Cyborg_009_media
102/842, Foun

181/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Heroic_Age_episodes
182/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_Magi:_The_Labyrinth_of_Magic_episodes
183/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Maple_Town_episodes
184/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Monkey_Magic_TV_episodes
185/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Hyperdimension_Neptunia:_The_Animation_episodes
186/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_S-CRY-ed_episodes
187/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Starship_Operators_episodes
188/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_Kimagure_Orange_Road_episodes
189/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_March_Comes_in_Like_a_Lion_episodes
190/842, Found 13 tables for https://en.wikipedia.org/wiki/List_of_Yu-Gi-Oh!_episodes
191/842, Found 1 tables for https://en.wikipedia.org/wiki

269/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_Star-Myu_episodes
270/842, Found 3 tables for https://en.wikipedia.org/wiki/List_of_Hajime_no_Ippo_episodes
271/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Astro_Boy_(1980_TV_series)_episodes
272/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Samurai_7_episodes
273/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Ceres,_Celestial_Legend_episodes
274/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Full_Metal_Panic!_Invisible_Victory_episodes
275/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Assassins_Pride_episodes
276/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Shaman_King_episodes
277/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Tribe_Cool_Crew_episodes
278/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Casshern_Sins_episodes
279/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Princ

358/842, Found 3 tables for https://en.wikipedia.org/wiki/List_of_Yumeiro_Patissiere_episodes
359/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Kino%27s_Journey_episodes
360/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Rescueman_episodes
361/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_%C5%8Ckami_Kakushi_episodes
362/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Maburaho_episodes
363/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Nichijou_episodes
364/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Fate/stay_night:_Unlimited_Blade_Works_episodes
365/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_The_World_God_Only_Knows_episodes
366/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_Yu-Gi-Oh!_Arc-V_episodes
367/842, Found 3 tables for https://en.wikipedia.org/wiki/List_of_Yuri_on_Ice_episodes
368/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Blue_Drop

448/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Cobra_the_Animation_episodes
449/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Elfen_Lied_episodes
450/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Saikano_episodes
451/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Pandora_Hearts_episodes
452/842, Found 3 tables for https://en.wikipedia.org/wiki/List_of_Durarara!!%C3%972_episodes
453/842, Found 5 tables for https://en.wikipedia.org/wiki/List_of_Ikki_Tousen_episodes
454/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_A_Little_Snow_Fairy_Sugar_episodes
455/842, Found 3 tables for https://en.wikipedia.org/wiki/List_of_Recently,_My_Sister_is_Unusual_episodes
456/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Majin_Bone_episodes
457/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Modern_Magic_Made_Simple_episodes
458/842, Found 3 tables for https://en.wikipedia.org/wiki/List_of_Buso_Re

537/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Aria_the_Scarlet_Ammo_episodes
538/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Love_Live!_Sunshine!!_episodes
539/842, Found 16 tables for https://en.wikipedia.org/wiki/List_of_Monogatari_episodes
540/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Viewtiful_Joe_episodes
541/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Pani_Poni_Dash!_episodes
542/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Beelzebub_episodes
543/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Angel_Heart_episodes
544/842, Found 3 tables for https://en.wikipedia.org/wiki/List_of_Penguindrum_episodes
545/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_B-Daman_Fireblast_episodes
546/842, Found 5 tables for https://en.wikipedia.org/wiki/List_of_Gokusen_episodes
547/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Chobits_episodes
548/842, Found 7 tabl

624/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_My-Otome_episodes
625/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_Boogiepop_media
626/842, Found 5 tables for https://en.wikipedia.org/wiki/List_of_K-On!_episodes
627/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Robotics;Notes_episodes
628/842, Found 7 tables for https://en.wikipedia.org/wiki/List_of_Minami-ke_episodes
629/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_Seitokai_Yakuindomo_episodes
630/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_Dog_Days_episodes
631/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Future_Boy_Conan_episodes
632/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Angels_of_Death_episodes
633/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Fate/stay_night_episodes
634/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Maoyu_episodes
635/842, Found 6 tables for https://en.wikip

713/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Harukanaru_Toki_no_Naka_de_Hachiy%C5%8D_Sh%C5%8D_episodes
714/842, Found 4 tables for https://en.wikipedia.org/wiki/List_of_Kamisama_Minarai:_Himitsu_no_Cocotama_episodes
715/842, Found 3 tables for https://en.wikipedia.org/wiki/List_of_Kiteretsu_episodes
716/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Welcome_to_the_N.H.K._episodes
717/842, Found 3 tables for https://en.wikipedia.org/wiki/List_of_One-Punch_Man_episodes
718/842, Found 13 tables for https://en.wikipedia.org/wiki/List_of_Dragon_Ball_Z_Kai_episodes
719/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_The_Big_O_episodes
720/842, Found 5 tables for https://en.wikipedia.org/wiki/List_of_A_Certain_Magical_Index_episodes
721/842, Found 5 tables for https://en.wikipedia.org/wiki/List_of_Nisekoi_episodes
722/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Dagashi_Kashi_episodes
723/842, Found 2 tables for https://en.wik

801/842, Found 12 tables for https://en.wikipedia.org/wiki/List_of_Astro_Boy_(1963_TV_series)_episodes
802/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Yu-Gi-Oh!_Sevens_episodes
803/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_The_Promised_Neverland_episodes
804/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Steins;Gate_0_episodes
805/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Penguin_Musume_Heart_episodes
806/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Yumeria_episodes
807/842, Found 5 tables for https://en.wikipedia.org/wiki/List_of_Last_Exile_episodes
808/842, Found 5 tables for https://en.wikipedia.org/wiki/List_of_The_Laughing_Salesman_episodes
809/842, Found 1 tables for https://en.wikipedia.org/wiki/List_of_Hoshizora_e_Kakaru_Hashi_episodes
810/842, Found 2 tables for https://en.wikipedia.org/wiki/List_of_Phantom_Thief_Jeanne_episodes
811/842, Found 1 tables for https://en.wikipedia.org/wiki/Li

I also want a clean list of all the titles so, we'll be using this:

In [None]:
anime_titles = list()
i = 0
for link in tqdm(anime_links):
    print(i, end="\r")
    i = i + 1
    status, response = http.request('https://en.wikipedia.org' + link)
    anime_titles.append(BeautifulSoup(response).title.string[8:].split(' episodes')[0])

In [None]:
len(anime_titles)

#### So now, we have a list of `bs4 Tags`, with each tag containing the table for each anime episode, per anime title. Great.

The next step is extracting the following pieces of info.
1. Episode Number
2. Episode Title

Both of these are available in the `table` element with `class="vevent"`. That's just how wikipedia decided to keep it.

Although there is more information, such as episode summary and air dates, we won't be using them... for now... maybe in the future? #TODO:

So, each `table` has a `list` of `vevents` which contains the info we need. So what we're gonna do now is, extract this for each of the tables for each of the anime...

Esentially, a few layers deep:
- anime -> tables -> vevents

I'm gonna start doing stuff in pandas now, because it's much easier for me to visualize what's going on.

In [None]:
import pandas as pd

In [None]:
anime_dataset = pd.DataFrame(zip(anime_titles, anime_links), columns=["title", "links"])

In [None]:
anime_dataset.head()

So, there is one problem so far. Wikipedia is inconsistent... VERY UNFORTUNATELY. But given that it's accurate for *most* cases. I will let this pass for now. This **should** suffice for now, but once again, #TODO:

In [None]:
# anime_dataset_final = pd.DataFrame(columns=["series", "season|section", "episode_number", "episode_title"])

In [None]:
dataframe_rows = list()

In [None]:
for anime_title, anime in tqdm(zip(anime_titles, all_anime_tables)):
    # print(anime_title, end="\r")
    table_number = 0
    for table in anime:
        table_number += 1
        for row in table.findAll("tr", {"class": "vevent"}):
            # print(row.find("td", {"class": "summary"}).contents)
            episode_number = row.contents[0].contents[0]
            episode_title_english = row.contents[1].contents[0]
            # episode_title_transliteration = row.contents[1].contents[3].contents[0]
            dataframe_rows.append({
                "series": anime_title,
                "seasonORsection": table_number,
                "episode_number": episode_number,
                "episode_title": episode_title_english
            })
            #print("%s %d - %s: %s" % (anime_title, table_number, episode_number, episode_title_english))

In [None]:
anime_dataset_final = pd.DataFrame(dataframe_rows)

In [None]:
anime_dataset_final.to_csv('anime_episodes.csv')

# And it's over...