In [2]:
import pandas as pd 
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


## Create driver


In [3]:
# Category page whose member links are scraped in the cells below.
page_url = 'https://you-zitsu.fandom.com/wiki/Category:Characters'

# ChromeDriverManager().install() supplies the chromedriver location that the
# Service object hands to Chrome.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)

# Fixed pause between starting the browser and navigating to the page.
time.sleep(5)
driver.get(page_url)

In [4]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# The "ACCEPT" element may not be present yet when driver.get() returns, so a
# single find_element() lookup can fail on a fresh run. Poll for it (up to
# 10 seconds) and click as soon as it is clickable; a TimeoutException is
# raised if it never shows up.
# NOTE(review): presumably this is the cookie-consent banner - confirm.
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//div[text()="ACCEPT"]'))
).click()


## Getting the characters from the website


In [5]:
# Collect the visible text of every category-member link on the page into
# one record per link, stored under the 'character' key.
characters = driver.find_elements(By.CLASS_NAME, 'category-page__member-link')
character_list = [{'character': member_link.text} for member_link in characters]

# Remove the row limit so the full frame is rendered as this cell's output.
pd.set_option("display.max_rows", None)
japanese_character_df = pd.DataFrame(character_list)
japanese_character_df

Unnamed: 0,character
0,Airi Sakura
1,Akane Tachibana
2,Akito Miyake
3,Albert Yamada
4,Arisu Sakayanagi
5,Chairman Sakayanagi
6,Chiaki Matsushita
7,Chie Hoshinomiya
8,Chihiro Shiranami
9,Daichi Ishizaki


## Cleaning data


In [6]:
def cleaner(df):
    """Add name-part columns to ``df`` in place and disambiguate shared surnames.

    Reads the ``'character'`` column and writes three columns:

    * ``'character reverse'`` - the whitespace-separated words in reverse order
    * ``'firstname'`` - the text before the first space
    * ``'lastname'`` - the text between the first and the second space

    Having the same last name leads to problems, so for the non-main
    characters who share a last name the ``'lastname'`` value is replaced by a
    placeholder string: ``'None'`` for Manabu Horikita and ``'Impossible'`` for
    Atsuomi Ayanokouji / Ayanokōji.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'character'`` column of strings. Modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``df`` has no ``'character'`` column.
    IndexError
        If a name contains no space, because there is no second word to use
        as the last name.
    """
    names = df['character']
    df['character reverse'] = names.apply(lambda name: " ".join(reversed(name.split())))
    df['firstname'] = names.apply(lambda name: name.split(' ')[0])
    df['lastname'] = names.apply(lambda name: name.split(' ')[1])

    is_manabu_horikita = (df['firstname'] == 'Manabu') & (df['lastname'] == 'Horikita')
    is_atsuomi_ayanokouji = (
        (df['firstname'] == 'Atsuomi')
        & df['lastname'].isin(['Ayanokouji', 'Ayanokōji'])
    )

    # A single .loc call writes into df itself. The chained form
    # df['lastname'][rows] = value assigns through a temporary Series and is
    # silently dropped when pandas Copy-on-Write is active.
    df.loc[is_manabu_horikita, 'lastname'] = 'None'
    df.loc[is_atsuomi_ayanokouji, 'lastname'] = 'Impossible'

In [9]:

# Drop every scraped row whose text contains "Category". The original row
# labels are kept (no reset_index), which the overrides below rely on.
pattern = "Category"
filterMask = japanese_character_df['character'].str.contains(pattern)
japanese_character_df = japanese_character_df[~filterMask].copy()

# Manual name corrections, keyed by row label:
# - Tsukishiro needs to have firstname Tokinari
# - Professor Ayanokouji needs to have firstname Atsuomi
# - Sae has a different last name in our translated books
# NOTE(review): these labels are positions in the scraped list and will point
# at other characters if the wiki page changes - confirm before re-running.
name_overrides = {
    77: 'Tokinari Tsukishiro',
    51: 'Atsuomi Ayanokōji',
    58: 'Sae Chiyabashira',
}

# .loc with an unknown label would append a new row instead of failing, so
# check the labels first.
missing_labels = [label for label in name_overrides if label not in japanese_character_df.index]
if missing_labels:
    raise KeyError(f"override rows not found in japanese_character_df: {missing_labels}")

# .loc writes into the frame itself; the chained form df['character'][label] = ...
# is silently dropped when pandas Copy-on-Write is active.
for label, corrected_name in name_overrides.items():
    japanese_character_df.loc[label, 'character'] = corrected_name

# Romanized spelling of every name: macron vowels become doubled vowels.
romanized_names = (
    japanese_character_df['character']
    .str.replace('ō', 'ou', regex=False)
    .str.replace('ū', 'uu', regex=False)
)

character_df = pd.DataFrame({'character': romanized_names})

cleaner(japanese_character_df)
cleaner(character_df)

# Both frames share the romanized spelling as their display name.
character_df['display name'] = character_df['character']
japanese_character_df['display name'] = romanized_names
character_df

Unnamed: 0,character,character reverse,firstname,lastname,display name
0,Airi Sakura,Sakura Airi,Airi,Sakura,Airi Sakura
1,Akane Tachibana,Tachibana Akane,Akane,Tachibana,Akane Tachibana
2,Akito Miyake,Miyake Akito,Akito,Miyake,Akito Miyake
3,Albert Yamada,Yamada Albert,Albert,Yamada,Albert Yamada
4,Arisu Sakayanagi,Sakayanagi Arisu,Arisu,Sakayanagi,Arisu Sakayanagi
5,Chairman Sakayanagi,Sakayanagi Chairman,Chairman,Sakayanagi,Chairman Sakayanagi
6,Chiaki Matsushita,Matsushita Chiaki,Chiaki,Matsushita,Chiaki Matsushita
7,Chie Hoshinomiya,Hoshinomiya Chie,Chie,Hoshinomiya,Chie Hoshinomiya
8,Chihiro Shiranami,Shiranami Chihiro,Chihiro,Shiranami,Chihiro Shiranami
9,Daichi Ishizaki,Ishizaki Daichi,Daichi,Ishizaki,Daichi Ishizaki


## Saving to csv


In [10]:
# Write each frame to its own CSV file in the working directory, using the
# default to_csv settings.
for csv_name, frame in (
    ('characters.csv', character_df),
    ('japanese characters.csv', japanese_character_df),
):
    frame.to_csv(csv_name)