# Load Dataset

In [45]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

In [13]:
# Location of the jutsu records (JSON Lines: one JSON object per line).
data_path = "../data/jutsu.jsonl"

# Parse the file into a DataFrame, then preview the first ten rows.
df = pd.read_json(
    path_or_buf=data_path,
    encoding='utf-8',
    lines=True,
)
df.head(n=10)

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Armour of Sticky Gold,"Hiden, Ninjutsu",Kidōmaru secretes Spider Sticky Gold from his ...
2,Armour of Sand,Ninjutsu,"This technique is the second part of Gaara's ""..."
3,Armour-Eater,Ninjutsu,"The user melts pieces of their armour, creatin..."
4,100% Single Punch,Taijutsu,Tsunade gathers large amounts of chakra in her...
5,100 Metre Punch,Taijutsu,"A shorter version of the 1000 Metre Punch, the..."
6,Art (jutsu),"Kekkei Genkai, Hiden, Ninjutsu",This article is about the jutsu from Naruto Sh...
7,1000 Metre Punch,Taijutsu,The user focuses a large amount of chakra into...
8,Arm of Shukaku,Ninjutsu,Gaara engulfs himself in a sand barrier as Shu...
9,Armageddon Countdown Clock,Ninjutsu,"After performing the hand seal, a special cloc..."


In [14]:
def simplify_jutsu(jutsu):
    """Collapse a raw jutsu-type value into one of three coarse classes.

    The raw value may list several types (e.g. "Hiden, Ninjutsu"). The first
    class name found, checked in the order Genjutsu, Ninjutsu, Taijutsu, wins.

    Args:
        jutsu: Raw jutsu-type string. Non-string values (None, NaN) are
            accepted and treated as "no class".

    Returns:
        "Genjutsu", "Ninjutsu" or "Taijutsu", or None when none of the three
        names occurs in ``jutsu``.
    """
    # A missing cell is not a string and cannot be searched with `in`.
    if not isinstance(jutsu, str):
        return None

    # The tuple order is the priority order when several class names appear.
    for jutsu_class in ("Genjutsu", "Ninjutsu", "Taijutsu"):
        if jutsu_class in jutsu:
            return jutsu_class

    # Explicit "no match" result; callers can drop these rows with dropna().
    return None

In [15]:
# Add a coarse class column: simplify_jutsu is applied to each raw jutsu_type value.
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)
# Preview the first ten rows to check the new column.
df.head(10)

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
1,Armour of Sticky Gold,"Hiden, Ninjutsu",Kidōmaru secretes Spider Sticky Gold from his ...,Ninjutsu
2,Armour of Sand,Ninjutsu,"This technique is the second part of Gaara's ""...",Ninjutsu
3,Armour-Eater,Ninjutsu,"The user melts pieces of their armour, creatin...",Ninjutsu
4,100% Single Punch,Taijutsu,Tsunade gathers large amounts of chakra in her...,Taijutsu
5,100 Metre Punch,Taijutsu,"A shorter version of the 1000 Metre Punch, the...",Taijutsu
6,Art (jutsu),"Kekkei Genkai, Hiden, Ninjutsu",This article is about the jutsu from Naruto Sh...,Ninjutsu
7,1000 Metre Punch,Taijutsu,The user focuses a large amount of chakra into...,Taijutsu
8,Arm of Shukaku,Ninjutsu,Gaara engulfs himself in a sand barrier as Shu...,Ninjutsu
9,Armageddon Countdown Clock,Ninjutsu,"After performing the hand seal, a special cloc...",Ninjutsu


In [16]:
# Count rows per simplified class (value_counts leaves missing values out by default).
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2258
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [17]:
# Build the two modelling columns and keep only them:
#   text  - jutsu name and description joined by "."
#   jutsu - the simplified class
# Rows with a missing text or class are dropped. The work is done as a single
# chain that returns a new frame, rather than calling dropna(inplace=True) on
# a frame derived by column selection.
df = (
    df.assign(
        text=df['jutsu_name'] + "." + df['jutsu_description'],
        jutsu=df['jutsu_type_simplified'],
    )[['text', 'jutsu']]
    .dropna()
)

In [18]:
# Preview the first ten rows of the reduced frame (combined text and class label).
df.head(10)

Unnamed: 0,text,jutsu
0,10 Hit Combo.Lars punches the opponent before ...,Taijutsu
1,Armour of Sticky Gold.Kidōmaru secretes Spider...,Ninjutsu
2,Armour of Sand.This technique is the second pa...,Ninjutsu
3,Armour-Eater.The user melts pieces of their ar...,Ninjutsu
4,100% Single Punch.Tsunade gathers large amount...,Taijutsu
5,100 Metre Punch.A shorter version of the 1000 ...,Taijutsu
6,Art (jutsu).This article is about the jutsu fr...,Ninjutsu
7,1000 Metre Punch.The user focuses a large amou...,Taijutsu
8,Arm of Shukaku.Gaara engulfs himself in a sand...,Ninjutsu
9,Armageddon Countdown Clock.After performing th...,Ninjutsu


The data was scraped from the web, so it may contain leftover HTML tags. We have to clean it.

In [19]:
from bs4 import BeautifulSoup

class Cleaner:
    """Strip HTML markup from a piece of text while keeping paragraph breaks."""

    def put_line_breaks(self, text):
        """Return ``text`` with a newline inserted after every ``</p>`` tag."""
        # Splitting on the closing tag and re-joining with tag + newline is
        # the same as replacing each occurrence in place.
        return "</p>\n".join(text.split("</p>"))

    def remove_html_tags(self, text):
        """Parse ``text`` with BeautifulSoup (lxml parser) and return its text content."""
        soup = BeautifulSoup(text, "lxml")
        return soup.text

    def clean(self, text):
        """Return ``text`` without HTML tags and without surrounding whitespace.

        Line breaks are added after paragraph ends first, so paragraphs stay
        on separate lines once the tags are removed.
        """
        with_breaks = self.put_line_breaks(text)
        return self.remove_html_tags(with_breaks).strip()

In [21]:
# Names of the DataFrame columns that hold the input text and the class label.
text_column_name, label_column_name = "text", "jutsu"

In [24]:
# Clean text: run every value of the text column through Cleaner.clean and
# store the result in a new 'text_cleaned' column (the original text column is kept).
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  clean_text = BeautifulSoup(text, "lxml").text


In [25]:
# Show the frame to compare the original and cleaned text side by side.
df

Unnamed: 0,text,jutsu,text_cleaned
0,10 Hit Combo.Lars punches the opponent before ...,Taijutsu,10 Hit Combo.Lars punches the opponent before ...
1,Armour of Sticky Gold.Kidōmaru secretes Spider...,Ninjutsu,Armour of Sticky Gold.Kidōmaru secretes Spider...
2,Armour of Sand.This technique is the second pa...,Ninjutsu,Armour of Sand.This technique is the second pa...
3,Armour-Eater.The user melts pieces of their ar...,Ninjutsu,Armour-Eater.The user melts pieces of their ar...
4,100% Single Punch.Tsunade gathers large amount...,Taijutsu,100% Single Punch.Tsunade gathers large amount...
...,...,...,...
2920,Wrapping Flame Cat Fire.Transforming into Mata...,Taijutsu,Wrapping Flame Cat Fire.Transforming into Mata...
2921,Wood Release: World of Trees Wall.The user cre...,Ninjutsu,Wood Release: World of Trees Wall.The user cre...
2922,Yachihoko no Kami.Kaguya opens numerous small ...,Ninjutsu,Yachihoko no Kami.Kaguya opens numerous small ...
2923,Yin Healing Wound Destruction.Kabuto analyses ...,Ninjutsu,Yin Healing Wound Destruction.Kabuto analyses ...


In [27]:
# Fit an encoder on the class names so that each class gets an integer id.
le = LabelEncoder()
le.fit(df[label_column_name].tolist())

In [29]:
# Map each integer id produced by the encoder back to its class name.
# `classes_` is read through the encoder's public attribute rather than its
# instance __dict__; the position of a class in that array is its encoded id.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [30]:
# Add the integer-encoded class as the 'label' column.
df['label'] = le.transform(df[label_column_name].tolist())

In [31]:
# Show the frame again to confirm the new 'label' column.
df

Unnamed: 0,text,jutsu,text_cleaned,label
0,10 Hit Combo.Lars punches the opponent before ...,Taijutsu,10 Hit Combo.Lars punches the opponent before ...,2
1,Armour of Sticky Gold.Kidōmaru secretes Spider...,Ninjutsu,Armour of Sticky Gold.Kidōmaru secretes Spider...,1
2,Armour of Sand.This technique is the second pa...,Ninjutsu,Armour of Sand.This technique is the second pa...,1
3,Armour-Eater.The user melts pieces of their ar...,Ninjutsu,Armour-Eater.The user melts pieces of their ar...,1
4,100% Single Punch.Tsunade gathers large amount...,Taijutsu,100% Single Punch.Tsunade gathers large amount...,2
...,...,...,...,...
2920,Wrapping Flame Cat Fire.Transforming into Mata...,Taijutsu,Wrapping Flame Cat Fire.Transforming into Mata...,2
2921,Wood Release: World of Trees Wall.The user cre...,Ninjutsu,Wood Release: World of Trees Wall.The user cre...,1
2922,Yachihoko no Kami.Kaguya opens numerous small ...,Ninjutsu,Yachihoko no Kami.Kaguya opens numerous small ...,1
2923,Yin Healing Wound Destruction.Kabuto analyses ...,Ninjutsu,Yin Healing Wound Destruction.Kabuto analyses ...,1


In [34]:
# Hold out 20% of the rows for evaluation. Stratifying on the encoded label
# keeps the class proportions the same in both splits, and the fixed
# random_state makes the split reproducible across kernel restarts.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

In [35]:
# Class distribution of the training split. The column is looked up through
# label_column_name so this stays in sync with the encoder cells.
df_train[label_column_name].value_counts()

jutsu
Ninjutsu    1806
Taijutsu     318
Genjutsu      81
Name: count, dtype: int64

In [36]:
# Identifier of the pretrained checkpoint; passed to AutoTokenizer.from_pretrained.
model_name = "distilbert/distilbert-base-uncased"

In [38]:
# Load the tokenizer that belongs to model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [39]:
def preprocess_function(tokenizer, examples, text_column='text_cleaned'):
    """Tokenize the text column of one batch of examples.

    Args:
        tokenizer: Callable tokenizer; invoked as
            ``tokenizer(texts, truncation=True)``.
        examples: Mapping from column name to that column's values for the
            batch.
        text_column: Key in ``examples`` holding the text to tokenize.
            Defaults to 'text_cleaned'.

    Returns:
        Whatever ``tokenizer`` returns for the batch.

    Raises:
        KeyError: If ``text_column`` is not a key of ``examples``.
    """
    # truncation=True asks the tokenizer to cut over-long inputs.
    return tokenizer(examples[text_column], truncation=True)

In [46]:
# convert pandas to hugging face dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# tokenize the dataset
tokenized_train = train_dataset.map(lambda examples: preprocess_function(tokenizer, examples), batched=True)
tokenized_test = test_dataset.map(lambda examples: preprocess_function(tokenizer, examples), batched=True)

Map: 100%|██████████| 2205/2205 [00:00<00:00, 10956.92 examples/s]
Map: 100%|██████████| 552/552 [00:00<00:00, 11752.03 examples/s]
