In [5]:
import pandas as pd 
import torch
import numpy as np
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from torch import nn
from sklearn.metrics import classification_report
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
import evaluate
from datasets import Dataset


## 1. Data prepration 

In [7]:
model_name = "distilbert-base-uncased"


device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [8]:
data_path = "C:/Users/Wen/OneDrive/Documents/GitHub/Portfolio_Project/Deep Learning/Shinobi Naruto Insights -NLP/Web Scraping/jutsu.jsonl"

df = pd.read_json(data_path, lines=True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Avalanche Dance,"Taijutsu, Shurikenjutsu","Haku delivers a string of kicks, punches, and ..."
2,Attack of the Twin Demons,"Kekkei Genkai, Ninjutsu",This technique allows Ukon to inhabit his brot...
3,Bando's Chakra Armour,Ninjutsu,Bando activates his chakra to surround him in ...
4,Ball of Light Technique,Ninjutsu,The user gathers a ball of concentrated light ...


In [9]:
def jutsu_types(jutsu):
    if "Taijutsu" in jutsu:
        return "Taijutsu"
    if "Ninjutsu" in jutsu:
        return "Ninjutsu"
    if "Genjutsu" in jutsu:
        return "Genjutsu"
    
    return None

df["3_types_of_jutsu"] = df["jutsu_type"].apply(jutsu_types)
df["3_types_of_jutsu"].value_counts()

3_types_of_jutsu
Ninjutsu    2009
Taijutsu     628
Genjutsu      79
Name: count, dtype: int64

In [10]:
df.columns

Index(['jutsu_name', 'jutsu_type', 'jutsu_description', '3_types_of_jutsu'], dtype='object')

In [11]:
df['text'] = df['jutsu_name']+ '. '+df['jutsu_description']
df['jutsu'] = df['3_types_of_jutsu']
df= df[['text','jutsu']].dropna()
print(df)

                                                   text     jutsu
0     10 Hit Combo. Lars punches the opponent before...  Taijutsu
1     Avalanche Dance. Haku delivers a string of kic...  Taijutsu
2     Attack of the Twin Demons. This technique allo...  Ninjutsu
3     Bando's Chakra Armour. Bando activates his cha...  Ninjutsu
4     Ball of Light Technique. The user gathers a ba...  Ninjutsu
...                                                 ...       ...
2867  Absolute: Fang Passing Fang. Kiba and Akamaru ...  Taijutsu
2868  16 Hit Combo. A very effective move, Ino uses ...  Taijutsu
2869  1000 Metre Punch. The user focuses a large amo...  Taijutsu
2870  100% Single Punch. Tsunade gathers large amoun...  Taijutsu
2871  100 Metre Punch. A shorter version of the 1000...  Taijutsu

[2716 rows x 2 columns]


### 1.1 Clean Data 

In [12]:
def put_line_breaks(self, text):
    text = text.replace('\n', ' ')
    return text

In [13]:
class Cleaner():
    def __init__(self):
        pass
    
    def remove_html_tags(self, text):
        """Remove HTML tags using BeautifulSoup."""
        cleantext = BeautifulSoup(text, "lxml").text
        return cleantext
    
    def clean(self, text):
        """Clean text by removing HTML tags and standardizing white space."""
        text = self.remove_html_tags(text)
        text = ' '.join(text.split())
        return text


In [14]:
cleaned_data = Cleaner()
df['text_cleaned'] = df["text"].apply(cleaned_data.clean)
print(df['text_cleaned'])

  cleantext = BeautifulSoup(text, "lxml").text


0       10 Hit Combo. Lars punches the opponent before...
1       Avalanche Dance. Haku delivers a string of kic...
2       Attack of the Twin Demons. This technique allo...
3       Bando's Chakra Armour. Bando activates his cha...
4       Ball of Light Technique. The user gathers a ba...
                              ...                        
2867    Absolute: Fang Passing Fang. Kiba and Akamaru ...
2868    16 Hit Combo. A very effective move, Ino uses ...
2869    1000 Metre Punch. The user focuses a large amo...
2870    100% Single Punch. Tsunade gathers large amoun...
2871    100 Metre Punch. A shorter version of the 1000...
Name: text_cleaned, Length: 2716, dtype: object


### 1.2 Label Encoder

In [15]:
encoder = LabelEncoder()
encoder.fit(df["jutsu"].tolist())
df['label'] = encoder.transform(df["jutsu"].tolist())
df.head()


Unnamed: 0,text,jutsu,text_cleaned,label
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...,2
1,Avalanche Dance. Haku delivers a string of kic...,Taijutsu,Avalanche Dance. Haku delivers a string of kic...,2
2,Attack of the Twin Demons. This technique allo...,Ninjutsu,Attack of the Twin Demons. This technique allo...,1
3,Bando's Chakra Armour. Bando activates his cha...,Ninjutsu,Bando's Chakra Armour. Bando activates his cha...,1
4,Ball of Light Technique. The user gathers a ba...,Ninjutsu,Ball of Light Technique. The user gathers a ba...,1


### 1.3 Class Weights

In [16]:
class_weights = compute_class_weight('balanced',
                     classes=sorted(df['label'].unique().tolist()),
                     y=df['label'].tolist()).tolist()

print(class_weights)

[11.459915611814345, 0.45063879210220675, 1.4416135881104033]


### 1.4 Train Testing Split

In [17]:
df_train,df_test = train_test_split(df,test_size=0.2,stratify=df['label'])


### 1.5 Convert to HuggingFace Datast

In [18]:
train_data = Dataset.from_pandas(df_train)
test_data = Dataset.from_pandas(df_test)

### 1.6 Tokenizer

In [19]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text_cleaned"], truncation=True)

tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_test = test_data.map(preprocess_function, batched=True)





Map:   0%|          | 0/2172 [00:00<?, ? examples/s]

Map:   0%|          | 0/544 [00:00<?, ? examples/s]

## 2. Initialize Model

In [20]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 3. Train Model 

In [21]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [22]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [23]:

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy = "epoch",
    logging_strategy="epoch"
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/1360 [00:00<?, ?it/s]

{'loss': 1.1086, 'grad_norm': 5.426770210266113, 'learning_rate': 0.00016, 'epoch': 1.0}


  0%|          | 0/68 [00:00<?, ?it/s]

{'eval_loss': 1.0859078168869019, 'eval_accuracy': 0.6433823529411765, 'eval_runtime': 85.8532, 'eval_samples_per_second': 6.336, 'eval_steps_per_second': 0.792, 'epoch': 1.0}
{'loss': 1.0699, 'grad_norm': 2.3903634548187256, 'learning_rate': 0.00012, 'epoch': 2.0}


  0%|          | 0/68 [00:00<?, ?it/s]

{'eval_loss': 1.0219625234603882, 'eval_accuracy': 0.5091911764705882, 'eval_runtime': 68.82, 'eval_samples_per_second': 7.905, 'eval_steps_per_second': 0.988, 'epoch': 2.0}
{'loss': 1.0212, 'grad_norm': 9.947807312011719, 'learning_rate': 8e-05, 'epoch': 3.0}


  0%|          | 0/68 [00:00<?, ?it/s]

{'eval_loss': 1.2136396169662476, 'eval_accuracy': 0.23161764705882354, 'eval_runtime': 72.7804, 'eval_samples_per_second': 7.475, 'eval_steps_per_second': 0.934, 'epoch': 3.0}
{'loss': 1.1019, 'grad_norm': 12.017338752746582, 'learning_rate': 4e-05, 'epoch': 4.0}


  0%|          | 0/68 [00:00<?, ?it/s]

{'eval_loss': 1.0760619640350342, 'eval_accuracy': 0.7591911764705882, 'eval_runtime': 78.1538, 'eval_samples_per_second': 6.961, 'eval_steps_per_second': 0.87, 'epoch': 4.0}
{'loss': 0.9576, 'grad_norm': 7.100297451019287, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/68 [00:00<?, ?it/s]

{'eval_loss': 0.982421875, 'eval_accuracy': 0.7591911764705882, 'eval_runtime': 102.9114, 'eval_samples_per_second': 5.286, 'eval_steps_per_second': 0.661, 'epoch': 5.0}
{'train_runtime': 5717.3386, 'train_samples_per_second': 1.899, 'train_steps_per_second': 0.238, 'train_loss': 1.0518485798555262, 'epoch': 5.0}


TrainOutput(global_step=1360, training_loss=1.0518485798555262, metrics={'train_runtime': 5717.3386, 'train_samples_per_second': 1.899, 'train_steps_per_second': 0.238, 'train_loss': 1.0518485798555262, 'epoch': 5.0})

In [24]:
trainer.save_model('jutsu_model')

## 4. Evaluate Model 

In [25]:
preds = trainer.predict(tokenized_train)
preds = np.argmax(preds[:3][0],axis=1)
actual = df_train['label'].tolist()
print(classification_report(actual,preds))

  0%|          | 0/272 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        63
           1       0.87      0.84      0.85      1607
           2       0.56      0.69      0.62       502

    accuracy                           0.78      2172
   macro avg       0.48      0.51      0.49      2172
weighted avg       0.77      0.78      0.77      2172



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
preds = trainer.predict(tokenized_test)
preds = np.argmax(preds[:3][0],axis=1)
actual = df_test['label'].tolist()
print(classification_report(actual,preds))

  0%|          | 0/68 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.85      0.84      0.84       402
           2       0.52      0.61      0.56       126

    accuracy                           0.76       544
   macro avg       0.46      0.48      0.47       544
weighted avg       0.75      0.76      0.75       544



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
