In [None]:
!pip install torch

In [None]:
!pip install transformers datasets evaluate accelerate

In [None]:
#!pip install ipywidgets==7.7.1
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
# The training cells below set push_to_hub=True.
notebook_login()

In [None]:
from datasets import load_dataset

#Use csv dataset

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# %cd /content/drive/My Drive/Colab Notebooks/

In [None]:
import pandas as pd
import numpy as np

np.random.seed(4)

df = pd.read_csv('/kaggle/input/d/phualihjang/oversampling-mbti-df/mbti_JP_oversampling.csv')

# Shuffle once with a fixed seed, then cut at the 90% mark.
# Positional slicing replaces np.split(DataFrame, ...), which goes through the
# deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas.
df_shuffled = df.sample(frac=1, random_state=35)
n_train = int(0.9 * len(df))
df_train = df_shuffled.iloc[:n_train]
df_test = df_shuffled.iloc[n_train:]

# Every row must land in exactly one of the two splits.
assert len(df_train) + len(df_test) == len(df)
print(len(df_train), len(df_test))

# NOTE(review): the file name suggests the data was oversampled before this
# split; if so, duplicated rows may appear in both train and test — confirm.

# Keep only the label column and the text column.
df_train_1 = df_train.filter(['JP','post'], axis=1)
df_test_1 = df_test.filter(['JP','post'], axis=1)


df_train_1.to_csv("/kaggle/working/train_kaggle_DF.csv")
df_test_1.to_csv("/kaggle/working/test_kaggle_DF.csv")

In [None]:
# Peek at the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Peek at the first five rows of the two-column training frame.
df_train_1.head(n=5)

In [None]:
# Read the two CSV splits written to /kaggle/working back in as "train" / "test".
jp_csv_files = {
    'train': '/kaggle/working/train_kaggle_DF.csv',
    'test': '/kaggle/working/test_kaggle_DF.csv',
}
dataset = load_dataset('csv', data_files=jp_csv_files)

In [None]:
# Keep only the label and text columns, renamed to "labels" and "text".
dataset = (
    dataset
    .select_columns(['JP', 'post'])
    .rename_column("JP", "labels")
    .rename_column("post", "text")
)

# Show one training example as a sanity check.
dataset["train"][1]

In [None]:
from transformers import AutoTokenizer

# Tokenizer belonging to the ERNIE 2.0 (English) checkpoint.
ernie_tokenizer_checkpoint = "nghuyong/ernie-2.0-en"
tokenizer = AutoTokenizer.from_pretrained(ernie_tokenizer_checkpoint)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of posts for sequence classification.

    Args:
        examples: Batch mapping supplied by ``map(..., batched=True)``; only
            its ``"text"`` entry is read.

    Returns:
        The output of the module-level ``tokenizer`` for the texts, truncated
        to at most 512 tokens and left unpadded.
    """
    # No padding here: the DataCollatorWithPadding passed to the Trainer pads
    # each batch to its own longest sequence, instead of padding every row of
    # the dataset to 512 tokens up front.
    return tokenizer(examples["text"], truncation=True, max_length=512)

In [None]:
# Tokenize both splits, passing examples to preprocess_function in batches.
tokenized_mbti = dataset.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

In [None]:
import evaluate

# Load the four classification metrics used by compute_metrics.
accuracy, f1, recall, precision = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "recall", "precision")
)

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy, F1, recall and precision for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds the
            per-class scores and is reduced with ``argmax`` over axis 1.

    Returns:
        Flat dict with keys ``"Acc"``, ``"F1"``, ``"Recall"`` and
        ``"Precision"``, each mapped to a scalar.
    """
    scores, labels = eval_pred
    predictions = np.argmax(scores, axis=1)

    # Each metric's compute() returns a one-entry dict (e.g. {"accuracy": 0.9}).
    # Unwrap it so the Trainer receives plain scalars instead of nested dicts,
    # which it cannot log as scalar values.
    return {
        "Acc": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "F1": f1.compute(predictions=predictions, references=labels)["f1"],
        "Recall": recall.compute(predictions=predictions, references=labels)["recall"],
        "Precision": precision.compute(predictions=predictions, references=labels)["precision"],
    }

In [None]:
# Class-id <-> letter mapping for the J/P model; label2id is the inverse of id2label.
id2label = dict(enumerate(("P", "J")))
label2id = {letter: class_id for class_id, letter in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# ERNIE 2.0 (English) with a two-class classification head for J/P.
ernie_model_checkpoint = "nghuyong/ernie-2.0-en"
model = AutoModelForSequenceClassification.from_pretrained(
    ernie_model_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
from transformers import EarlyStoppingCallback

# J/P run: evaluate and checkpoint once per epoch, reload the best checkpoint
# at the end, and push checkpoints to the Hub.
training_args = TrainingArguments(
    output_dir="mbti_model_Ernie_JP",
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# Early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_mbti["train"],
    eval_dataset=tokenized_mbti["test"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()

In [None]:
# Sample sentence used for the inference checks below (note the trailing space).
text = (
    "play nice until they realize their diplomatic skills "
    "are insufficient to accomplish their goals. "
)

In [None]:
from transformers import pipeline

# Run the sample text through a text-classification pipeline.
# NOTE(review): this loads "RichardPhua/mbti_model_for_JP", whereas the training
# run above writes to "mbti_model_Ernie_JP" — confirm the intended checkpoint.
classifier = pipeline(
    task="text-classification",
    model="RichardPhua/mbti_model_for_JP",
)
classifier(text)

In [None]:
from transformers import AutoTokenizer

# Use a dedicated name for the inference tokenizer. Rebinding the global
# `tokenizer` here would silently change the tokenizer that
# preprocess_function and the later EI/SN/TF Trainer cells pick up.
# NOTE(review): the pipeline cell loads "RichardPhua/mbti_model_for_JP" but this
# cell loads "RichardPhua/mbti_model" — confirm which checkpoint is intended.
inference_tokenizer = AutoTokenizer.from_pretrained("RichardPhua/mbti_model")
inputs = inference_tokenizer(text, return_tensors="pt")

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Load the published classifier and score the tokenized sample.
# Gradients are disabled because this is inference only.
model = AutoModelForSequenceClassification.from_pretrained("RichardPhua/mbti_model")
with torch.no_grad():
    model_output = model(**inputs)
    logits = model_output.logits

In [None]:
# Index of the largest logit, translated to its label via the model config.
predicted_class_id = torch.argmax(logits).item()
model.config.id2label[predicted_class_id]

### For the other dichotomies

In [None]:
def write_axis_split(axis, train_frac=0.9, seed=35):
    """Shuffle one dichotomy's CSV, split it into train/test, and write both.

    Reads ``mbti_{axis}_oversampling.csv`` from the Kaggle input directory,
    shuffles it with ``random_state=seed``, cuts it at ``train_frac`` of the
    rows, keeps only the ``axis`` and ``'post'`` columns, and writes
    ``train_kaggle_{axis}.csv`` / ``test_kaggle_{axis}.csv`` to
    ``/kaggle/working``.

    Args:
        axis: Name of the label column and file-name infix ('EI', 'SN', 'TF').
        train_frac: Fraction of rows assigned to the training split.
        seed: ``random_state`` used for the shuffle.

    Returns:
        Tuple ``(train_frame, test_frame)`` of the two-column DataFrames.
    """
    frame = pd.read_csv(f'/kaggle/input/d/phualihjang/oversampling-mbti-df/mbti_{axis}_oversampling.csv')
    shuffled = frame.sample(frac=1, random_state=seed)
    cut = int(train_frac * len(frame))

    # Positional slicing instead of np.split(DataFrame, ...), which goes
    # through the deprecated DataFrame.swapaxes.
    train_frame = shuffled.iloc[:cut].filter([axis, 'post'], axis=1)
    test_frame = shuffled.iloc[cut:].filter([axis, 'post'], axis=1)
    assert len(train_frame) + len(test_frame) == len(frame)

    train_frame.to_csv(f"/kaggle/working/train_kaggle_{axis}.csv")
    test_frame.to_csv(f"/kaggle/working/test_kaggle_{axis}.csv")
    return train_frame, test_frame


# One call per dichotomy. Unlike the copy-pasted version, this does not
# overwrite the notebook-level `df`, `df_train` and `df_test`.
#EI
df_train_EI, df_test_EI = write_axis_split('EI')

#SN
df_train_SN, df_test_SN = write_axis_split('SN')

#TF
df_train_TF, df_test_TF = write_axis_split('TF')

In [None]:
def _load_axis_csvs(axis):
    """Load the train/test CSVs written for one dichotomy from /kaggle/working."""
    csv_files = {
        'train': f'/kaggle/working/train_kaggle_{axis}.csv',
        'test': f'/kaggle/working/test_kaggle_{axis}.csv',
    }
    return load_dataset('csv', data_files=csv_files)


dataset_EI = _load_axis_csvs('EI')
dataset_SN = _load_axis_csvs('SN')
dataset_TF = _load_axis_csvs('TF')

In [None]:
# For each dichotomy keep only the label and text columns, renamed to
# "labels" and "text".

# EI
dataset_EI = (
    dataset_EI
    .select_columns(['EI', 'post'])
    .rename_column("EI", "labels")
    .rename_column("post", "text")
)

# SN
dataset_SN = (
    dataset_SN
    .select_columns(['SN', 'post'])
    .rename_column("SN", "labels")
    .rename_column("post", "text")
)

# TF
dataset_TF = (
    dataset_TF
    .select_columns(['TF', 'post'])
    .rename_column("TF", "labels")
    .rename_column("post", "text")
)

In [None]:
# Tokenize every dichotomy's splits with the shared preprocess_function.
tokenized_mbti_EI = dataset_EI.map(function=preprocess_function, batched=True)
tokenized_mbti_SN = dataset_SN.map(function=preprocess_function, batched=True)
tokenized_mbti_TF = dataset_TF.map(function=preprocess_function, batched=True)

In [None]:
# Class-id <-> letter mappings per dichotomy; each label2id_* is the inverse
# of its id2label_*.
id2label_EI = dict(enumerate(("I", "E")))
label2id_EI = {letter: class_id for class_id, letter in id2label_EI.items()}

id2label_SN = dict(enumerate(("N", "S")))
label2id_SN = {letter: class_id for class_id, letter in id2label_SN.items()}

id2label_TF = dict(enumerate(("F", "T")))
label2id_TF = {letter: class_id for class_id, letter in id2label_TF.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# One fresh ERNIE 2.0 classifier per dichotomy, each with its own label mapping.
ernie_checkpoint = "nghuyong/ernie-2.0-en"

model_EI = AutoModelForSequenceClassification.from_pretrained(
    ernie_checkpoint,
    num_labels=2,
    id2label=id2label_EI,
    label2id=label2id_EI,
)

model_SN = AutoModelForSequenceClassification.from_pretrained(
    ernie_checkpoint,
    num_labels=2,
    id2label=id2label_SN,
    label2id=label2id_SN,
)

model_TF = AutoModelForSequenceClassification.from_pretrained(
    ernie_checkpoint,
    num_labels=2,
    id2label=id2label_TF,
    label2id=label2id_TF,
)

### Model for EI

In [None]:
from transformers import EarlyStoppingCallback

# E/I run: evaluate and checkpoint once per epoch, reload the best checkpoint
# at the end, and push checkpoints to the Hub.
training_args_EI = TrainingArguments(
    output_dir="mbti_model_Ernie_EI",
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# Early stopping with a patience of 3 evaluations.
early_stopping_EI = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    args=training_args_EI,
    model=model_EI,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_mbti_EI["train"],
    eval_dataset=tokenized_mbti_EI["test"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_EI],
)

trainer.train()

### Model for SN

In [None]:
# Imported here so this cell does not depend on an earlier cell's import.
from transformers import EarlyStoppingCallback

# S/N run: same hyperparameters as the J/P and E/I runs. Evaluate and
# checkpoint once per epoch, reload the best checkpoint at the end, and push
# checkpoints to the Hub.
training_args_SN = TrainingArguments(
    output_dir="mbti_model_Ernie_SN",
    learning_rate=1e-5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model_SN,
    args=training_args_SN,
    train_dataset=tokenized_mbti_SN["train"],
    eval_dataset=tokenized_mbti_SN["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Same early-stopping setup as the J/P and E/I runs, which this cell
    # previously lacked.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

### Model for TF

In [None]:
from transformers import EarlyStoppingCallback

# T/F run: same hyperparameters as the other dichotomies. Evaluate and
# checkpoint once per epoch, reload the best checkpoint at the end, and push
# checkpoints to the Hub.
training_args_TF = TrainingArguments(
    output_dir="mbti_model_Ernie_TF",
    learning_rate=1e-5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model_TF,
    args=training_args_TF,
    train_dataset=tokenized_mbti_TF["train"],
    eval_dataset=tokenized_mbti_TF["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # EarlyStoppingCallback was imported above but never passed to the
    # Trainer; wire it in as in the J/P and E/I runs.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()