In [1]:
import pandas as pd
import os
import torch
import torch.cuda
from tqdm.autonotebook import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset

  from tqdm.autonotebook import tqdm


In [2]:
# Resolve the project layout from the notebook's working directory: inputs
# and outputs are read from / written to a "data" folder one level above it.
cwd = os.getcwd()
parent_dir = os.path.normpath(os.path.join(cwd, os.pardir))
data_dir = os.path.join(parent_dir, "data")

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [3]:
# Load the comment table plus the tokenizer/model pair; both are fetched from
# the same Hugging Face checkpoint so their vocabularies match.
checkpoint = "Minej/bert-base-personality"
comments_csv = os.path.join(data_dir, "combined_final.csv")

data = pd.read_csv(comments_csv)
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint)
# Move the weights to the device selected in the configuration cell.
model = model.to(device)

In [4]:
# Preview the first rows to sanity-check that the CSV was parsed as expected.
data.head()

Unnamed: 0,comment_id,author,date,comment,video_id,is_reply,parent_id,channel,genre,emoji
0,Ugz1ByKCNN-zcHnFDmt4AaABAg,Yuri Briceño,2023-11-12T19:03:39Z,It 39 s y good,lDK9QqIzhwk,False,,bonjovi,metal,['❤️']
1,UgzEx14TY3TRXzYiY6F4AaABAg,leslie winchester,2023-11-12T18:12:49Z,2009 2011 six string in hock prayer still w...,lDK9QqIzhwk,False,,bonjovi,metal,[]
2,UgyyXemVneI7UpYbunt4AaABAg,Jeff Packham,2023-11-12T14:25:41Z,no talk about club He man woman club Alfalfa ...,lDK9QqIzhwk,False,,bonjovi,metal,"['🦜', '😎', '😂', '😎', '😂', '❤️', '🎉']"
3,UgxnQPStaiTWQqrwtfl4AaABAg,Reto Kaufmann,2023-11-12T13:37:51Z,Rock and metal radio station it quot shog...,lDK9QqIzhwk,False,,bonjovi,metal,[]
4,UgylFha0XC_D4Amrmb94AaABAg,Thomas Schmutter,2023-11-12T10:35:46Z,From Bon Living on a Prayer,lDK9QqIzhwk,False,,bonjovi,metal,['🎉']


In [5]:
# Load the same CSV through Hugging Face `datasets`. The loader's result is
# indexed by split name; only the "train" split is used from here on.
dataset_csv = os.path.join(data_dir, "combined_final.csv")
dataset = load_dataset("csv", data_files=dataset_csv)["train"]

In [6]:
def _tokenize_batch(examples):
    """Tokenize the "comment" column of one batch of dataset rows.

    Truncation and padding are enabled, and the tokenizer is asked for
    NumPy outputs, exactly as in the mapped call below.
    """
    return tokenizer(
        examples["comment"], truncation=True, padding=True, return_tensors="np"
    )


# batched=True makes `map` hand the callback a batch of rows per call
# instead of a single row.
dataset_tokenized = dataset.map(_tokenize_batch, batched=True)

Map: 100%|██████████| 2300677/2300677 [07:39<00:00, 5002.10 examples/s]


In [None]:
# Batched inference over the tokenized dataset.
#
# The whole dataset cannot be unpacked into the model in one call: it is not a
# mapping of tensors, it still carries the non-model CSV columns, and millions
# of rows would not fit in a single forward pass. Instead, walk it in slices,
# keep only the columns the model consumes, and re-pad each slice so every row
# in the batch has the same length.
inference_batch_size = 64  # rows per forward pass; tune to the available memory

model.eval()
batch_logits = []
with torch.no_grad():  # inference only, so skip autograd bookkeeping
    for start in tqdm(
        range(0, len(dataset_tokenized), inference_batch_size),
        desc="Batched personality inference",
    ):
        rows = dataset_tokenized[start : start + inference_batch_size]
        features = {
            name: rows[name] for name in tokenizer.model_input_names if name in rows
        }
        inputs = tokenizer.pad(features, padding=True, return_tensors="pt").to(device)
        outputs = model(**inputs)
        # Move each batch to the CPU right away so logits do not pile up on the GPU.
        batch_logits.append(outputs.logits.cpu())

# One row of logits per dataset row, in dataset order.
predictions = torch.cat(batch_logits).numpy()

In [8]:
def personality_detection(text):
    """Score ``text`` with the global personality ``model``.

    The text is tokenized with the global ``tokenizer`` (truncation and
    padding enabled), moved to ``device`` and passed through ``model``.

    Args:
        text: The text to score; forwarded unchanged to the tokenizer.

    Returns:
        dict: Maps each trait name ("Extroversion", "Neuroticism",
        "Agreeableness", "Conscientiousness", "Openness") to the matching
        entry of the squeezed logits array. The values are the raw model
        logits; no normalisation is applied here.
    """
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(
        device
    )
    # Inference only: without no_grad every call would build an autograd graph
    # that is thrown away immediately, wasting time and memory across the
    # millions of calls made by the apply over the full table.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = outputs.logits.squeeze().cpu().numpy()

    # Order follows the indexing used to read the logits below.
    label_names = [
        "Extroversion",
        "Neuroticism",
        "Agreeableness",
        "Conscientiousness",
        "Openness",
    ]
    result = {name: predictions[i] for i, name in enumerate(label_names)}

    return result

In [9]:
# Smoke-test the scoring helper on one short sentence before the full run.
text_input = "i like dogs"
personality_prediction = personality_detection(text=text_input)
print(personality_prediction)

{'Extroversion': -0.10676388, 'Neuroticism': 0.23157988, 'Agreeableness': -0.34132114, 'Conscientiousness': -0.9508019, 'Openness': 0.019197056}


In [10]:
# Score every comment and store the resulting trait dict in a new column.
# Only the "comment" column is needed, so apply over that Series directly
# instead of materialising a full row object for every record with axis=1.
tqdm.pandas(total=len(data), desc="Computing personality", colour="magenta")
data["personality"] = data["comment"].progress_apply(personality_detection)

Computing personality: 100%|[35m██████████[0m| 2300677/2300677 [5:52:27<00:00, 108.79it/s]  


In [11]:
# Persist the table, including the new "personality" column, as a list of
# JSON records. force_ascii=False keeps non-ASCII characters unescaped.
personality_json_path = os.path.join(data_dir, "combined_personality.json")
data.to_json(personality_json_path, orient="records", indent=4, force_ascii=False)

# NER computation (batched version)

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

In [None]:
# Start again from the original CSV and load the flair NER tagger, moving it
# to the device selected earlier in the notebook.
ner_input_csv = os.path.join(data_dir, "combined_final.csv")
data = pd.read_csv(ner_input_csv)

tagger = SequenceTagger.load("flair/ner-english-ontonotes-large")
tagger = tagger.to(device)

In [None]:
# Wrap every comment in a flair Sentence so the tagger can annotate it.
# Only the "comment" column is needed, so apply over that Series directly
# instead of building a full row object for every record with axis=1.
tqdm.pandas(total=len(data), desc="Sentence changing", colour="yellow")
data["sentence"] = data["comment"].progress_apply(Sentence)

In [None]:
# Run the tagger over all Sentence objects in mini-batches of 1000.
# NOTE(review): the entity-extraction cell later reads the "ner" spans back
# from these same objects, so predict presumably annotates them in place; confirm.
sentences = data["sentence"].tolist()
tagger.predict(sentences, mini_batch_size=1000, verbose=True)

In [None]:
def get_entities(sentence):
    """Collect the "ner" spans of ``sentence`` as plain dictionaries.

    Args:
        sentence: Object exposing ``get_spans("ner")``; each returned span
            must provide ``text`` and ``tag`` attributes.

    Returns:
        list[dict]: One ``{"text": ..., "type": ...}`` entry per span, in the
        order the spans are returned. Empty when there are no spans.
    """
    return [
        {"text": span.text, "type": span.tag}
        for span in sentence.get_spans("ner")
    ]

In [None]:
# Turn the tagged sentences into plain lists of entity dicts.
# Re-register the progress bar so it is labelled for this step rather than
# reusing the label configured for the Sentence conversion, and apply over the
# "sentence" column directly instead of row-wise over the whole frame.
tqdm.pandas(total=len(data), desc="Extracting entities", colour="yellow")
data["ner"] = data["sentence"].progress_apply(get_entities)

In [None]:
# Persist the table with the extracted entities as a list of JSON records.
# The helper "sentence" column holds flair Sentence objects, which are not
# plain JSON values, and the entities already live in "ner", so drop it first.
# errors="ignore" keeps the cell working if the column was already removed.
ner_json_path = os.path.join(data_dir, "combined_ner.json")
data.drop(columns=["sentence"], errors="ignore").to_json(
    ner_json_path,
    orient="records",
    indent=4,
    force_ascii=False,
)