In [1]:
%%capture
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install transformers pandas numpy scikit-learn datasets

In [2]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import shutil
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from datasets import load_dataset, Dataset
from abc import abstractmethod

In [3]:
class Big5DataLoader():
    """Stateless loaders that return Big Five personality data as pandas DataFrames.

    Every loader is a static method, so it can be called on the class
    (``Big5DataLoader.load_local_data(path)``) or on an instance. Trait columns
    use the full names in ``TRAIT_COLUMNS`` and are stored as ``np.float16``.
    """

    # Canonical trait column names shared by all loaders.
    TRAIT_COLUMNS = ('agreeableness', 'conscientiousness', 'extraversion', 'neuroticism', 'openness')

    # Split names every Hugging Face dataset handled here is expected to expose.
    SPLIT_NAMES = ('train', 'validation', 'test')

    @staticmethod
    def _cast_trait_columns(df):
        """Return ``df`` with every trait column that is present cast to float16."""
        dtypes = {col: np.float16 for col in Big5DataLoader.TRAIT_COLUMNS if col in df.columns}
        # DataFrame.astype builds a new frame, so no chained assignment on a slice.
        return df.astype(dtypes) if dtypes else df

    @staticmethod
    def load_hf_data(path):
        """Load a Hugging Face dataset whose columns already use full trait names.

        Args:
            path: Dataset identifier passed to ``load_dataset``.

        Returns:
            ``(train, validation, test)`` DataFrames with rows containing any
            missing value removed and trait columns cast to float16, or
            ``(None, None, None)`` if ``load_dataset`` raises.
        """
        try:
            hf_dataset = load_dataset(path)
        except Exception as e:
            # Best-effort loader: report the problem and let the caller decide.
            print(f"Error: {str(e)}")
            return None, None, None

        def process_split(data_split):
            df = data_split.to_pandas().dropna().reset_index(drop=True)
            return Big5DataLoader._cast_trait_columns(df)

        hf_train, hf_validation, hf_test = (
            process_split(hf_dataset[name]) for name in Big5DataLoader.SPLIT_NAMES
        )
        return hf_train, hf_validation, hf_test

    @staticmethod
    def load_hf_data_modify(path):
        """Load a Hugging Face dataset that labels traits with single letters.

        Keeps only ``text`` and the ``A``/``C``/``E``/``N``/``O`` columns, renames
        the letters to the full trait names and casts them to float16.

        Args:
            path: Dataset identifier passed to ``load_dataset``.

        Returns:
            ``(train, validation, test)`` DataFrames, or ``(None, None, None)``
            if ``load_dataset`` raises.
        """
        try:
            hf_dataset = load_dataset(path)
        except Exception as e:
            print(f"Error: {str(e)}")
            return None, None, None

        letter_map = {'text': 'text', 'A': 'agreeableness', 'C': 'conscientiousness', 'E': 'extraversion', 'N': 'neuroticism', 'O': 'openness'}

        def process_split(data_split):
            # NaN rows are dropped before column selection, so a missing value
            # in a column that is discarded below still removes the row.
            df = data_split.to_pandas().dropna().reset_index(drop=True)
            kept = [col for col in df.columns if col in letter_map]
            # Rename BEFORE casting: the cast looks for the full trait names,
            # which do not exist until the single-letter columns are renamed.
            df = df[kept].rename(columns=letter_map)
            return Big5DataLoader._cast_trait_columns(df)

        hf_train, hf_validation, hf_test = (
            process_split(hf_dataset[name]) for name in Big5DataLoader.SPLIT_NAMES
        )
        return hf_train, hf_validation, hf_test

    @staticmethod
    def load_local_data(path, scale_min=1, scale_max=7):
        """Load a local Excel/CSV file and rescale trait scores to 0-100.

        Args:
            path: Path to a ``.xlsx`` or ``.csv`` file (extension matched
                case-insensitively).
            scale_min: Lowest value of the source rating scale (default 1).
            scale_max: Highest value of the source rating scale (default 7).

        Returns:
            A DataFrame with rows containing any missing value removed and each
            present trait column mapped linearly from ``[scale_min, scale_max]``
            to ``[0, 100]``, rounded to whole numbers and stored as float16;
            ``None`` if the path does not exist or has an unsupported extension.
        """
        if not os.path.exists(path):
            print(f"Error: {path} does not exist")
            return None

        _, extension = os.path.splitext(path)
        extension = extension.lower()
        if extension == '.xlsx':
            local = pd.read_excel(path)
        elif extension == '.csv':
            local = pd.read_csv(path)
        else:
            print("The file is not an Excel or CSV file")
            return None

        local = local.dropna().reset_index(drop=True)

        for col in Big5DataLoader.TRAIT_COLUMNS:
            if col in local.columns:
                rescaled = (local[col] - scale_min) / (scale_max - scale_min) * 100
                local[col] = rescaled.round(0).astype(np.float16)

        return local

In [4]:
# Load the three source corpora: two Hugging Face datasets (the second needs its
# single-letter trait columns renamed) and one local spreadsheet.
hf_train1, hf_validation1, hf_test1 = Big5DataLoader.load_hf_data('Fatima0923/Automated-Personality-Prediction')
hf_train2, hf_validation2, hf_test2 = Big5DataLoader.load_hf_data_modify("jingjietan/pandora-big5")
local_train = Big5DataLoader.load_local_data('/content/big_5_train.xlsx')

# The loaders report failures by printing a message and returning None. Stop
# here with a clear error instead of failing later on len(None) / pd.concat.
_missing_sources = [
    _name
    for _name, _frame in (
        ("hf_train1", hf_train1), ("hf_validation1", hf_validation1), ("hf_test1", hf_test1),
        ("hf_train2", hf_train2), ("hf_validation2", hf_validation2), ("hf_test2", hf_test2),
        ("local_train", local_train),
    )
    if _frame is None
]
if _missing_sources:
    raise RuntimeError(f"Failed to load: {', '.join(_missing_sources)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/845 [00:00<?, ?B/s]

(…)sonality Datasets - Reddit/train_set.csv:   0%|          | 0.00/4.63M [00:00<?, ?B/s]

(…)ersonality Datasets - Reddit/val_set.csv:   0%|          | 0.00/732k [00:00<?, ?B/s]

(…)rsonality Datasets - Reddit/eval_set.csv:   0%|          | 0.00/663k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16047 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2415 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2415 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/163M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/164M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/81.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1924201 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/481051 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/601314 [00:00<?, ? examples/s]

In [5]:
# Report the row count of every split, one line per source dataset.
for _caption, _frames in (
    ("Dataset 1 size: ", (hf_train1, hf_validation1, hf_test1)),
    ("Dataset 2 size: ", (hf_train2, hf_validation2, hf_test2)),
    ("Dataset 3 size: ", (local_train,)),
):
    print(_caption, *(len(_frame) for _frame in _frames))

def combine_all_datasets(hf_train1, hf_validation1, hf_test1, hf_train2, hf_validation2, hf_test2, local_train):
    """Concatenate the per-source frames into one train, validation and test frame.

    The local data only contributes to the training split. Each result gets a
    fresh 0..n-1 index.

    Returns:
        ``(train, validation, test)`` DataFrames.
    """
    sources_by_split = {
        "train": [hf_train1, hf_train2, local_train],
        "validation": [hf_validation1, hf_validation2],
        "test": [hf_test1, hf_test2],
    }
    merged = {
        split: pd.concat(frames, ignore_index=True)
        for split, frames in sources_by_split.items()
    }
    return merged["train"], merged["validation"], merged["test"]

# Merge all sources, then print sizes, column sets and a preview of each split.
train, validation, test = combine_all_datasets(
    hf_train1, hf_validation1, hf_test1,
    hf_train2, hf_validation2, hf_test2,
    local_train,
)

print("Combined dataset size: ", *(len(_split) for _split in (train, validation, test)))

for _caption, _split in (
    ("Train columns: ", train),
    ("Validation columns: ", validation),
    ("Test columns: ", test),
):
    print(_caption, _split.columns)

for _caption, _split in (
    ("Train head: ", train),
    ("Validation head: ", validation),
    ("Test head: ", test),
):
    print("-"*30)
    print(_caption, _split.head())

Dataset 1 size:  16047 2415 2415
Dataset 2 size:  1924201 481051 601314
Dataset 3 size:  783
Combined dataset size:  1941031 483466 603729
Train columns:  Index(['text', 'agreeableness', 'openness', 'conscientiousness',
       'extraversion', 'neuroticism'],
      dtype='object')
Validation columns:  Index(['text', 'agreeableness', 'openness', 'conscientiousness',
       'extraversion', 'neuroticism'],
      dtype='object')
Test columns:  Index(['text', 'agreeableness', 'openness', 'conscientiousness',
       'extraversion', 'neuroticism'],
      dtype='object')
------------------------------
Train head:                                                  text  agreeableness  openness  \
0  his name was kim kimble originally wow thats s...            9.0      61.0   
1  theyre better than the normal posts on ryugioh...           50.0      85.0   
2  how the fuck does this even happen hi youre cu...           15.0      85.0   
3  it probably does ive learned a lot about mysel...           

In [6]:
# Schema check: each split must contain exactly the text column plus the five
# trait columns (column order is not checked).
expected_columns = ["text", "agreeableness", "openness", "conscientiousness", "extraversion", "neuroticism"]
for _split, _failure_message in (
    (train, "Mismatch in train columns"),
    (validation, "Mismatch in validation columns"),
    (test, "Mismatch in test columns"),
):
    assert set(_split.columns) == set(expected_columns), _failure_message

In [7]:
def chunk_dataset(dataset, chunk_size):
    """Lazily yield consecutive slices of ``dataset`` holding at most ``chunk_size`` rows.

    ``dataset`` must support ``len()`` and ``select(range)``. The last slice is
    shorter when the length is not a multiple of ``chunk_size``.
    """
    full_chunks, leftover = divmod(len(dataset), chunk_size)
    # A trailing partial chunk adds one more slice.
    total_chunks = full_chunks + 1 if leftover > 0 else full_chunks
    for index in range(total_chunks):
        lower = index * chunk_size
        upper = min(lower + chunk_size, len(dataset))
        yield dataset.select(range(lower, upper))

In [8]:
def preprocess_function(batch):
    """Tokenize ``batch['text']`` and attach the five trait scores as ``labels``.

    Uses the notebook-level ``tokenizer`` with max-length padding and
    truncation. ``labels`` stacks the trait columns along the last axis in the
    order agreeableness, openness, conscientiousness, extraversion,
    neuroticism, stored as float16.
    """
    trait_order = ('agreeableness', 'openness', 'conscientiousness', 'extraversion', 'neuroticism')
    encoded = tokenizer(batch['text'], padding="max_length", truncation=True)
    per_trait_scores = [batch[trait] for trait in trait_order]
    encoded['labels'] = np.stack(per_trait_scores, axis=-1).astype(np.float16)
    return encoded

In [9]:
# DistilBERT with a 5-output regression head: one output per Big Five trait.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=5,
    problem_type='regression'
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=40,
    per_device_eval_batch_size=40,
    warmup_steps=500,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    dataloader_num_workers=4,
    logging_dir='./logs',
    logging_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="avg_mse",
    # avg_mse is an error, so lower is better. For a metric name that does not
    # end in "loss" the default is greater_is_better=True, which would restore
    # the checkpoint with the HIGHEST MSE at the end of training.
    greater_is_better=False,
    report_to=['wandb'],
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda




In [10]:
def compute_metrics(eval_pred):
    """Compute regression metrics over the five trait outputs.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; both are reshaped to five
            columns before scoring.

    Returns:
        A dict with ``avg_mse`` and ``avg_mae`` (means of the per-column MSE and
        MAE) and ``r2`` (uniformly averaged R² across columns).
    """
    raw_predictions, raw_labels = eval_pred
    y_pred = raw_predictions.reshape(-1, 5)
    y_true = raw_labels.reshape(-1, 5)

    per_trait_mse = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    per_trait_mae = mean_absolute_error(y_true, y_pred, multioutput="raw_values")

    return {
        "avg_mse": np.mean(per_trait_mse),
        "avg_mae": np.mean(per_trait_mae),
        "r2": r2_score(y_true, y_pred, multioutput="uniform_average"),
    }

In [None]:
train_chunk_size = 100000
validation_chunk_size = 25000

train_dataset = Dataset.from_pandas(train)
validation_dataset = Dataset.from_pandas(validation)

# Every training chunk is evaluated against the same first validation chunk, so
# slice and tokenize it once instead of repeating the work on each iteration.
validation_chunk = next(chunk_dataset(validation_dataset, validation_chunk_size))
validation_chunk = validation_chunk.map(preprocess_function, batched=True)
validation_chunk.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Fixed probe sentence scored after each chunk as a quick sanity check.
text = "I love to go out so much. I love my friends. I am also a student at university who loves life!"
# Must match the label order built in preprocess_function.
trait_names = ["agreeableness", "openness", "conscientiousness", "extraversion", "neuroticism"]

for i, train_chunk in enumerate(chunk_dataset(train_dataset, train_chunk_size)):
    print(f"Processing train chunk {i+1}")

    train_chunk = train_chunk.map(preprocess_function, batched=True)
    train_chunk.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # A new Trainer per chunk keeps training the same `model` object.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_chunk,
        eval_dataset=validation_chunk,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()

    # Probe inference: eval mode disables dropout; the inputs must be tensors on
    # the same device as the model.
    model.eval()
    encoded = tokenizer(text, return_tensors='pt', truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model(**encoded)
        logits = outputs.logits

    # The head is a regression head, so the raw outputs ARE the trait scores;
    # softmax would turn them into a probability distribution.
    scores = logits.squeeze(0).tolist()
    ocean5_scores = dict(zip(trait_names, scores))
    print(ocean5_scores)

    output_dir = f'/content/saved_model_chunk_{i+1}'
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved after processing train chunk {i+1} at {output_dir}")

    # make_archive appends ".zip" to the base name and returns the archive path.
    zip_path = shutil.make_archive(output_dir, 'zip', output_dir)

Processing train chunk 1


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate the model.
# `trainer` is the Trainer left over from the last iteration of the chunked
# training loop, so this scores that Trainer's model on the eval_dataset it was
# constructed with (a validation chunk, not the full validation or test split).
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")