In [None]:
# Mount Google Drive at /content/drive so the project files under MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from torch import cuda
# Run on the GPU when PyTorch reports one is available, otherwise on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
import os
import sys
import json
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaConfig
import torch
import csv

In [None]:
# Work from the project folder on Drive.
# NOTE(review): presumably this is what lets the `training` / `models` modules imported in the
# next cell resolve -- confirm they live in this directory.
os.chdir('/content/drive/MyDrive/text_VA_prediction/')

In [None]:
from training import training
from models import RobertaForSequenceClassificationSig

In [None]:
# Hyper-parameters handed to training(); each trailing note records how the value
# was changed during tuning.
params = {
    'batch_size': 32,       # larger batch size
    'lr': 1e-5,             # higher learning rate
    'train_epochs': 30,     # more training epochs
    'weight_decay': 0.001,  # smaller weight decay
    'warmup_ratio': 0.05,   # lower warm-up ratio
}

In [None]:
class MyDataSetNew(Dataset):
    """Torch dataset of texts labelled with valence ('V') and arousal ('A') scores.

    Rows are read from a CSV file whose first column is used as the index and
    which has 'text', 'V' and 'A' columns. Rows that cannot be used -- a text
    that is missing, not a string or empty, or a missing V / A value -- are
    dropped at load time (with a warning) instead of failing later inside the
    tokenizer or feeding NaN labels to the loss.

    Args:
        filename: Path of the CSV file to load.
        maxlen: Maximum number of tokens per example; longer texts are truncated.
        tokenizer: Optional callable used to encode a text. When omitted, the
            'roberta-base' ``RobertaTokenizer`` is loaded. Passing one in lets
            several splits share a single tokenizer instance.

    Attributes:
        index: CSV index labels of the rows that were kept, in order, so that
            positional predictions can be matched back to the original rows.
        texts, valence, arousal: Parallel lists holding the kept rows.
    """

    def __init__(self, filename, maxlen, tokenizer=None):
        if tokenizer is None:
            # NOTE(review): `truncation` / `do_lower_case` are kept exactly as before;
            # truncation is requested per call in __getitem__, so these loader
            # kwargs look redundant -- confirm before removing them.
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)
        self.tokenizer = tokenizer

        df = pd.read_csv(filename, index_col=0, dtype={'index':np.int32,'text':str,'V':np.float64, 'A':np.float64})

        # An empty CSV cell is read back as NaN even with dtype=str, so the text
        # column can still hold non-string values that the tokenizer would reject.
        has_text = np.array([isinstance(text, str) and text != '' for text in df['text']], dtype=bool)
        has_labels = df[['V', 'A']].notna().all(axis=1).to_numpy()
        keep = has_text & has_labels

        dropped = int((~keep).sum())
        if dropped:
            import warnings
            warnings.warn(
                f"{filename}: dropped {dropped} row(s) with missing/non-string text or missing V/A labels."
            )
        df = df[keep]

        self.index = df.index.to_list()
        self.texts = df['text'].to_list()
        self.valence = df['V'].to_list()
        self.arousal = df['A'].to_list()
        self.maxlen = maxlen

    def __getitem__(self, idx):
        """Return the encoded text and its [valence, arousal] label tensor for row ``idx``."""
        # No padding here: each example keeps its own length.
        encoded = self.tokenizer(self.texts[idx], max_length=self.maxlen, truncation=True, padding=False)
        return {
            'input_ids': torch.tensor(encoded['input_ids']),
            'attention_mask': torch.tensor(encoded['attention_mask']),
            # Regression targets; float32 is stated explicitly rather than left to the default dtype.
            'labels': torch.tensor([self.valence[idx], self.arousal[idx]], dtype=torch.float32),
        }

    def __len__(self):
        """Number of usable rows."""
        return len(self.texts)

In [None]:
# Load the training corpus for inspection; the first CSV column becomes the index.
tt = pd.read_csv(
    '/content/drive/MyDrive/text_VA_prediction/corpus/train_data.csv',
    index_col=0,
    dtype={'index': np.int32, 'text': str, 'V': np.float64, 'A': np.float64},
)

In [None]:
# Peek at one training text.
# NOTE(review): the frame was loaded with index_col=0, so this presumably looks up the
# index label 58 rather than the 59th row -- confirm which one is intended.
tt['text'][58]

'afk weekend wet weather camping skill'

In [None]:
def filter_text_column(dataframe, column_name):
    """Return the rows of ``dataframe`` whose ``column_name`` value is a ``str``.

    Rows holding anything else in that column are left out of the result. The
    input frame is only indexed, never modified.
    """
    is_text = dataframe[column_name].apply(lambda value: isinstance(value, str))
    return dataframe[is_text]

# Keep only the training rows whose 'text' is a string (rebinds `tt` to the filtered frame).
tt = filter_text_column(tt, 'text')

In [None]:
import pandas as pd

def check_text_column(dataframe, column_name):
    """Print a short validity report for one text column of ``dataframe``.

    Two conditions are checked on ``dataframe[column_name]``: every entry is a
    ``str``, and no entry equals the empty string. A single "all valid" line
    is printed when both hold; otherwise the index labels of the offending
    rows are printed, one line per kind of problem. Nothing is returned.
    """
    column = dataframe[column_name]
    is_string = column.apply(lambda value: isinstance(value, str))
    is_empty = column == ""

    all_strings = is_string.all()
    empty_strings = is_empty.any()

    if not all_strings:
        non_string_indices = dataframe[~is_string].index.tolist()
        print(f"Some entries are not strings at indices: {non_string_indices}")
    elif not empty_strings:
        print("All entries are valid strings and none are empty.")

    # Reported independently of the type check, so both messages can appear.
    if empty_strings:
        empty_indices = dataframe[is_empty].index.tolist()
        print(f"There are empty strings in the column at indices: {empty_indices}")


# Print the validity report for the 'text' column of the filtered training frame.
check_text_column(tt, 'text')

All entries are valid strings and none are empty.


In [None]:
# Pull the text column and both label columns out of the training frame as plain Python lists.
texts, valence, arousal = (tt[column].to_list() for column in ('text', 'V', 'A'))

In [None]:
# Show the first five training texts as a quick sanity check.
print(texts[0:5])

['anyone knows anything history knows great social changes impossible without feminine upheaval social progress measured exactly social position fair sex ugly ones included person', 'nathan spencer offer help', 'press sees hope mecca talks', 'saw lot kids young men dads structure couldnt read beginning realize growing old crack dealer either die go jail', 'holy geez thank much birthday wishes bit overwhelming amazing definitely excellent birthweek many well wishing humbling moments beautiful everything']


In [None]:
# Build one dataset per split from the corpus CSVs; each text is tokenised with max_length 200.
filename_1 = '/content/drive/MyDrive/text_VA_prediction/corpus/train_data.csv'
filename_2 = '/content/drive/MyDrive/text_VA_prediction/corpus/test_data.csv'
train_split, test_split = (
    MyDataSetNew(filename=path, maxlen=200) for path in (filename_1, filename_2)
)
# Both splits travel together as a two-element list: [train, test].
dataset = [train_split, test_split]

In [None]:
# Directory passed to training() as its last argument.
# NOTE(review): presumably where predictions_fold.csv (read further down) gets written -- confirm in training.py.
preds_dir = '/content/drive/MyDrive/text_VA_prediction/'

In [None]:
# Encoder dimensions for the model; every setting not listed here keeps RobertaConfig's default.
config = RobertaConfig(
    num_hidden_layers=12,
    num_attention_heads=12,
    hidden_size=768,
    intermediate_size=3072,
)

In [None]:
# Start from the pretrained 'roberta-base' checkpoint instead of random weights.
# Constructing the model from a bare RobertaConfig only defines the architecture,
# so fine-tuning would otherwise begin from an untrained encoder.
# num_labels=2 matches the two regression targets (valence, arousal).
# NOTE(review): the guard keeps the original construction as a fallback in case
# the project class does not expose `from_pretrained` -- confirm that it
# subclasses a Hugging Face PreTrainedModel and drop the fallback if so.
if hasattr(RobertaForSequenceClassificationSig, 'from_pretrained'):
    model = RobertaForSequenceClassificationSig.from_pretrained('roberta-base', num_labels=2)
else:
    model = RobertaForSequenceClassificationSig(config)

In [None]:
# Move the model to the selected device. The call's return value is the cell's last
# expression, which is why the module structure is printed below.
model.to(device)

RobertaForSequenceClassificationSig(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
      

In [None]:
# Run the project's training routine with the model, the hyper-parameters, the
# [train, test] splits and the output directory.
training(model, params, dataset, preds_dir)

  super().__init__(*args, **kwargs)


Starting


Epoch,Training Loss,Validation Loss,Mse Valence,Mae Valence,Pearson Corr Valence,Mse Arousal,Mae Arousal,Pearson Corr Arousal
1,No log,0.020192,0.013492,0.084581,0.091613,0.026892,0.119479,0.020397
2,0.021400,0.021838,0.014287,0.091406,0.138323,0.02939,0.132726,0.059116
3,0.021400,0.019967,0.013201,0.084966,0.164313,0.026733,0.118758,0.0666
4,0.021400,0.021552,0.013807,0.086344,0.254729,0.029296,0.121801,0.098762
5,0.020700,0.020514,0.014428,0.093803,0.156126,0.026601,0.116553,0.084977
6,0.020700,0.024775,0.022434,0.122088,0.33101,0.027117,0.11704,0.098217
7,0.019700,0.019561,0.012551,0.081138,0.303711,0.02657,0.116993,0.056865
8,0.018700,0.020228,0.012515,0.081799,0.318001,0.02794,0.127328,0.160898
9,0.018700,0.019295,0.011803,0.081087,0.361104,0.026787,0.119446,0.171797
10,0.016900,0.020035,0.012485,0.087059,0.408628,0.027585,0.124097,0.2068


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.013492089696228504
mae_valence : 0.08458060026168823
pearson_corr_valence : 0.09161323803462637
mse_arousal : 0.02689233422279358
mae_arousal : 0.11947925388813019
pearson_corr_arousal : 0.020397407517912473

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.01428732369095087
mae_valence : 0.09140591323375702
pearson_corr_valence : 0.1383233830581898
mse_arousal : 0.0293896347284317
mae_arousal : 0.13272590935230255
pearson_corr_arousal : 0.05911638568624153

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.013201245106756687
mae_valence : 0.08496572822332382
pearson_corr_valence : 0.16431305040281932
mse_arousal : 0.02673332765698433
mae_arousal : 0.11875835061073303
pearson_corr_arousal : 0.06659990369382054

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.013807382434606552
mae_valence : 0.08634378761053085
pearson_corr_valence : 0.25472926331602935
mse_arousal : 0.029296495020389557
mae_arousal : 0.1218007281422615
pearson_corr_arousal : 0.09876167271176585

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.014428210444748402
mae_valence : 0.09380269050598145
pearson_corr_valence : 0.15612607711452597
mse_arousal : 0.026600724086165428
mae_arousal : 0.11655295640230179
pearson_corr_arousal : 0.08497726577865837

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.022433539852499962
mae_valence : 0.12208805233240128
pearson_corr_valence : 0.3310103209346614
mse_arousal : 0.027116848155856133
mae_arousal : 0.11704045534133911
pearson_corr_arousal : 0.09821682113493407

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.01255115307867527
mae_valence : 0.08113761991262436
pearson_corr_valence : 0.30371140256123147
mse_arousal : 0.02656995877623558
mae_arousal : 0.11699285358190536
pearson_corr_arousal : 0.05686488828562334

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.012515271082520485
mae_valence : 0.08179906010627747
pearson_corr_valence : 0.3180007883401464
mse_arousal : 0.027940019965171814
mae_arousal : 0.12732844054698944
pearson_corr_arousal : 0.1608980424974115

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.011802571825683117
mae_valence : 0.08108656853437424
pearson_corr_valence : 0.3611038766766822
mse_arousal : 0.026786653324961662
mae_arousal : 0.11944616585969925
pearson_corr_arousal : 0.17179721668654296

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.012485476210713387
mae_valence : 0.08705908805131912
pearson_corr_valence : 0.4086281580115401
mse_arousal : 0.027585001662373543
mae_arousal : 0.12409680336713791
pearson_corr_arousal : 0.2068003825120339

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.02229892462491989
mae_valence : 0.12191703170537949
pearson_corr_valence : 0.35902453355654995
mse_arousal : 0.03056349791586399
mae_arousal : 0.1268523633480072
pearson_corr_arousal : 0.19177451298484174

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.013499858789145947
mae_valence : 0.08653783053159714
pearson_corr_valence : 0.3410589949741346
mse_arousal : 0.0375247448682785
mae_arousal : 0.15855547785758972
pearson_corr_arousal : 0.10166263114678828

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.021863557398319244
mae_valence : 0.12229538708925247
pearson_corr_valence : 0.3653233444791357
mse_arousal : 0.04081911966204643
mae_arousal : 0.15164317190647125
pearson_corr_arousal : 0.18591767836772075

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.013052568770945072
mae_valence : 0.0863780528306961
pearson_corr_valence : 0.29968136209648133
mse_arousal : 0.036333534866571426
mae_arousal : 0.14029979705810547
pearson_corr_arousal : 0.16626133440156116

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.012684517540037632
mae_valence : 0.08210466057062149
pearson_corr_valence : 0.2976341864064764
mse_arousal : 0.033867932856082916
mae_arousal : 0.14960184693336487
pearson_corr_arousal : 0.04507008001936668

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.013349766843020916
mae_valence : 0.08788128197193146
pearson_corr_valence : 0.4399753555898542
mse_arousal : 0.0324086919426918
mae_arousal : 0.13304460048675537
pearson_corr_arousal : 0.21658965003846092

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.013734708540141582
mae_valence : 0.09338600188493729
pearson_corr_valence : 0.43379947703079275
mse_arousal : 0.028772447258234024
mae_arousal : 0.12245041131973267
pearson_corr_arousal : 0.22156058882907415

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.01126082893460989
mae_valence : 0.07986131310462952
pearson_corr_valence : 0.44587007748578766
mse_arousal : 0.03953887149691582
mae_arousal : 0.1515253484249115
pearson_corr_arousal : 0.20575687228823475

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.011670650914311409
mae_valence : 0.08187258988618851
pearson_corr_valence : 0.4644204967550869
mse_arousal : 0.029764806851744652
mae_arousal : 0.12461522966623306
pearson_corr_arousal : 0.20555069693839464

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.017900267615914345
mae_valence : 0.1081952303647995
pearson_corr_valence : 0.40997076962366924
mse_arousal : 0.02940770983695984
mae_arousal : 0.12509217858314514
pearson_corr_arousal : 0.2094322572761969

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.011246819980442524
mae_valence : 0.08014149218797684
pearson_corr_valence : 0.4569054410633541
mse_arousal : 0.03337525576353073
mae_arousal : 0.13351362943649292
pearson_corr_arousal : 0.235961620576659

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.011136366985738277
mae_valence : 0.07955586910247803
pearson_corr_valence : 0.4774146626083538
mse_arousal : 0.030082188546657562
mae_arousal : 0.12565423548221588
pearson_corr_arousal : 0.22827396832803132

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.011535671539604664
mae_valence : 0.08203590661287308
pearson_corr_valence : 0.47173128094868716
mse_arousal : 0.028828173875808716
mae_arousal : 0.12231690436601639
pearson_corr_arousal : 0.2374175575952164

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.011803604662418365
mae_valence : 0.08256431668996811
pearson_corr_valence : 0.4753794264326758
mse_arousal : 0.033008649945259094
mae_arousal : 0.1326301246881485
pearson_corr_arousal : 0.22761392111400858

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.01145237684249878
mae_valence : 0.08079052716493607
pearson_corr_valence : 0.48046461405474666
mse_arousal : 0.03032853826880455
mae_arousal : 0.12541362643241882
pearson_corr_arousal : 0.23423520072503798

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.011229688301682472
mae_valence : 0.08028991520404816
pearson_corr_valence : 0.4796537912469809
mse_arousal : 0.03218824788928032
mae_arousal : 0.13066792488098145
pearson_corr_arousal : 0.23137403065503384

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.011552000418305397
mae_valence : 0.08133192360401154
pearson_corr_valence : 0.4811484167888864
mse_arousal : 0.031895577907562256
mae_arousal : 0.1295439898967743
pearson_corr_arousal : 0.23335839202810937

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.01162726990878582
mae_valence : 0.08172158896923065
pearson_corr_valence : 0.48119593159813934
mse_arousal : 0.032158538699150085
mae_arousal : 0.13024358451366425
pearson_corr_arousal : 0.23486739467993742

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.011554197408258915
mae_valence : 0.08132441341876984
pearson_corr_valence : 0.48278904316825544
mse_arousal : 0.031738393008708954
mae_arousal : 0.1291808933019638
pearson_corr_arousal : 0.2336696118681315

 



Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.011460631154477596
mae_valence : 0.0809115543961525
pearson_corr_valence : 0.48401119783874486
mse_arousal : 0.03197309002280235
mae_arousal : 0.1296696811914444
pearson_corr_arousal : 0.2334028821168222

 



Could not locate the best model at /content/drive/MyDrive/text_VA_prediction/checkpoint-2889/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.



 

mse_valence : 0.011460631154477596
mae_valence : 0.0809115543961525
pearson_corr_valence : 0.48401119524241665
mse_arousal : 0.03197309002280235
mae_arousal : 0.1296696811914444
pearson_corr_arousal : 0.23340288532358971

 



In [None]:
 predictions = pd.read_csv('/content/drive/MyDrive/text_VA_prediction/predictions_fold.csv')

In [None]:
# Display the loaded predictions frame.
predictions

Unnamed: 0.1,Unnamed: 0,0,1
0,0,0.518497,0.447691
1,1,0.526253,0.390543
2,2,0.503622,0.392994
3,3,0.578620,0.453966
4,4,0.567622,0.474958
...,...,...,...
2562,2562,0.588261,0.465130
2563,2563,0.506692,0.355478
2564,2564,0.509176,0.425577
2565,2565,0.513565,0.411342
