Requirements.txt

torch 2.0.0+cu117
transformers 4.28.1
numpy
typing
tqdm
gc
pandas
collections
sklearn
contextlib
io
IPython
os

In [1]:
!pip install torch transformers -q

In [2]:
!nvidia-smi

Mon Apr 24 00:29:47 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
|  0%   34C    P8    25W / 460W |   4531MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:1C:00.0 Off |                  N/A |
| 27%   33C    P8    15W / 260W |      8MiB / 11264MiB |      0%      Default |
|       

In [4]:
import os

In [5]:
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [6]:
import torch

if torch.cuda.is_available():
    device='cuda'
    print('GPU')
else:
    device='cpu'
    print('CPU')
    
    
SEED = 0
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

GPU


In [7]:
import pandas as pd

path_data_en = "SemEval2018-Task9/training/data/1A.english.training.data.txt"
path_gold_en = "SemEval2018-Task9/training/gold/1A.english.training.gold.txt"

train_data_en_data = pd.read_csv(path_data_en, header=None, sep="\t", names=['term', 'relation'])
train_gold_en_data = pd.read_csv(path_gold_en, header=None, names=['hypernym'])

train_data_en_data.head()

path_test_data_en = "SemEval2018-Task9/test/data/1A.english.test.data.txt"
path_test_gold_en = "SemEval2018-Task9/test/gold/1A.english.test.gold.txt"

test_data_en_data = pd.read_csv(path_test_data_en, header=None, sep="\t", names=['term', 'relation'])
test_gold_en_data = pd.read_csv(path_test_gold_en, header=None, names=['hypernym'])

def standard_preprcessing(train_features, train_target, test_features, test_target):
    
    train_data_en = train_features.copy()
    train_data_en = 'hyperonym: ' + train_data_en.term + ' | hyponyms:'
    print(train_data_en.head())

    train_gold_en = train_target.copy()
    train_gold_en = train_gold_en.hypernym.str.split('\t').str.join(', ')
    print(train_gold_en.head())
    
    test_data_en = test_features.copy()
    test_data_en = 'hyperonym: ' + test_data_en.term + ' | hyponyms:'
    print(test_data_en.head())

    test_gold_en = test_target.copy()
    test_gold_en = test_gold_en.hypernym.str.split('\t').str.join(', ')
    print(test_gold_en.head())
    
    return train_data_en, train_gold_en, test_data_en, test_gold_en


train_data_en, train_gold_en, test_data_en, test_gold_en = standard_preprcessing(train_data_en_data, 
                                                                                 train_gold_en_data, 
                                                                                 test_data_en_data, 
                                                                                 test_gold_en_data)

0          hyperonym: blackfly | hyponyms:
1          hyperonym: Turonian | hyponyms:
2        hyperonym: abhorrence | hyponyms:
3    hyperonym: tropical storm | hyponyms:
4    hyperonym: militarization | hyponyms:
Name: term, dtype: object
0                           homopterous insect, insect
1    technical specification, geologic timescale, p...
2                      distaste, hatred, hate, disgust
3    atmosphere, windstorm, violent storm, air curr...
4                                       social control
Name: hypernym, dtype: object
0    hyperonym: maliciousness | hyponyms:
1          hyperonym: buckler | hyponyms:
2        hyperonym: spelunker | hyponyms:
3     hyperonym: quo warranto | hyponyms:
4     hyperonym: Jeff Francis | hyponyms:
Name: term, dtype: object
0       malevolence, distaste, hatred, hate, malignity
1                                           body armor
2                    exploration, adventurer, explorer
3    proceedings, legal proceedings, proceeding, du..

# Experiment

In [8]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig

In [9]:
model_checkpoint = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, max_length=100, block_size=64)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
from evaluation.data_collat import DataCollatorWithPadding
from evaluation.experiment import Experiment

1. train params/predict params [1, 1]
2. train / params predict [0, 1]
3. params predict [0, 0]

## Change only parmeters in generating with no training

In [11]:
check_param = {'GenArgs': {'num_beams': [2, 10]}}

freezed_param = {'TrainArgs': {'num_train_epochs':1, 'per_device_train_batch_size':16, 'save_steps':1}, 
                 'GenArgs': {'max_length': 3, 'early_stopping': True}, 
                 'SelectStrategy': None, 'PredForm': None}


In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


path_data = '/home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt'

experiment1 = Experiment(output_dir='/home/jovyan/work',
                         check_param=check_param, freezed_param=freezed_param,
                         model=model, tokenizer=tokenizer, device=device,
                         data_collator=data_collator,
                         data_train=train_data_en.tolist(), target_train=train_gold_en.tolist(), 
                         data_test=test_data_en.tolist(), target_test=test_gold_en.tolist(), strategy=[0, 0], 
                         path_to_test=path_data
                        )

In [13]:
experiment1.run_experiment()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:42<00:00, 35.07it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:46<00:00, 32.09it/s]


Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
Meta-Params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
{'GenArgs': {'num_beams': 2}},0.00133,0.00034,0.00133,0.00044,0.00027,0.0002
{'GenArgs': {'num_beams': 10}},0.00133,0.00034,0.00133,0.00044,0.00027,0.0002


## Change only parmeters in generating with training

In [12]:
check_param = {'GenArgs': {'num_beams': [2, 10]}}

freezed_param = {'TrainArgs': {'num_train_epochs':1, 'per_device_train_batch_size':16, 'save_steps':1}, 
                 'GenArgs': {'max_length': 3, 'early_stopping': True}, 
                 'SelectStrategy': None, 'PredForm': None}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


path_data = '/home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt'

experiment1 = Experiment(output_dir='/home/jovyan/work',
                         check_param=check_param, freezed_param=freezed_param,
                         model=model, tokenizer=tokenizer, device=device,
                         data_collator=data_collator,
                         data_train=train_data_en.tolist(), target_train=train_gold_en.tolist(), 
                         data_test=test_data_en.tolist(), target_test=test_gold_en.tolist(),
                         strategy=[0, 1], 
                         path_to_test=path_data
                        )

In [13]:
experiment1.run_experiment()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


100%|███████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:37<00:00, 40.43it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:41<00:00, 36.58it/s]


Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
Meta-Params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
{'GenArgs': {'num_beams': 2}},0.0,0.0,0.0,0.0,0.0,0.0
{'GenArgs': {'num_beams': 10}},0.00067,0.00019,0.00067,0.00022,0.00013,0.00013


## Change generating and training parmeters

In [16]:
check_param = {'TrainArgs': {'num_train_epochs':[1, 2]}, 'GenArgs': {'num_beams': [2, 10]}}

freezed_param = {'TrainArgs': {'per_device_train_batch_size':16, 'save_steps':1}, 
                 'GenArgs': {'max_length': 3, 'early_stopping': True}, 
                 'SelectStrategy': None, 'PredForm': None}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


path_data = '/home/jovyan/work/SemEval2018-Task9/test/gold/1A.english.test.gold.txt'

experiment1 = Experiment(output_dir='/home/jovyan/work',
                         check_param=check_param, freezed_param=freezed_param,
                         model=model, tokenizer=tokenizer, device=device,
                         data_collator=data_collator,
                         data_train=train_data_en.tolist(), target_train=train_gold_en.tolist(), 
                         data_test=test_data_en.tolist(), target_test=test_gold_en.tolist(),
                         strategy=[1, 1], 
                         path_to_test=path_data
                        )

In [17]:
experiment1.run_experiment()



Step,Training Loss


100%|███████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:37<00:00, 40.19it/s]


Step,Training Loss


100%|███████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:41<00:00, 36.44it/s]


Step,Training Loss


100%|███████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:37<00:00, 40.52it/s]


Step,Training Loss


100%|███████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:41<00:00, 36.30it/s]


Unnamed: 0_level_0,MRR,MAP,P@1,P@3,P@5,P@15
Meta-Params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"{'TrainArgs': {'num_train_epochs': 1}, 'GenArgs': {'num_beams': 2}}",0.00133,0.00052,0.00133,0.00044,0.00044,0.00044
"{'TrainArgs': {'num_train_epochs': 1}, 'GenArgs': {'num_beams': 10}}",0.00467,0.00147,0.00467,0.00156,0.00118,0.00109
"{'TrainArgs': {'num_train_epochs': 2}, 'GenArgs': {'num_beams': 2}}",0.00533,0.00168,0.00533,0.00178,0.00137,0.00124
"{'TrainArgs': {'num_train_epochs': 2}, 'GenArgs': {'num_beams': 10}}",0.008,0.00269,0.008,0.00278,0.00219,0.00209


In [18]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Mon Apr 24 00:13:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  Off |
|  0%   34C    P8    25W / 460W |   4531MiB / 24564MiB |      0%      Default |
|                               |            

In [14]:
!rm -rf experiment

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
