In [1]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
!pip install seaborn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
import pandas as pd
import random
import csv
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from transformers import  DataCollatorWithPadding
from transformers import get_scheduler
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import Adafactor
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Read the three AviationKG split files and collect their tab-separated
# (subject, relation, object) triples into one list.
triples = []
total = 0  # raw line count across all files, malformed lines included
files = ["test.txt", "train.txt", "valid.txt"]
for f in files:
    with open(f'/workspace/tanu/BTP-2/exp/knowledge infusion/aviationKG/AviationKG_{f}', newline='') as file:
        for line in file:
            total += 1
            fields = line.split("\t")
            if len(fields) == 3:
                subject, relation, obj = (field.strip() for field in fields)
                # The first 13 characters of the (already stripped) subject are dropped.
                # NOTE(review): presumably this removes a fixed-length prefix -- confirm,
                # since some subjects end up empty after the cut (see the d2.head() output).
                triples.append([subject[13:].strip(), relation, obj])
            else:
                # Line does not have exactly three columns: show it and skip it.
                print(fields)
        
        

In [5]:
len(triples)

96686

In [6]:
triples[1]

['r_LAX00LA285', 'type', 'NamedIndividual']

In [7]:
# Shuffle the triples in place; seeding Python's `random` first makes the order repeatable.
random.seed(0)
random.shuffle(triples)

In [8]:
# One row per triple, with the three fields as named columns.
df = pd.DataFrame(triples, columns =['subject', 'relation', 'object'])


In [9]:
len(df)

96686

In [10]:
df.head()

Unnamed: 0,subject,relation,object
0,r_FTW03LA001,type,Accident_Number
1,N366KR,hasEmergencyLocatorTransmitterInstalled,false
2,r_CHI04LA052,unitOfTemperature,degreeCelsius
3,r_LAX02LA036,hasWindSpeed,4
4,r_LAX02LA074,unitOfDewPoint,degreeCelsius


In [11]:
def masked_subject(row):
    """Return the triple as text with the subject replaced by ``<extra_id_0>``.

    ``row`` must support lookup by the keys ``"relation"`` and ``"object"``.
    """
    relation, obj = row["relation"], row["object"]
    return "<extra_id_0> {} {}".format(relation, obj)
def masked_object(row):
    """Return the triple as text with the object replaced by ``<extra_id_0>``.

    ``row`` must support lookup by the keys ``"subject"`` and ``"relation"``.
    """
    subject, relation = row["subject"], row["relation"]
    return "{} {} <extra_id_0>".format(subject, relation)

In [12]:
def form_label(row):
    """Build the target text for a masked value: ``<extra_id_0> <value>.``

    Despite its name, ``row`` is the single value to reveal (the caller maps
    this function over one column), not a whole DataFrame row.
    """
    template = "<extra_id_0> {}."
    return template.format(row)
    

In [13]:
# Target text per row: rows with an odd index use the subject as the label,
# rows with an even index use the object.
df["label"] = np.where(df.index % 2, df["subject"].map(form_label), df["object"].map(form_label))


In [14]:
df.head()

Unnamed: 0,subject,relation,object,label
0,r_FTW03LA001,type,Accident_Number,<extra_id_0> Accident_Number.
1,N366KR,hasEmergencyLocatorTransmitterInstalled,false,<extra_id_0> N366KR.
2,r_CHI04LA052,unitOfTemperature,degreeCelsius,<extra_id_0> degreeCelsius.
3,r_LAX02LA036,hasWindSpeed,4,<extra_id_0> r_LAX02LA036.
4,r_LAX02LA074,unitOfDewPoint,degreeCelsius,<extra_id_0> degreeCelsius.


In [15]:
# Model input per row, using the same parity rule as the label column:
# odd index -> subject masked, even index -> object masked.
# Both apply() calls run over every row; np.where then keeps one result per row.
df["input"] = np.where(df.index % 2, df.apply(masked_subject, axis = 1),df.apply(masked_object, axis = 1))


In [16]:
df.head()

Unnamed: 0,subject,relation,object,label,input
0,r_FTW03LA001,type,Accident_Number,<extra_id_0> Accident_Number.,r_FTW03LA001 type <extra_id_0>
1,N366KR,hasEmergencyLocatorTransmitterInstalled,false,<extra_id_0> N366KR.,<extra_id_0> hasEmergencyLocatorTransmitterIns...
2,r_CHI04LA052,unitOfTemperature,degreeCelsius,<extra_id_0> degreeCelsius.,r_CHI04LA052 unitOfTemperature <extra_id_0>
3,r_LAX02LA036,hasWindSpeed,4,<extra_id_0> r_LAX02LA036.,<extra_id_0> hasWindSpeed 4
4,r_LAX02LA074,unitOfDewPoint,degreeCelsius,<extra_id_0> degreeCelsius.,r_LAX02LA074 unitOfDewPoint <extra_id_0>


In [17]:
df.relation.unique()

array(['type', 'hasEmergencyLocatorTransmitterInstalled',
       'unitOfTemperature', 'hasWindSpeed', 'unitOfDewPoint',
       'IsCausedBy', 'isCausedByEnvironmentIssue', 'hasPrecipitation',
       'hasObservationFacility', 'hasRatedPower', 'unitOfVisibility',
       'hasConditionsAtAccidentSite', 'hasOperator', 'hasPilot',
       'unitOfDirectionFromAccidentSite', 'unitOfRatedPower',
       'hasSecondPilotPresent', 'hasAircraftFire', 'unitOfElevation',
       'hasAirworthinessCertificate', 'IsCausedBecause',
       'hasAircraftExplosion', 'hasSerialNumber', 'hasTemperature',
       'wasToxicologyPerformed', 'unitOfGusts', 'occurredAtCountry',
       'hasSecondEvent', 'hasMedicalCertification', 'hasRegisteredOwner',
       'hasLowestCloudCondition', 'unitOfDistanceFromAccidentSite',
       'isCausedByPersonnelIssue', 'isCauseddueto-PersonnelIssue',
       'hasTurbulenceSeverityForecast', 'hasFederalAviationRegulation',
       'hasCausalAgent_FlightCrew', 'hasLowestCeiling', 'isAmateurB

In [18]:
df["relation"]

0                                           type
1        hasEmergencyLocatorTransmitterInstalled
2                              unitOfTemperature
3                                   hasWindSpeed
4                                 unitOfDewPoint
                          ...                   
96681                           hasRestraintUsed
96682                                       type
96683                                       type
96684                    hasLowestCloudCondition
96685                                       type
Name: relation, Length: 96686, dtype: object

In [19]:
# Drop every row whose relation is one of the names listed in `to_rem`.
# The index is not renumbered here, so it has gaps after this filter.
to_rem = ['http://purl.org/dc/elements/1.1/source', 'http://purl.org/dc/elements/1.1/description', 'imports']
df = df[~df.relation.isin(to_rem)]


In [20]:
len(df), df.head(), df.reset_index()


(96625,
         subject                                 relation           object  \
 0  r_FTW03LA001                                     type  Accident_Number   
 1        N366KR  hasEmergencyLocatorTransmitterInstalled            false   
 2  r_CHI04LA052                        unitOfTemperature    degreeCelsius   
 3  r_LAX02LA036                             hasWindSpeed                4   
 4  r_LAX02LA074                           unitOfDewPoint    degreeCelsius   
 
                            label  \
 0  <extra_id_0> Accident_Number.   
 1           <extra_id_0> N366KR.   
 2    <extra_id_0> degreeCelsius.   
 3     <extra_id_0> r_LAX02LA036.   
 4    <extra_id_0> degreeCelsius.   
 
                                                input  
 0                     r_FTW03LA001 type <extra_id_0>  
 1  <extra_id_0> hasEmergencyLocatorTransmitterIns...  
 2        r_CHI04LA052 unitOfTemperature <extra_id_0>  
 3                        <extra_id_0> hasWindSpeed 4  
 4           r_LAX

In [21]:
df.head()

Unnamed: 0,subject,relation,object,label,input
0,r_FTW03LA001,type,Accident_Number,<extra_id_0> Accident_Number.,r_FTW03LA001 type <extra_id_0>
1,N366KR,hasEmergencyLocatorTransmitterInstalled,false,<extra_id_0> N366KR.,<extra_id_0> hasEmergencyLocatorTransmitterIns...
2,r_CHI04LA052,unitOfTemperature,degreeCelsius,<extra_id_0> degreeCelsius.,r_CHI04LA052 unitOfTemperature <extra_id_0>
3,r_LAX02LA036,hasWindSpeed,4,<extra_id_0> r_LAX02LA036.,<extra_id_0> hasWindSpeed 4
4,r_LAX02LA074,unitOfDewPoint,degreeCelsius,<extra_id_0> degreeCelsius.,r_LAX02LA074 unitOfDewPoint <extra_id_0>


In [22]:
len(df)

96625

In [23]:
# Order rows by subject, then by object, and renumber the index from 0.
df =  df.sort_values(by=['subject', 'object'], ascending=True).reset_index(drop = True)


In [24]:
import pickle

# Load the pickled C4 examples; later cells read its 'input' and 'label' columns.
# NOTE: unpickling can execute arbitrary code, so only load a file you created or trust.
# The `with` block closes the file even if pickle.load raises.
with open("c4_masked.pickle", 'rb') as file:
    c4df = pickle.load(file)


In [25]:
# d1: a random subset of the C4 examples with as many rows as the KG frame `df`
#     (fewer if c4df itself is shorter). random_state pins the subset so that a
#     Restart & Run All trains on the same C4 rows every time.
# d2: the KG examples, reduced to the same two columns.
d1 = c4df[['input', 'label']].sample(frac=1, random_state=0).reset_index(drop = True).iloc[:len(df)].copy()
d2 = df[['input', 'label']].copy()


In [26]:
# Stack the C4 rows (d1) on top of the KG rows (d2), then shuffle all rows and
# renumber the index. No random_state is passed, so this cell does not fix the order.
d3 = pd.concat([d1,d2])
d3 = d3.sample(frac=1).reset_index(drop = True)


In [27]:
len(d1), len(d2), len(d3)

(96625, 96625, 193250)

In [28]:
d1.head()

Unnamed: 0,input,label
0,"Back by popular demand, see ""The Closest thing...",<extra_id_0> Milton Theatre.
1,More information on hiking in and around <extr...,<extra_id_0> Vienna.
2,"In the past, <extra_id_0> tourism focused on t...",<extra_id_0> Beijing.
3,This site is for managing the <extra_id_0> Res...,<extra_id_0> “.
4,The district court declined to grant a motion ...,<extra_id_0> Oppenheimer.


In [29]:
d2.head()

Unnamed: 0,input,label
0,<extra_id_0> comment,<extra_id_0> .
1,<extra_id_0> defintion A gust or wind gust is ...,<extra_id_0> .
2,<extra_id_0> IsCausedBecause ABRUPT,<extra_id_0> .
3,IsCausedBecause <extra_id_0>,<extra_id_0> ABRUPT.
4,<extra_id_0> IsCausedBecause ABRUPT,<extra_id_0> .


In [30]:
d3.head()

Unnamed: 0,input,label
0,N515KH hasEmergencyLocatorTransmitterInstalled...,<extra_id_0> false.
1,r_MIA05LA036 occurredAtLatitude <extra_id_0>,<extra_id_0> 30.6375.
2,Last minute prep for <extra_id_0>.,<extra_id_0> Winter Storm Jonas.
3,"<extra_id_0> was born August 7, 1958, to paren...",<extra_id_0> Danny Scott Easterling.
4,r_DEN06MA119 IsCausedBy <extra_id_0>,<extra_id_0> WINGSPAR.


In [31]:
# Train on a copy so later changes to train_data cannot alter d3.
train_data = d3.copy()

In [32]:
# Load the tokenizer and the model from the "t5-large" checkpoint.
tokenizer = T5Tokenizer.from_pretrained("t5-large")
model = T5ForConditionalGeneration.from_pretrained("t5-large")
# Disabled alternatives kept for reference (resume from a local checkpoint / custom config):
# model = T5ForConditionalGeneration.from_pretrained("/workspace/tanu/BTP-2/exp/knowledge infusion/trained models/KGinfusedLM/6")
# config = T5Config(dropout_rate  = 0.1)  # default value only, do we need to set it explicitly
# model = T5ForConditionalGeneration.config(config).from_pretrained("t5-large")



For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [33]:
def tokenize(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The batch is padded to its longest sequence, truncation is enabled, and
    PyTorch tensors are requested. Returns the tokenizer's full output.
    """
    encode_options = dict(return_tensors="pt", padding='longest', truncation=True)
    return tokenizer(text, **encode_options)
def tokenize_target(text):
    """Encode target ``text`` with the same settings and return only ``input_ids``."""
    encoded = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
    return encoded.input_ids

In [34]:
# Adafactor with an explicit learning rate (relative_step disabled).
optimizer = Adafactor(model.parameters(), lr=1e-3, relative_step = False)
# parameters as specified in the paper
# num_epochs = 380
# batch_size = 1024
num_epochs = 25
batch_size = 16
# run_data processes ceil(rows / batch_size) batches per epoch (the final partial
# batch is included), so round up here to count the optimizer steps actually taken.
num_training_steps = num_epochs * int(np.ceil(train_data.shape[0] / batch_size))


In [35]:
import os
# Set the CUDA device-ordering and device-visibility environment variables for this process.
# NOTE(review): torch is imported in an earlier cell; presumably these only take effect
# if CUDA has not been initialised yet -- confirm, or move this cell above the imports.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"


In [36]:
# model = torch.nn.DataParallel(model, device_ids=[0, 1])
# device = torch.device("cuda:1") if torch.cuda.is_available() else torch.device("cpu")
# device

In [37]:
# Use CUDA when torch reports it as available, otherwise the CPU; move the model
# there and display the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [38]:
train_data.head()

Unnamed: 0,input,label
0,N515KH hasEmergencyLocatorTransmitterInstalled...,<extra_id_0> false.
1,r_MIA05LA036 occurredAtLatitude <extra_id_0>,<extra_id_0> 30.6375.
2,Last minute prep for <extra_id_0>.,<extra_id_0> Winter Storm Jonas.
3,"<extra_id_0> was born August 7, 1958, to paren...",<extra_id_0> Danny Scott Easterling.
4,r_DEN06MA119 IsCausedBy <extra_id_0>,<extra_id_0> WINGSPAR.


In [39]:
len(train_data)

193250

In [40]:
def run_data(model, data, batch_size, optimizer, tokenizer, device, eval_mode):
    """Run one pass over ``data`` in mini-batches and return the mean batch loss.

    Args:
        model: Called as ``model(input_ids=..., attention_mask=..., labels=...)``;
            the returned object must expose ``.loss``.
        data: DataFrame with text columns ``'input'`` and ``'label'``.
        batch_size: Rows per batch; the final batch may be smaller.
        optimizer: Stepped and zeroed after every batch unless ``eval_mode`` is true.
        tokenizer: Only ``pad_token_id`` is read here; the text itself is encoded by
            the notebook-level ``tokenize`` / ``tokenize_target`` helpers.
        device: Device the batch tensors are moved to.
        eval_mode: When true, the loss is computed without gradient tracking and
            no backward pass or optimizer step is performed. Switching the model
            between ``train()`` and ``eval()`` is left to the caller.

    Returns:
        Tuple ``(model, optimizer, avg_loss)`` where ``avg_loss`` is the unweighted
        mean of the per-batch losses (0 when ``data`` has no rows).
    """
    n_rows = data.shape[0]
    iters = int(np.ceil(n_rows / batch_size))
    avg_loss = 0
    step = 0
    p_bar = tqdm(total=iters, position=0, leave=True, desc='Running through data')
    for row_idx in range(0, n_rows, batch_size):
        # Positional slicing: works for any index, not only a fresh 0..n-1 one.
        batch = data.iloc[row_idx:row_idx + batch_size]

        tokenized_labels = tokenize_target(batch['label'].tolist())
        tokenized_input = tokenize(batch['input'].tolist())

        input_ids = tokenized_input["input_ids"].to(device)
        attention_mask = tokenized_input["attention_mask"].to(device)
        labels = tokenized_labels.to(device)
        # Replace padding positions in the targets with -100.
        labels[labels == tokenizer.pad_token_id] = -100

        # Track gradients only when training.
        with torch.set_grad_enabled(not eval_mode):
            loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss

        if not eval_mode:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        loss_item = loss.item()
        # Incremental mean over the batches seen so far.
        avg_loss = (avg_loss * step + loss_item) / (step + 1)

        p_bar.set_postfix(avg_loss=avg_loss)
        p_bar.update(1)
        step += 1

    p_bar.close()
    return model, optimizer, avg_loss

In [41]:
torch.cuda.empty_cache()


In [42]:
# Values of the training loop's `epoch` variable at which a checkpoint is written.
to_save_epochs = [1,2,5,10,15,20,25]
# Same schedule with every entry shifted down by one:
# to_save_epochs = [0,1,4,9,14,19,24]

In [None]:
# Train for `num_epochs` passes over the data, recording the mean loss of each
# epoch and saving a checkpoint after the epochs listed in `to_save_epochs`.
train_losses = []
# Count epochs from 1 so that `epoch` is the number of completed epochs: every
# entry of to_save_epochs (including the last, num_epochs) is then reachable, and
# a checkpoint directory named N holds the model after N epochs.
for epoch in range(1, num_epochs + 1):
    # Reshuffle each epoch; drop=True stops the old index being added as a column.
    shuffled_train_data = train_data.sample(frac=1).reset_index(drop=True)
    model.train()
    optimizer.zero_grad()
    model, optimizer, avg_train_loss = run_data(model, shuffled_train_data, batch_size,
                                                optimizer, tokenizer, device, eval_mode=False)
    train_losses.append(avg_train_loss)
    if epoch in to_save_epochs:
        # `from_pt` is a loading option, not a saving one, so it is not passed here.
        model.save_pretrained(f"aviation/trained models/KGinfusedLM/{epoch}")
    print(f'Epoch {epoch}:\tTrain loss: {avg_train_loss}')



Running through data:  26%|▎| 3082/12079 [14:35<43:44,  3.43it/s, avg_loss=1.45]

In [None]:
train_losses