In [1]:
from pathlib import Path
paths = [str(x) for x in Path('C:/langchain2/wiki').glob('**/*.txt')]
paths

['C:\\langchain2\\wiki\\Chikungunya.txt',
 'C:\\langchain2\\wiki\\dengue.txt',
 'C:\\langchain2\\wiki\\Malaria.txt',
 'C:\\langchain2\\wiki\\Tuberculosis.txt',
 'C:\\langchain2\\wiki\\Typhoid_fever.txt',
 'C:\\langchain2\\wiki\\mobilbert_desase\\merges.txt']

In [2]:
# initialize an empty string to hold all the text
text = ""

# loop over the file paths and read each file into the text string
for path in paths:
    with open(path, 'r', encoding='utf-8') as f:
        text += f.read()


In [3]:
text = text.replace('\n', ' ')

In [4]:
text



In [5]:
print("Length of text: {} characters".format(len(text)))

Length of text: 277998 characters


In [6]:
# split the text into smaller chunks
chunks = [text[i:i+1000] for i in range(0, len(text), 5000)]

In [7]:
num_chunks = len(chunks)
print("Number of chunks created:", num_chunks)

Number of chunks created: 56


In [8]:
from transformers import AutoTokenizer
import torch 

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

  from .autonotebook import tqdm as notebook_tqdm


In [9]:






# create features for each chunk
features = []
for chunk in chunks:
    encoding = tokenizer.encode_plus(
        chunk,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='pt'
    )
    features.append(encoding)

# stack the features
input_ids = torch.cat([f['input_ids'] for f in features], dim=0)
attention_mask = torch.cat([f['attention_mask'] for f in features], dim=0)
token_type_ids = torch.cat([f['token_type_ids'] for f in features], dim=0)


In [10]:
input_ids

tensor([[     0,   5585,  49314,  ...,      1,      1,      1],
        [     0,  21870,  31075,  ...,      1,      1,      1],
        [     0,      6,   9815,  ...,      1,      1,      1],
        ...,
        [     0,      6, 244871,  ...,    712,    233,      2],
        [     0,     83,   8597,  ...,      1,      1,      1],
        [     0,    104,      6,  ...,      1,      1,      1]])

In [11]:
encodings = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,
    'labels': token_type_ids
}

In [12]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    def __getitem__(self,i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [13]:
dataset = Dataset(encodings)

In [14]:
dataset

<__main__.Dataset at 0x2137691c4c0>

In [15]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [16]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x21321803070>

In [17]:
# from transformers import RobertaConfig

In [18]:
# config = RobertaConfig(
#     vocab_size=30_000,
#     max_position_embeddings=514,
#     hidden_size=768,
#     num_attention_heads=12,
#     num_hidden_layers=4,
#     type_vocab_size=1


# )

In [19]:
from transformers import AutoModelForMaskedLM
# from transformers import RobertaForMaskedLM


In [20]:
# model=RobertaForMaskedLM(config=config)

In [21]:
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")

In [35]:
device=torch.device('gpu')
if torch.cuda.is_available():
    print('using gpu')
else:
    print('using cpu')

RuntimeError: Expected one of cpu, cuda, ipu, xpu, mkldnn, opengl, opencl, ideep, hip, ve, fpga, ort, xla, lazy, vulkan, mps, meta, hpu, mtia, privateuseone device type at start of device string: gpu

In [23]:
# device = torch.device('cpu')


In [24]:
model.to(device)

XLMRobertaForMaskedLM(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
  

In [25]:
from transformers import AdamW
from tqdm.auto import tqdm


In [26]:
optim = AdamW(model.parameters(), lr=1e-5)



In [27]:
batch_size = 1
epoch = 4
loop = tqdm(dataloader, leave=True)
for i, batch in enumerate(loop):
    if i % batch_size == 0:
        optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    if (i + 1) % batch_size == 0:
        optim.step()
        loop.set_description(f'Epoch:{epoch}')
        loop.set_description(f'loss:{loss.item()}')

    # Free up GPU memory after every batch
    del input_ids, attention_mask, labels, outputs
    torch.cuda.empty_cache()


loss:21.64414405822754: 100%|██████████| 4/4 [14:45<00:00, 221.49s/it] 


In [28]:
model.save_pretrained('C:/langchain2/roberta')

In [30]:
# Preprocess your specific text using the same tokenizer
text = "On 5 December 2022 the European Medicines Agency approved Qdenga, a live tetravalent attenuated vaccine for adults, adolescents and kids from four years of age, produced by Takeda Pharmaceutical Company"
encoded_text = tokenizer(text, truncation=True, padding=True, return_tensors='pt')

# Classify the text
prediction = model.predict(encoded_text)[0][0]

# Interpret the prediction result
if prediction > 0.5:
    print("The text belongs to the positive class")
else:
    print("The text belongs to the negative class")


AttributeError: 'XLMRobertaForMaskedLM' object has no attribute 'predict'

In [34]:

# Define the input sentences
sentence_a = "Dengue is caused by "
sentence_b = "a bactria"

# Encode the input sentences
encoded_inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt')

# Generate the prediction
model.eval()
with torch.no_grad():
    outputs = model(**encoded_inputs)

# Interpret the prediction
probs = torch.softmax(outputs[0], dim=1)
next_sentence_proba = probs.squeeze()[0].item()

# Print the predicted probability
print("Probability of next sentence:", next_sentence_proba)

RuntimeError: a Tensor with 250002 elements cannot be converted to Scalar