In [14]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os

# Importing the T5 modules from huggingface/transformers
from transformers import RobertaTokenizer, T5ForConditionalGeneration
import json
from tqdm.notebook import tqdm

In [15]:
# Central place for the settings a reader may want to tune.
model_params = dict(
    MODEL="Salesforce/codet5-base",  # model identifier
    TRAIN_BATCH_SIZE=32,  # batch size used for training
    VALID_BATCH_SIZE=32,  # batch size used for validation
    TRAIN_EPOCHS=10,  # how many training epochs to run
    VAL_EPOCHS=2,  # how many validation epochs to run
    LEARNING_RATE=1e-4,  # optimizer learning rate
    MAX_SOURCE_TEXT_LENGTH=512,  # maximum length of the source text
    MAX_TARGET_TEXT_LENGTH=50,  # maximum length of the target text
    SEED=42,  # seed for reproducibility
)

In [16]:
def load_data(path, tokenizer, start_idx=74000, end_idx=74010, include_description=False):
    """
    Build a (source, target) dataframe from a window of ``data/{path}.jsonl``.

    Each line of the file is one JSON record. Only records whose
    ``language`` field equals ``'python3'`` are used. Every entry of the
    record's ``test_cases`` and ``private_tests`` yields one row whose target
    is that entry followed by ``tokenizer.eos_token``.

    Args:
        path (str): File stem; the file read is ``data/{path}.jsonl``.
        tokenizer: Object exposing ``eos_token`` (and ``sep_token`` when
            ``include_description`` is true).
        start_idx (int): Lines with a 0-based index ``<= start_idx`` are
            skipped without being parsed.
        end_idx (int): Reading stops after the first line whose index is
            ``> end_idx``. That line is still processed before stopping,
            so the window is ``start_idx < idx <= end_idx + 1``.
        include_description (bool): When true the source is
            ``description + sep_token + solutions``; otherwise it is the
            ``solutions`` field alone.

    Returns:
        pandas.DataFrame: Columns ``source`` and ``target``, one row per
        test case.
    """
    sources = []
    targets = []

    with open(f'data/{path}.jsonl', encoding="utf-8") as f:
        for idx, line in enumerate(f):
            if idx > start_idx:
                line = line.strip()
                # Tolerate blank lines (e.g. a trailing newline) inside the window.
                if line:
                    obj = json.loads(line)
                    if obj['language'] == 'python3':
                        if include_description:
                            source = obj['description'] + tokenizer.sep_token + obj['solutions']
                        else:
                            source = obj['solutions']

                        # Public test cases first, then private ones.
                        for key in ('test_cases', 'private_tests'):
                            for case in obj[key]:
                                sources.append(source)
                                targets.append(case + tokenizer.eos_token)
            # Checked after processing, so the line at end_idx + 1 is included.
            if idx > end_idx:
                break

    return pd.DataFrame({'source': sources, 'target': targets})

In [17]:
class YourDataSetClass(Dataset):
    """
    Dataset that tokenizes (source, target) text pairs from a dataframe so
    they can be fed to a DataLoader for fine-tuning.

    Each item is a dict of fixed-length ``torch.long`` tensors:
    ``source_ids``, ``source_mask``, ``target_ids`` and ``target_ids_y``
    (the last two hold the same target token ids).
    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        """
        Initializes a Dataset class

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            source_len (int): Max length of source text
            target_len (int): Max length of target text
            source_text (str): column name of source text
            target_text (str): column name of target text
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """returns the length of dataframe"""

        return len(self.target_text)

    def _encode(self, text, max_length):
        """Collapse whitespace in ``text`` and tokenize it to exactly ``max_length`` tokens."""
        # str() guards against non-string cells; split/join collapses every
        # whitespace run (including newlines) into a single space.
        text = " ".join(str(text).split())
        # padding="max_length" already pads to max_length, so the deprecated
        # pad_to_max_length flag is not passed.
        return self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """return the input ids, attention masks and target ids"""

        # Positional lookup: DataLoader samplers produce positions 0..len-1,
        # which need not match the dataframe's index labels.
        source = self._encode(self.source_text.iloc[index], self.source_len)
        target = self._encode(self.target_text.iloc[index], self.summ_len)

        # Drop only the batch dimension so a length-1 sequence stays 1-D.
        source_ids = source["input_ids"].squeeze(0)
        source_mask = source["attention_mask"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }

In [18]:
def inference(model, data, tokenizer, device,
              max_source_length=512, max_target_length=150, num_beams=2):
    """
    Generate text for a single input string with beam search.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        data (str): Raw input text; whitespace runs are collapsed to single
            spaces before tokenization.
        tokenizer: Tokenizer exposing ``batch_encode_plus`` and ``decode``.
        device: Device the input tensors are moved to.
        max_source_length (int): Length the input is truncated/padded to.
        max_target_length (int): ``max_length`` passed to ``generate``.
        num_beams (int): Number of beams passed to ``generate``.

    Returns:
        list[str]: One decoded string (special tokens removed) per generated
        sequence.
    """
    model.eval()

    # Collapse all whitespace runs (including newlines) into single spaces.
    source_text = " ".join(data.split())

    # padding="max_length" already pads to max_length, so the deprecated
    # pad_to_max_length flag is not passed.
    source = tokenizer.batch_encode_plus(
        [source_text],
        max_length=max_source_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    source_ids = source["input_ids"].to(device, dtype=torch.long)
    source_mask = source["attention_mask"].to(device, dtype=torch.long)

    generated_ids = model.generate(
        input_ids=source_ids,
        attention_mask=source_mask,
        max_length=max_target_length,
        num_beams=num_beams,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )

    preds = [
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for g in generated_ids
    ]
    return preds

In [19]:
# Use the first CUDA device when one is present; otherwise fall back to the
# CPU so that model.to(device) does not fail on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE(review): the tokenizer and the model weights are read from different
# directories (model_files0 vs model_files1) — confirm this pairing is intended.
tokenizer = RobertaTokenizer.from_pretrained('outputs/model_files0')
model = T5ForConditionalGeneration.from_pretrained('outputs/model_files1')

model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [20]:
# Read the (source, target) pairs from data/seq_data2.jsonl and display them.
df = load_data(path='seq_data2', tokenizer=tokenizer)
df

Unnamed: 0,source,target
0,You are given a binary string of length n (i. ...,"input: ""3\n8 5\n11011010\n7 9\n1111100\n7 11\n..."
1,You are given a binary string of length n (i. ...,"input: ""2\n8 5\n11011010\n7 9\n1111100\n""\nout..."
2,You are given a binary string of length n (i. ...,"input: ""1\n2 1\n00\n""\noutput: ""00\n""\n</s>"
3,You are given a binary string of length n (i. ...,"input: ""3\n8 5\n11011010\n7 9\n1111100\n7 11\n..."
4,You are given a binary string of length n (i. ...,"input: ""2\n8 5\n11011010\n7 9\n1111100\n""\nout..."
5,You are given a binary string of length n (i. ...,"input: ""1\n2 1\n00\n""\noutput: ""00\n""\n</s>"
6,You are given a binary string of length n (i. ...,"input: ""3\n8 5\n11011010\n7 9\n1111100\n7 11\n..."
7,You are given a binary string of length n (i. ...,"input: ""2\n8 5\n11011010\n7 9\n1111100\n""\nout..."
8,You are given a binary string of length n (i. ...,"input: ""1\n2 1\n00\n""\noutput: ""00\n""\n</s>"
9,You are given a binary string of length n (i. ...,"input: ""3\n8 5\n11011010\n7 9\n1111100\n7 11\n..."


In [21]:
# Display the first target string (a test case followed by the EOS token).
df['target'][0]

'input: "3\\n8 5\\n11011010\\n7 9\\n1111100\\n7 11\\n1111100\\n"\noutput: "01011110\\n0101111\\n0011111\\n"\n</s>'

In [22]:
# Pick the source text of the row labelled 3 as the running example.
data = df.loc[3, 'source']
print(data)


You are given a binary string of length n (i. e. a string consisting of n characters '0' and '1').

In one move you can swap two adjacent characters of the string. What is the lexicographically minimum possible string you can obtain from the given one if you can perform no more than k moves? It is possible that you do not perform any moves at all.

Note that you can swap the same pair of adjacent characters with indices i and i+1 arbitrary (possibly, zero) number of times. Each such swap is considered a separate move.

You have to answer q independent test cases.

Input

The first line of the input contains one integer q (1 ≤ q ≤ 10^4) — the number of test cases.

The first line of the test case contains two integers n and k (1 ≤ n ≤ 10^6, 1 ≤ k ≤ n^2) — the length of the string and the number of moves you can perform.

The second line of the test case contains one string consisting of n characters '0' and '1'.

It is guaranteed that the sum of n over all test cases does not exceed 10^

In [23]:
# Target string of the row labelled 0.
df.loc[0, 'target']

'input: "3\\n8 5\\n11011010\\n7 9\\n1111100\\n7 11\\n1111100\\n"\noutput: "01011110\\n0101111\\n0011111\\n"\n</s>'

In [24]:
# Encode `data` (truncated/padded to 512 tokens) and decode it again to see
# which part of the text survives tokenization.
# padding="max_length" already pads to max_length, so the deprecated
# pad_to_max_length flag is not passed.
s = tokenizer.batch_encode_plus(
    [data],
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)['input_ids']
m = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in s]
print(m[0])

You are given a binary string of length n (i. e. a string consisting of n characters '0' and '1').

In one move you can swap two adjacent characters of the string. What is the lexicographically minimum possible string you can obtain from the given one if you can perform no more than k moves? It is possible that you do not perform any moves at all.

Note that you can swap the same pair of adjacent characters with indices i and i+1 arbitrary (possibly, zero) number of times. Each such swap is considered a separate move.

You have to answer q independent test cases.

Input

The first line of the input contains one integer q (1 ≤ q ≤ 10^4) — the number of test cases.

The first line of the test case contains two integers n and k (1 ≤ n ≤ 10^6, 1 ≤ k ≤ n^2) — the length of the string and the number of moves you can perform.

The second line of the test case contains one string consisting of n characters '0' and '1'.

It is guaranteed that the sum of n over all test cases does not exceed 10^

In [25]:
# `data` contains the separator only when the source was built as
# description + sep_token + solution. When it is absent, print the whole text
# instead of raising IndexError on the missing second piece.
parts = data.split(tokenizer.sep_token)
print(parts[1] if len(parts) > 1 else parts[0])

t = int(input())
for _ in range(t):
    n, k = map(int, input().split())
    s = list(input().strip())
    last0 = -1
    for i in range(n):
        if s[i] == '0':
            s[i] = '1'
            v = i - last0 - 1
            if k > v:
                k -= v
                s[last0 + 1] = '0'
                last0 += 1
            else:
                s[i - k] = '0'
                break
    print(*s, sep='')



In [26]:
# Run generation on the example text and show the decoded predictions.
io = inference(model=model, data=data, tokenizer=tokenizer, device=device)
io

['']

: 