In [33]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os

# Importing the T5 modules from huggingface/transformers
from transformers import RobertaTokenizer, T5ForConditionalGeneration
import json
from tqdm.notebook import tqdm

In [34]:
# Fine-tuning configuration, gathered in one place so every setting is easy to find.
model_params = dict(
    MODEL="Salesforce/codet5-base",  # name of the pretrained checkpoint
    TRAIN_BATCH_SIZE=32,  # batch size used for training
    VALID_BATCH_SIZE=32,  # batch size used for validation
    TRAIN_EPOCHS=10,  # how many epochs to train for
    VAL_EPOCHS=1,  # how many epochs to validate for
    LEARNING_RATE=1e-4,  # learning rate
    MAX_SOURCE_TEXT_LENGTH=512,  # maximum length of the source text
    MAX_TARGET_TEXT_LENGTH=50,  # maximum length of the target text
    SEED=42,  # seed value, fixed for reproducibility
)

In [264]:
def load_data(path, tokenizer, first_idx=74001, last_idx=74011,
              data_dir='data', language='python3'):
    """
    Build a (source, target) DataFrame from a window of a JSON-lines file.

    Each line of ``{data_dir}/{path}.jsonl`` is expected to be a JSON object
    with the keys ``language``, ``description``, ``solutions``,
    ``test_cases`` and ``private_tests``. For every record inside the window
    whose ``language`` matches, one row is produced per entry of
    ``test_cases`` and then per entry of ``private_tests``:

    * ``source`` = description + tokenizer.sep_token + solutions
    * ``target`` = test entry + tokenizer.eos_token

    Args:
        path (str): file name without the ``.jsonl`` extension.
        tokenizer: object exposing ``sep_token`` and ``eos_token`` strings.
        first_idx (int): zero-based index of the first line to consider.
        last_idx (int | None): zero-based index of the last line to consider
            (inclusive); ``None`` reads to the end of the file.
        data_dir (str): directory that contains the file.
        language (str): only records with this ``language`` value are kept.

    Returns:
        pandas.DataFrame: columns ``source`` and ``target`` (possibly empty).

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if a non-blank line inside the window is not
            valid JSON.
        KeyError: if a record inside the window lacks one of the keys above.

    The defaults reproduce the window that used to be hard-coded
    (lines 74001 through 74011 inclusive).
    """
    sources = []
    targets = []

    with open(f'{data_dir}/{path}.jsonl', encoding="utf-8") as f:
        for idx, line in enumerate(f):
            if last_idx is not None and idx > last_idx:
                break
            if idx < first_idx:
                # Lines before the window are never parsed: this avoids
                # decoding tens of thousands of records that are discarded.
                continue

            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) is not a record.
                continue

            obj = json.loads(line)
            if obj['language'] != language:
                continue

            source = obj['description'] + tokenizer.sep_token + obj['solutions']
            # Public tests first, then private tests; each one becomes its
            # own training row sharing the same source text.
            for key in ('test_cases', 'private_tests'):
                for t in obj[key]:
                    sources.append(source)
                    targets.append(t + tokenizer.eos_token)

    return pd.DataFrame({'source': sources, 'target': targets})

In [265]:
# Build the example DataFrame from the 'seq_data2' JSON-lines file.
# NOTE(review): `tokenizer` is only assigned in a later cell of this notebook
# (the one calling RobertaTokenizer.from_pretrained), so on a fresh kernel
# this cell raises NameError unless that cell has been run first.
df=load_data('seq_data2',tokenizer)
df

In [266]:
# Display the DataFrame returned by load_data (columns: source, target).
df

Unnamed: 0,source,target
0,You are given a binary string of length n (i. ...,"input: ""3\n8 5\n11011010\n7 9\n1111100\n7 11\n..."
1,You are given a binary string of length n (i. ...,"input: ""2\n8 5\n11011010\n7 9\n1111100\n""\nout..."
2,You are given a binary string of length n (i. ...,"input: ""1\n2 1\n00\n""\noutput: ""00\n""\n</s>"
3,You are given a binary string of length n (i. ...,"input: ""3\n8 5\n11011010\n7 9\n1111100\n7 11\n..."
4,You are given a binary string of length n (i. ...,"input: ""2\n8 5\n11011010\n7 9\n1111100\n""\nout..."
5,You are given a binary string of length n (i. ...,"input: ""1\n2 1\n00\n""\noutput: ""00\n""\n</s>"
6,You are given a binary string of length n (i. ...,"input: ""3\n8 5\n11011010\n7 9\n1111100\n7 11\n..."
7,You are given a binary string of length n (i. ...,"input: ""2\n8 5\n11011010\n7 9\n1111100\n""\nout..."
8,You are given a binary string of length n (i. ...,"input: ""1\n2 1\n00\n""\noutput: ""00\n""\n</s>"
9,You are given a binary string of length n (i. ...,"input: ""3\n8 5\n11011010\n7 9\n1111100\n7 11\n..."


In [37]:
class YourDataSetClass(Dataset):
    """
    Map-style dataset that tokenizes one (source, target) text pair per item.

    Rows are read from two columns of a pandas DataFrame and encoded on the
    fly with the supplied tokenizer, each padded/truncated to a fixed length
    so that a DataLoader can stack the items into batches.
    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        """
        Initializes a Dataset class

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            source_len (int): Max length of source text
            target_len (int): Max length of target text
            source_text (str): column name of source text
            target_text (str): column name of target text
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """Return the number of rows in the target column."""
        return len(self.target_text)

    def _encode(self, text, max_length):
        """Tokenize one string, padded and truncated to exactly ``max_length``."""
        # Only `padding="max_length"` is passed: the deprecated
        # `pad_to_max_length` flag is redundant next to it.
        return self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """
        Return the encoded pair at position ``index``.

        Args:
            index (int): zero-based position of the row.

        Returns:
            dict: ``source_ids`` and ``source_mask`` (source token ids and
            attention mask), plus ``target_ids`` and ``target_ids_y`` (both
            hold the target token ids), all as ``torch.long`` tensors.
        """
        # Positional lookup: a DataLoader hands out positions 0..len-1, which
        # only coincide with index labels for a default RangeIndex. `.iloc`
        # keeps this working after a split/filter that left gaps in the index.
        source_text = str(self.source_text.iloc[index])
        target_text = str(self.target_text.iloc[index])

        # Collapse every run of whitespace (including newlines) to one space.
        source_text = " ".join(source_text.split())
        target_text = " ".join(target_text.split())

        source = self._encode(source_text, self.source_len)
        target = self._encode(target_text, self.summ_len)

        # Drop only the leading batch dimension of size 1; a bare `.squeeze()`
        # would also collapse a length-1 sequence into a scalar.
        source_ids = source["input_ids"].squeeze(0)
        source_mask = source["attention_mask"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }

In [47]:
def inference(model, data, tokenizer, device, max_source_len=512, max_target_len=150):
    """
    Generate output text for a single input string.

    Args:
        model: sequence-to-sequence model exposing ``eval()`` and ``generate()``.
        data (str): raw source text; runs of whitespace are collapsed to
            single spaces before tokenization.
        tokenizer: tokenizer exposing ``batch_encode_plus()`` and ``decode()``.
        device: torch device (or device string) the input tensors are moved to.
        max_source_len (int): length the encoded input is padded/truncated to.
        max_target_len (int): ``max_length`` passed to ``model.generate``.

    Returns:
        list[str]: one decoded string per generated sequence, with special
        tokens stripped.
    """
    model.eval()

    # Collapse every run of whitespace (including newlines) to one space.
    source_text = " ".join(data.split())

    # Only `padding="max_length"` is passed: the deprecated
    # `pad_to_max_length` flag is redundant next to it.
    source = tokenizer.batch_encode_plus(
        [source_text],
        max_length=max_source_len,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    source_ids = source["input_ids"].to(device, dtype=torch.long)
    source_mask = source["attention_mask"].to(device, dtype=torch.long)

    generated_ids = model.generate(
        input_ids=source_ids,
        attention_mask=source_mask,
        max_length=max_target_len,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )

    preds = [
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for g in generated_ids
    ]
    return preds

In [55]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so that `model.to(device)` does not fail on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Tokenizer and model are both restored from the same local directory.
tokenizer = RobertaTokenizer.from_pretrained('outputs/model_files0')
model = T5ForConditionalGeneration.from_pretrained('outputs/model_files0')

# Last expression of the cell: moves the model to `device` and displays its repr.
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [267]:
# Take the first row's source text (description + separator token + solution,
# as assembled by load_data) as the sample to run inference on.
data=df['source'][0]
print(data)


You are given a binary string of length n (i. e. a string consisting of n characters '0' and '1').

In one move you can swap two adjacent characters of the string. What is the lexicographically minimum possible string you can obtain from the given one if you can perform no more than k moves? It is possible that you do not perform any moves at all.

Note that you can swap the same pair of adjacent characters with indices i and i+1 arbitrary (possibly, zero) number of times. Each such swap is considered a separate move.

You have to answer q independent test cases.

Input

The first line of the input contains one integer q (1 ≤ q ≤ 10^4) — the number of test cases.

The first line of the test case contains two integers n and k (1 ≤ n ≤ 10^6, 1 ≤ k ≤ n^2) — the length of the string and the number of moves you can perform.

The second line of the test case contains one string consisting of n characters '0' and '1'.

It is guaranteed that the sum of n over all test cases does not exceed 10^

In [262]:
# First row's target string: one test case followed by the tokenizer's EOS token.
df['target'][0]

'input: "20 2\\n9 19\\n"\noutput: "82\\n"\n</s>'

In [235]:
# Round-trip check: encode `data` exactly as the model input is encoded
# (padded/truncated to 512 tokens), then decode it again to see what text
# survives the truncation.
# Only `padding="max_length"` is passed: the deprecated `pad_to_max_length`
# flag is redundant next to it.
s = tokenizer.batch_encode_plus(
    [data],
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)['input_ids']
m = [
    tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for g in s
]
print(m[0])

You are given an undirected connected weighted graph with N vertices and M edges that contains neither self-loops nor double edges.
The i-th (1≤i≤M) edge connects vertex a_i and vertex b_i with a distance of c_i.
Here, a self-loop is an edge where a_i = b_i (1≤i≤M), and double edges are two edges where (a_i,b_i)=(a_j,b_j) or (a_i,b_i)=(b_j,a_j) (1≤i<j≤M).
A connected graph is a graph where there is a path between every pair of different vertices.
Find the number of the edges that are not contained in any shortest path between any pair of different vertices.

Constraints

* 2≤N≤100
* N-1≤M≤min(N(N-1)/2,1000)
* 1≤a_i,b_i≤N
* 1≤c_i≤1000
* c_i is an integer.
* The given graph contains neither self-loops nor double edges.
* The given graph is connected.

Input

The input is given from Standard Input in the following format:


N M
a_1 b_1 c_1
a_2 b_2 c_2
:
a_M b_M c_M


Output

Print the number of the edges in the graph that are not contained in any shortest path between any pair of differen

In [164]:
# Print the segment that follows the first separator token in `data`.
# load_data builds the source as description + sep_token + solutions, so this
# is the solution part (assuming the description itself contains no separator).
print(data.split(tokenizer.sep_token)[1])

import sys
from itertools import accumulate

def solve():
    n, m = map(int, input().split())
    w = [[] for i in range(3)]

    for i in range(n):
        wi, ci = map(int, sys.stdin.readline().split())
        wi -= 1
        w[wi].append(ci)

    for i in range(3):
        w[i].sort(reverse=True)

    dp = [0]*(m + 1)
    used = [[0]*2 for i in range(m + 1)]

    s0 = len(w[0])
    s1 = len(w[1])

    if s0 > 0:
        dp[1] = w[0][0]
        used[1] = [1, 0]

    for i in range(2, m + 1):
        if used[i - 1][0] < s0:
            dp[i] = dp[i - 1] + w[0][used[i - 1][0]]
            used[i] = used[i - 1][:]
            used[i][0] += 1
        else:
            dp[i] = dp[i - 1]
            used[i] = used[i - 1][:]

        if used[i - 2][1] < s1 and dp[i] < dp[i - 2] + w[1][used[i - 2][1]]:
            dp[i] = dp[i - 2] + w[1][used[i - 2][1]]
            used[i] = used[i - 2][:]
            used[i][1] += 1

    pf = [0] + list(accumulate(w[2]))

    ans = max(pf[k] + dp[m - 3*k

In [268]:
# Run generation on the selected sample; `inference` returns a list of decoded strings.
# NOTE(review): the name `io` would shadow the standard-library `io` module if
# that module were ever imported in this notebook.
io=inference(model,data,tokenizer,device)
io

['input: "3\\n8 5\\n11011010\\n7 9\\n1111011010\\n7 11\\n111111\\n" output: "010110110\\n0101101']