In [1]:
# Manipulation
import numpy as np
import pandas as pd

# Data

In [2]:
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read data and minor preprocessing
df = pd.read_csv("data2.csv")
df = df.drop(columns=["Unnamed: 0", "Repository"]).dropna()
df.head()

Unnamed: 0,Message,Diff
0,Remove Romanian translation (abandoned)\n\nThe...,diff --git a/translations.md b/translations.md...
1,Add link to Real Python on editor,diff --git a/first_steps.md b/first_steps.md\n...
2,Fix typo,diff --git a/README.md b/README.md\nindex 85a9...
3,Remove whitespace to fix header rendering\n\nT...,diff --git a/problem_solving.md b/problem_solv...
4,Fix sentence grammar\n\nThanks to John Thomas.,diff --git a/basics.md b/basics.md\nindex 3729...


In [4]:
df = df[df["Diff"].str.len() < 5000]
df

Unnamed: 0,Message,Diff
0,Remove Romanian translation (abandoned)\n\nThe...,diff --git a/translations.md b/translations.md...
1,Add link to Real Python on editor,diff --git a/first_steps.md b/first_steps.md\n...
2,Fix typo,diff --git a/README.md b/README.md\nindex 85a9...
3,Remove whitespace to fix header rendering\n\nT...,diff --git a/problem_solving.md b/problem_solv...
4,Fix sentence grammar\n\nThanks to John Thomas.,diff --git a/basics.md b/basics.md\nindex 3729...
...,...,...
535,Create a list of written assignments\n\nWritte...,diff --git a/written assignments/a list of wri...
536,Create text_based_calculator\n\nThe first codi...,diff --git a/coding projects/text_based_calcul...
537,Create 1_numbers_in_python\n\nPower point file...,diff --git a/power_points/1_numbers_in_python ...
538,Add files via upload\n\nFirst lesson of the co...,diff --git a/1_numbers_in_python.py b/1_number...


In [5]:
# create dataset objects
data = Dataset.from_pandas(df, preserve_index=False)
data

Dataset({
    features: ['Message', 'Diff'],
    num_rows: 461
})

In [6]:
data_train_test = data.train_test_split(test_size=50)
data_train_val = data_train_test["train"].train_test_split(test_size=55)


ds = DatasetDict({
    "train": data_train_val["train"],
    "validation": data_train_val["test"],
    "test": data_train_test["test"]
})

ds

DatasetDict({
    train: Dataset({
        features: ['Message', 'Diff'],
        num_rows: 356
    })
    validation: Dataset({
        features: ['Message', 'Diff'],
        num_rows: 55
    })
    test: Dataset({
        features: ['Message', 'Diff'],
        num_rows: 50
    })
})

In [7]:
## Hugging Face 
import transformers

transformers.__version__

'4.32.1'

# Preprocess + Tokenize

In [8]:
from transformers import AutoTokenizer

In [9]:

# Instantiate Tokenizer 
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Preprocess + Tokenizer Params
prefix = "summarize: "
max_feature_length = 256
max_target_length = 128

def preprocess_data(examples):
    # Process + tokenize features
    inputs = [prefix + doc for doc in examples["Diff"]]
    model_inputs = tokenizer(inputs, max_length=max_feature_length, truncation=True)
    
    print(type(model_inputs))
    
    # tokenize targets
    labels = tokenizer(examples["Message"], max_length=max_target_length, truncation=True)
        
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

In [28]:
tokenized_datasets = ds.map(preprocess_data, batched=True)
tokenized_datasets

Map: 100%|██████████| 356/356 [00:00<00:00, 1457.50 examples/s]


<class 'transformers.tokenization_utils_base.BatchEncoding'>


Map: 100%|██████████| 55/55 [00:00<00:00, 2891.90 examples/s]


<class 'transformers.tokenization_utils_base.BatchEncoding'>


Map: 100%|██████████| 50/50 [00:00<00:00, 2210.72 examples/s]

<class 'transformers.tokenization_utils_base.BatchEncoding'>





DatasetDict({
    train: Dataset({
        features: ['Message', 'Diff', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 356
    })
    validation: Dataset({
        features: ['Message', 'Diff', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 55
    })
    test: Dataset({
        features: ['Message', 'Diff', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
})

# Training Params

In [20]:
# imports
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

import evaluate # add to requirements

In [14]:
# params and args

batch_size = 8
model_name = "t5-small-cte"
model_dir = f"../../saved_models/{model_name}"

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    # fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1"
)

In [24]:
# Data Collator

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)


In [25]:

# Load metric? with evaluation function

rouge = evaluate.load("rouge")

import numpy as np

def compute_metrics(eval_pred):
    """Takes a tuple of predictions and reference labels as input, 
    and outputs a dictionary of metrics computed over the inputs."""
    
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Actually training

In [27]:
# Load model from checkpoint - loads up T5 with weights and architecture

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 1.08MB/s]


In [29]:
# Thing that does the training - honestly don't know why it's this hard.

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [30]:
# Train, apparently

trainer.train()

  0%|          | 0/135 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 74%|███████▍  | 100/135 [04:23<01:03,  1.83s/it]

{'loss': 4.2717, 'learning_rate': 1.037037037037037e-05, 'epoch': 2.22}


                                                 
 74%|███████▍  | 100/135 [04:33<01:03,  1.83s/it]

{'eval_loss': 3.487128973007202, 'eval_rouge1': 0.216, 'eval_rouge2': 0.1481, 'eval_rougeL': 0.1954, 'eval_rougeLsum': 0.1952, 'eval_gen_len': 17.9455, 'eval_runtime': 9.8555, 'eval_samples_per_second': 5.581, 'eval_steps_per_second': 0.71, 'epoch': 2.22}


100%|██████████| 135/135 [05:50<00:00,  2.60s/it]

{'train_runtime': 350.6779, 'train_samples_per_second': 3.046, 'train_steps_per_second': 0.385, 'train_loss': 4.180418113425926, 'epoch': 3.0}





TrainOutput(global_step=135, training_loss=4.180418113425926, metrics={'train_runtime': 350.6779, 'train_samples_per_second': 3.046, 'train_steps_per_second': 0.385, 'train_loss': 4.180418113425926, 'epoch': 3.0})

In [31]:
trainer.save_model("../../saved_models/t5-small-cte-lorcan")

# Inference

Way 1: Use pipeline

In [32]:
from transformers import pipeline

In [34]:
pipe = pipeline("summarization", model="../../saved_models/t5-small-cte-lorcan")

In [48]:
pipe(
    "@staticmethod\n def __check_dictionary(word):\n '''Check if word exists in English dictionary'''\n response = requests.get(f'https://wagon-dictionary.herokuapp.com/{word}')\n json_response = response.json()\n        return json_response[\"found\"]", max_length=30
)

[{'summary_text': "''Check if word exists in English dictionary'' response = requests.get(f'https://wagon-dictionary"}]

Way 2: The hard way I think?

In [46]:
pred_text = "@staticmethod\n def __check_dictionary(word):\n '''Check if word exists in English dictionary'''\n response = requests.get(f'https://wagon-dictionary.herokuapp.com/{word}')\n json_response = response.json()\n        return json_response[\"found\"]"

inputs = ["summarize: " + pred_text]

inputs = tokenizer(inputs, max_length=max_feature_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=64)
decoder_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
decoder_output

"''Check if word exists in English dictionary''' response = requests.get(f'https://wagon-dictionary.herokuapp.com/word''"

more data on a bigger model

# Base inference on T5

In [62]:
as_list = pd.read_csv("data2.csv").dropna()
as_list = as_list["Diff"].tolist()

import random

i = 0
num_indexes = 10
rand_indexes = []


while i < num_indexes:
    r = random.randint(0, len(as_list) - 1)
    
    if r not in rand_indexes:
        rand_indexes.append(r)
        i += 1

rand_indexes 

[114, 25, 281, 250, 228, 142, 104, 89, 432, 32]

In [68]:
t5_small = pipeline("summarization", "t5-small")

for i, ind in enumerate(rand_indexes):
    print(i, t5_small(as_list[ind], max_length=30)[0]["summary_text"])

Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


0 assert statement is used to assert that something is true . if the assert statement fails, an As
1 - 11 is represented in bits by 1011 which when right shifted by 1 bit gives 101
2 a/fabfile.py index 8cdcae9a..1f0432f8 100755 .
3 a/07-operators-expressions.md index c99c20d5..a0b5d2e
4 git a/frontpage.asciidoc +++ b/front page.acidoc @@ 
5 git a/programs/backup_ver3.py index f6ff4fc3..d6
6 ## Chinese +**The following URLs are unavailable now . translations are available at http://woodpecker.org.
7 a/data_structures.md b243b025..c4c4ab40 100644 .
8 anyhoo, below is a comprehensive summary of all of the cool coding projects that accompanies the
9 a/.github/workflows/main.yml index 13833b57..a72a5811


Cleaning of data is required. 

In [69]:
flan_t5_small = pipeline("summarization", "google/flan-t5-small")

for i, ind in enumerate(rand_indexes):
    print(i, flan_t5_small(as_list[ind], max_length=30)[0]["summary_text"])

Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors


0 python >>> mylist = ['item']  pytheon = 
1 11 is represented in bits by 1011 which when right shifted by 1 bit gives 101which is
2 # TODO Use a proper category instead + "search": "python_en", "offset": offset
3 a/07-operators-expressions.md index c99c20d5..a0b5d2e
4 b/frontpage.asciidoc @@ -60,6 +60,11 @@ A Byte of
5 if len(comment) == 0: target = today + os.sep + now + '
6 b/translations.md index d75b6a4c..dc102e1a 100644
7 # parentheses not required but are a good idea print('Number of cages in the new zoo is
8 # Text Based Calculator -The massive headline pretty much said it all. Anyhoo, below is a comprehensive summary of all
9 b/.github/workflows/main.yml index 13833b57..a72a5811
