In [None]:
import os
import math
import cv2
import numpy as np
import pandas as pd
from imutils import paths
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import time
from sklearn.model_selection import train_test_split
import tensorflow.keras.backend as K

from IPython.display import clear_output

In [None]:
# Install Hugging Face `transformers` into the running environment; it must be
# present before the import below can succeed.
!pip install transformers

# NOTE(review): GPT2Config is imported but not referenced in the cells visible here.
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel, GPT2Config
# Clear this cell's output (the pip install log) once the import has succeeded.
clear_output()

In [None]:
# Authenticate the Colab user; presumably this is what lets the gcloud/gsutil
# commands in the following cells reach the project's storage — confirm.
from google.colab import auth
auth.authenticate_user()

# https://cloud.google.com/resource-manager/docs/creating-managing-projects
# Google Cloud project selected by the `gcloud config set project` command below.
project_id = 'ai5-c1-group1'
# IPython substitutes {project_id} with the Python value before running the shell command.
!gcloud config set project {project_id}

Updated property [core/project].


In [None]:
# Download processed_dogs.csv from the project's Google Cloud Storage bucket
# into the current working directory.
!gsutil cp gs://artifacts.ai5-c1-group1.appspot.com/data/processed_dogs.csv ./

Copying gs://artifacts.ai5-c1-group1.appspot.com/data/processed_dogs.csv...
/ [1 files][926.6 KiB/926.6 KiB]                                                
Operation completed over 1 objects/926.6 KiB.                                    


In [None]:
# Load the downloaded CSV into a DataFrame and preview its first rows.
# NOTE(review): the preview contains an "Unnamed: 0" column, which looks like a
# previously saved index; `index_col=0` would absorb it — confirm before changing.
data = pd.read_csv('./processed_dogs.csv')
data.head()

Unnamed: 0,AnimalID,AnimalInternal-ID,AnimalName,AnimalSex,AnimalCurrentWeightPounds,Age,Breed,isMixed,Color
0,45628,1444011,Emma,Female,53.3,6,Retriever,True,Blond
1,45629,1444014,Rizzoli,Female,4.7,5,Mixed Breed (Small),True,Tan
2,45630,1444017,Isles,Female,3.1,5,Mixed Breed (Small),True,White
3,45631,1444020,Cory,Male,4.7,5,Mixed Breed (Small),True,Sable
4,45632,1444023,Topanga,Female,8.0,5,Mixed Breed (Small),True,Tan


In [None]:
# Questions asked of every dog. generate_qa() returns exactly one answer per
# entry, in this order, so the two lists must be edited together.
generic_questions = ["Hi",
                     "Hello, How are you",
                     "Are you pretty",
                     "What is your name",
                     "What is your color",
                     "What is your age",
                     "How old are you",
                     "What is your weight",
                     "What is your gender",
                     "What breed are you"]

# Personality blurb shared by all dogs; appended after the per-dog facts.
generic_description = ' '.join([
    "I have the prettiest little puppy face.",
    "I am sweet.",
    "I have stunning grey eyes that will win you over instantly, and have the cutest floppy ears.",
    "I am still learning what my crate is for, and working hard to master house training.",
    "I love crinkly stuffed toys.",
    "I am very low key and relaxed.",
    "I love to be held, and will cuddle in your lap to take a snooze."])

# Number of question/answer pairs per dog. Kept for backward compatibility only:
# the helpers below read len(generic_questions) directly, so rebinding `n` in a
# later cell can no longer desynchronise contexts from questions.
n = len(generic_questions)


def generate_context(details):
    """Build the persona/context text for one dog, repeated once per question.

    Args:
        details: any object exposing the attributes AnimalName, Age, Color,
            AnimalCurrentWeightPounds, AnimalSex and Breed (for example a
            DataFrame row).

    Returns:
        A list of len(generic_questions) identical strings: the dog's facts
        followed by the shared generic description.
    """
    persona = ' '.join([
        f'My name is {details.AnimalName}.',
        f'I am {details.Age} years old.',
        f'My color is {details.Color}.',
        f'My weight is {details.AnimalCurrentWeightPounds}.',
        f'My gender or sex is {details.AnimalSex}.',
        f'My breed is {details.Breed}.',
    ])
    full_context = persona + " " + generic_description
    # One copy per question so the result lines up with generate_qa()'s output.
    return [full_context] * len(generic_questions)


def generate_qa(details):
    """Return the generic questions together with this dog's answers.

    Args:
        details: any object exposing the attributes AnimalName, Age, Color,
            AnimalCurrentWeightPounds, AnimalSex and Breed.

    Returns:
        A (questions, answers) tuple of two equally long lists; answers[i]
        responds to questions[i].
    """
    answers = [
        f"Hi I am {details.AnimalName}. Woof Woof!",
        "Hi I am good. How you doin?",
        "I am the prettiest! woof woof !",
        f"I am {details.AnimalName}. woof woof !",
        f"My color is {details.Color}. It's your favourite color isn't it? ",
        # "What is your age" and "How old are you" share the same answer.
        f"I am {details.Age} years old. I am the cutest.",
        f"I am {details.Age} years old. I am the cutest.",
        f"I weigh {details.AnimalCurrentWeightPounds} lbs. I am a perfect family dog.",
        f"I am a {details.AnimalSex} dog. I am the best.",
        f"My breed is {details.Breed}. I love to make friends.",
    ]
    # Hand out a copy so callers cannot mutate the shared module-level list.
    return list(generic_questions), answers

In [None]:
# Sanity check: display the context string generated for the first record.
generate_context(data.iloc[0])[0]

'My name is Emma. I am 6 years old. My color is Blond. My weight is 53.3. My gender or sex is Female. My breed is Retriever. I have the prettiest little puppy face. I am sweet. I have stunning grey eyes that will win you over instantly, and have the cutest floppy ears. I am still learning what my crate is for, and working hard to master house training. I love crinkly stuffed toys. I am very low key and relaxed. I love to be held, and will cuddle in your lap to take a snooze.'

In [None]:
# Print every generic question followed by the answer generated for the first
# record, with a ruler line after each pair.
q, a = generate_qa(data.iloc[0])
for question_text, answer_text in zip(q, a):
  print(question_text, answer_text, '============================', sep='\n')

Hi
Hi I am Emma. Woof Woof!
Hello, How are you
Hi I am good. How you doin?
Are you pretty
I am the prettiest! woof woof !
What is your name
I am Emma. woof woof !
What is your color
My color is Blond. It's your favourite color isn't it? 
What is your age
I am 6 years old. I am the cutest.
How old are you
I am 6 years old. I am the cutest.
What is your weight
I weigh 53.3 lbs. I am a perfect family dog.
What is your gender
I am a Female dog. I am the best.
What breed are you
My breed is Retriever. I love to make friends.


In [None]:
# Build three parallel lists: for every dog, one (context, question, answer)
# triple per generic question.
questions = []
answers = []
context = []

# itertuples() yields one namedtuple per row with the column names as
# attributes, which is all generate_context()/generate_qa() need. It replaces
# `for row in data.iloc`, which only worked through Python's legacy
# __getitem__ iteration fallback and materialised a full Series for each row.
for row in data.itertuples(index=False):
  context.extend(generate_context(row))
  row_questions, row_answers = generate_qa(row)
  questions.extend(row_questions)
  answers.extend(row_answers)

# The lists are zipped together downstream, so they must stay aligned.
assert len(context) == len(questions) == len(answers)

In [None]:
# Preview the assembled (context, question, answer) triples as a table.
# The frame is only displayed here; it is not bound to a name.
pd.DataFrame({"context" : context, "questions":questions, "answers":answers})

Unnamed: 0,context,questions,answers
0,My name is Emma. I am 6 years old. My color is...,Hi,Hi I am Emma. Woof Woof!
1,My name is Emma. I am 6 years old. My color is...,"Hello, How are you",Hi I am good. How you doin?
2,My name is Emma. I am 6 years old. My color is...,Are you pretty,I am the prettiest! woof woof !
3,My name is Emma. I am 6 years old. My color is...,What is your name,I am Emma. woof woof !
4,My name is Emma. I am 6 years old. My color is...,What is your color,My color is Blond. It's your favourite color i...
...,...,...,...
168845,My name is Nola. I am 1 years old. My color is...,What is your age,I am 1 years old. I am the cutest.
168846,My name is Nola. I am 1 years old. My color is...,How old are you,I am 1 years old. I am the cutest.
168847,My name is Nola. I am 1 years old. My color is...,What is your weight,I weigh 36.4 lbs. I am a perfect family dog.
168848,My name is Nola. I am 1 years old. My color is...,What is your gender,I am a Female dog. I am the best.


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", pad_token_id=50256)

# Each training sample is "<context> <question> <answer>" encoded to GPT-2
# token ids; the three source lists are walked in lockstep.
tokenized_text = [
  tokenizer.encode(f"{ctx} {question} {answer}")
  for ctx, question, answer in zip(context, questions, answers)
]

# Report the number of samples, then the length and first 20 ids of sample 0.
print(len(tokenized_text))
first_sample = tokenized_text[0]
print(len(first_sample), first_sample[:20])

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

168850
136 [3666, 1438, 318, 18966, 13, 314, 716, 718, 812, 1468, 13, 2011, 3124, 318, 1086, 623, 13, 2011, 3463, 318]


In [None]:
# Right-pad every token sequence to the length of the longest one, using the
# tokenizer's end-of-text id as the pad value. The result is one 2-D array of
# shape (num_samples, max_len).
# NOTE(review): nothing in the visible cells masks the padded positions, so
# they contribute to the training loss — confirm that this is intended.
padded_text = tf.keras.preprocessing.sequence.pad_sequences(
    tokenized_text, padding='post', value=tokenizer.eos_token_id
)

# Next-token prediction: the input is every token but the last, the target is
# the same sequence shifted left by one. Slicing the 2-D array once replaces a
# Python loop that built two 168k-element lists of per-row arrays; len(X),
# X[i] and X[i].shape behave exactly as before.
X = padded_text[:, :-1]
y = padded_text[:, 1:]

print(len(X), len(y))

168850 168850


In [None]:
# Shape check on the first sample: input and target should have equal length.
X[0].shape, y[0].shape

((159,), (159,))

In [None]:
# Fixed seed so the 80/20 train/test partition is the same on every run.
SPLIT_SEED = 42
train_x, test_x, train_y, test_y = train_test_split(
    X, y, train_size=.8, random_state=SPLIT_SEED
)

AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 12
# Buffers as large as the split itself give a full shuffle of each dataset.
TRAIN_SHUFFLE_BUFFER_SIZE = len(train_x)
TEST_SHUFFLE_BUFFER_SIZE = len(test_x)


def _build_dataset(inputs, targets, shuffle_buffer_size):
    """Turn parallel input/target sequences into a shuffled, batched tf.data pipeline.

    Args:
        inputs: token-id sequences fed to the model.
        targets: token-id sequences the model should predict.
        shuffle_buffer_size: size of the shuffle buffer.

    Returns:
        A tf.data.Dataset yielding (inputs, targets) batches of BATCH_SIZE
        rows. The final incomplete batch is dropped so every batch has the
        same static shape.
    """
    dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    return dataset.prefetch(buffer_size=AUTOTUNE)


#############
# Train data
#############
train_data = _build_dataset(train_x, train_y, TRAIN_SHUFFLE_BUFFER_SIZE)
print("train_data",train_data)

#############
# Test data
#############
test_data = _build_dataset(test_x, test_y, TEST_SHUFFLE_BUFFER_SIZE)
print("test_data",test_data)

train_data <PrefetchDataset shapes: ((12, 159), (12, 159)), types: (tf.int32, tf.int32)>
test_data <PrefetchDataset shapes: ((12, 159), (12, 159)), types: (tf.int32, tf.int32)>


In [None]:
############################
# Training Params
############################
learning_rate = 5e-6
epsilon=1e-08
clipnorm=1.0
epochs = 1

# Free up memory
K.clear_session()

# Build the model from the pretrained "gpt2" checkpoint.
model = TFGPT2LMHeadModel.from_pretrained("gpt2")

# Print the model architecture.
# summary() writes the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

# Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon, clipnorm=clipnorm)
# Loss: the model emits raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# Compile
# Only the first model output receives a loss; the following n_layer outputs
# are given None so Keras skips them.
# NOTE(review): presumably those extra outputs are the per-layer cached
# key/value tensors — confirm against the installed transformers version.
model.compile(loss=[loss, *[None] * model.config.n_layer],
              optimizer=optimizer,
              metrics=[metric])

# Train model and report the wall-clock time in minutes.
# NOTE(review): no validation_data is passed, so this cell does not evaluate
# on any held-out data.
start_time = time.time()
training_results = model.fit(
    train_data,
    epochs=epochs,
    verbose=1)
execution_time = (time.time() - start_time)/60.0
print("Training execution time (mins)",execution_time)

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Model: "tfgp_t2lm_head_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
transformer (TFGPT2MainLayer multiple                  124439808 
Total params: 124,439,808
Trainable params: 124,439,808
Non-trainable params: 0
_________________________________________________________________
None
Training execution time (mins) 204.70258458455405


In [None]:
# Directory that receives the model weights and the tokenizer files.
model_dir = "trained_model_gpt2"
os.makedirs(model_dir, exist_ok=True)

# Write the model and tokenizer into the same directory.
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Bundle the directory into one archive.
# NOTE: the shell command repeats the directory name literally instead of
# using {model_dir}; keep the two in sync if the name changes.
!zip -r finetuned_model_gpt2.zip trained_model_gpt2

  adding: trained_model_gpt2/ (stored 0%)
  adding: trained_model_gpt2/vocab.json (deflated 63%)
  adding: trained_model_gpt2/config.json (deflated 51%)
  adding: trained_model_gpt2/tokenizer_config.json (deflated 57%)
  adding: trained_model_gpt2/special_tokens_map.json (deflated 72%)
  adding: trained_model_gpt2/tf_model.h5 (deflated 7%)
  adding: trained_model_gpt2/merges.txt (deflated 53%)


In [None]:
# Upload the zipped GPT-2 model directory to the project's Cloud Storage bucket.
# Full reference: https://cloud.google.com/storage/docs/gsutil/commands/cp
!gsutil cp ./finetuned_model_gpt2.zip gs://artifacts.ai5-c1-group1.appspot.com/data

Copying file://./finetuned_model_gpt2.zip [Content-Type=application/zip]...
/ [0 files][    0.0 B/441.5 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

-
Operation completed over 1 objects/441.5 MiB.                                    


In [None]:
# To chat with the saved model instead of the in-memory one, reload both from
# the directory written by save_pretrained():
# model = TFGPT2LMHeadModel.from_pretrained("./trained_model_gpt2")
# tokenizer = GPT2Tokenizer.from_pretrained("./trained_model_gpt2")

def chatbot(context):
    """Interactive console loop that answers typed questions for one persona.

    Reads one question per line from stdin until the user types "exit", and
    prints the generated answer after each question. Uses the module-level
    `tokenizer` and `model`.

    Args:
        context: persona text placed in front of every question.
    """
    while True:
        question = input().strip()
        if question == "exit":
            break
        # Training samples were laid out as "<context> <question> <answer>",
        # so the prompt needs the same single-space separator; without it the
        # question's first word is tokenized differently than during training.
        query = context + " " + question
        input_ids = tokenizer.encode(query, return_tensors='tf')
        # Beam search with two beams. The former top_p/top_k arguments were
        # dropped: they only apply when sampling is enabled, so they had no
        # effect on this call.
        outputs = model.generate(
            input_ids,
            num_beams=2,
            pad_token_id=tokenizer.eos_token_id,
            max_length=300
        )
        # Decode only the newly generated tokens. Slicing the decoded text by
        # len(query) cuts at the wrong character whenever decode's clean-up
        # changes the prompt's length (e.g. a question typed as "cute ?").
        prompt_length = input_ids.shape[-1]
        answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        print("                 ->", answer)

# Start a chat session with the persona of one randomly chosen training context.
chatbot(np.random.choice(context))

Hi
                 ->  Hi I am Egypt. Woof Woof!
What breed are you
                 ->  My breed is Retriever. I love to make friends.
How old are you
                 ->  I am 6 years old. I am the cutest.
Are you trained
                 ->  hard I am hard working to master house training. I love to make friends.
exit


In [None]:
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering

# Extractive question-answering checkpoint used for both tokenizer and model.
# NOTE: this rebinds the notebook-wide `tokenizer` and `model` names.
QA_CHECKPOINT = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)
model = TFAutoModelForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

# Passage the answers are extracted from: the first generated context.
test_text = context[0]

test_questions = questions[:8]

# Console loop: read a question, print the extracted span, stop on "exit".
while True:
    question = input().strip()
    if question == "exit":
        break
    inputs = tokenizer(question, test_text, add_special_tokens=True, return_tensors="tf")
    outputs = model(inputs)
    # Most likely first token of the answer span.
    answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0]
    # Most likely last token, plus one to make it an exclusive slice bound.
    answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1
    span_ids = inputs["input_ids"].numpy()[0][answer_start:answer_end]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_ids))
    print("                 ->", answer)

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

All the layers of TFBertForQuestionAnswering were initialized from the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


Hi What is your name
                 -> emma
How old are you
                 -> 6 years old
Are you cute
                 -> i have stunning grey eyes that will win you over instantly
Do you like toys
                 -> i love crinkly stuffed toys
Are you trained
                 -> working hard to master house training
exit


In [None]:
# Directory that receives the model weights and the tokenizer files.
model_dir = "trained_model_bert"
os.makedirs(model_dir, exist_ok=True)

# Saves whatever `model` and `tokenizer` are currently bound to.
# NOTE(review): in the visible cells that is the question-answering checkpoint
# loaded with from_pretrained, with no training step in between — confirm the
# "trained_model_bert" name is intended.
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Bundle the directory into one archive; the directory name is repeated
# literally here rather than taken from {model_dir}.
!zip -r pretrained_bert.zip trained_model_bert

  adding: trained_model_bert/ (stored 0%)
  adding: trained_model_bert/tokenizer_config.json (deflated 39%)
  adding: trained_model_bert/tokenizer.json (deflated 59%)
  adding: trained_model_bert/config.json (deflated 46%)
  adding: trained_model_bert/special_tokens_map.json (deflated 40%)
  adding: trained_model_bert/vocab.txt (deflated 53%)
  adding: trained_model_bert/tf_model.h5 (deflated 7%)


In [None]:
# Upload the zipped BERT model directory to the project's Cloud Storage bucket.
!gsutil cp ./pretrained_bert.zip gs://artifacts.ai5-c1-group1.appspot.com/data

Copying file://./pretrained_bert.zip [Content-Type=application/zip]...
/ [0 files][    0.0 B/  1.2 GiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/
Operation completed over 1 objects/1.2 GiB.                                      
