In [1]:
import os
import requests
import zipfile
import tarfile
import json
import time
import sys
import math
import pandas as pd
import random
import logging
from argparse import ArgumentParser
from subprocess import call
import numpy as np

from collections import defaultdict
from multiprocessing import Pool
from tqdm.auto import tqdm, trange
from itertools import chain

In [5]:
def download_file(packet_url, base_path="", extract=False, headers=None):
  if base_path != "":
    if not os.path.exists(base_path):
      os.mkdir(base_path)
  packet_file = os.path.basename(packet_url)
  with requests.get(packet_url, stream=True, headers=headers) as r:
      r.raise_for_status()
      with open(os.path.join(base_path,packet_file), 'wb') as f:
          for chunk in r.iter_content(chunk_size=8192):
              f.write(chunk)
  
  if extract:
    if packet_file.endswith(".zip"):
      with zipfile.ZipFile(os.path.join(base_path,packet_file)) as zfile:
        zfile.extractall(base_path)
    else:
      packet_name = packet_file.split('.')[0]
      with tarfile.open(os.path.join(base_path,packet_file)) as tfile:
        tfile.extractall(base_path)

In [9]:
from google.colab import auth
auth.authenticate_user()

project_id = 'ai5-c1-group1'
!gcloud config set project {project_id}

!gsutil cp gs://artifacts.ai5-c1-group1.appspot.com/data/processed_dogs.csv ./

Updated property [core/project].
Copying gs://artifacts.ai5-c1-group1.appspot.com/data/processed_dogs.csv...
/ [1 files][926.6 KiB/926.6 KiB]                                                
Operation completed over 1 objects/926.6 KiB.                                    


In [11]:
dataset_url = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
start_time = time.time()
download_file(dataset_url, base_path="datasets", extract=False)
execution_time = (time.time() - start_time)/60.0
print(execution_time)

0.03983776966730754


In [12]:
persona_chat_path = os.path.join("datasets", "personachat_self_original.json")
with open(persona_chat_path, 'r', encoding='utf-8') as f:
  personachat = json.loads(f.read())


In [13]:
train_size = len(personachat['train'])
val_size = len(personachat['valid'])
sample = personachat['train'][0]
print("Personality\n", sample['personality'])
print("\nUtterances[candidates]\n", sample['utterances'][0]['candidates'])
print("\nUtterances[history]\n", sample['utterances'][0]['history'])

# every datapt. -> 
#  dict with keys: personality, utterances
#     personality -> list of sentences
#     utterances -> list of dictionaries with keys -> candidates, history


Personality
 ['i like to remodel homes .', 'i like to go hunting .', 'i like to shoot a bow .', 'my favorite holiday is halloween .']

Utterances[candidates]
 ['my mom was single with 3 boys , so we never left the projects .', 'i try to wear all black every day . it makes me feel comfortable .', 'well nursing stresses you out so i wish luck with sister', 'yeah just want to pick up nba nfl getting old', 'i really like celine dion . what about you ?', 'no . i live near farms .', "i wish i had a daughter , i'm a boy mom . they're beautiful boys though still lucky", 'yeah when i get bored i play gone with the wind my favorite movie .', "hi how are you ? i'm eating dinner with my hubby and 2 kids .", 'were you married to your high school sweetheart ? i was .', 'that is great to hear ! are you a competitive rider ?', "hi , i'm doing ok . i'm a banker . how about you ?", "i'm 5 years old", 'hi there . how are you today ?', 'i totally understand how stressful that can be .', 'yeah sometimes yo

In [14]:
wrong_answers = []
for datapt in personachat['train'][:100]:
  for utt in datapt['utterances']:
    for ans in utt['candidates'][:-1]:
      wrong_answers.append(ans)

print(len(wrong_answers))

13965


In [17]:
dog_data = pd.read_csv('processed_dogs.csv')
dog_data.head()

Unnamed: 0,AnimalID,AnimalInternal-ID,AnimalName,AnimalSex,AnimalCurrentWeightPounds,Age,Breed,isMixed,Color
0,45628,1444011,Emma,Female,53.3,6,Retriever,True,Blond
1,45629,1444014,Rizzoli,Female,4.7,5,Mixed Breed (Small),True,Tan
2,45630,1444017,Isles,Female,3.1,5,Mixed Breed (Small),True,White
3,45631,1444020,Cory,Male,4.7,5,Mixed Breed (Small),True,Sable
4,45632,1444023,Topanga,Female,8.0,5,Mixed Breed (Small),True,Tan


In [18]:
def generate_persona(details):
  name = ["My name is {}.",
          "I am {}.",
          "It's {}. Woof Woof"]
  age = ["My age is {}.",
         "I am {} years old."]
  animaltype = ["I am dog.",
                "Woof woof I am dog."]
  if (details.AnimalSex == "Female"):
    gender = ["My gender is female.",
              "I am a girl."]
  else:
    gender = ["My gender is male.",
              "I am a boy."]
  weight = ["My weight is {} lbs.",
            "I weigh {} pounds."]
  color = ["my color is {}, it is my favourite color.",
           "I am {} in color."]
  breed = ["My breed is {}.",
           "I am a {}."]
  persona = []
  persona.append(np.random.choice(name).format(details['AnimalName']))
  persona.append(np.random.choice(age).format(details['Age']))
  persona.append(np.random.choice(animaltype))
  persona.append(np.random.choice(gender))
  persona.append(np.random.choice(weight).format(details['AnimalCurrentWeightPounds']))
  persona.append(np.random.choice(color).format(details['Color']))
  persona.append(np.random.choice(breed).format(details['Breed']))
  return persona

print(generate_persona(dog_data.iloc[0]))
print(generate_persona(dog_data.iloc[0]))
print(generate_persona(dog_data.iloc[0]))


["It's Emma. Woof Woof", 'I am 6 years old.', 'I am dog.', 'I am a girl.', 'My weight is 53.3 lbs.', 'I am Blond in color.', 'I am a Retriever.']
['My name is Emma.', 'I am 6 years old.', 'I am dog.', 'My gender is female.', 'I weigh 53.3 pounds.', 'my color is Blond, it is my favourite color.', 'My breed is Retriever.']
['My name is Emma.', 'I am 6 years old.', 'I am dog.', 'I am a girl.', 'I weigh 53.3 pounds.', 'I am Blond in color.', 'My breed is Retriever.']


In [19]:
def generate_history(dog_data):
  user = ["hi , how are you ?"]
  dog = ["woof woof . i'm feeling great!"]

  name = ["My name is {}.",
          "I am {}.",
          "It's {}. Woof Woof"]
  age = ["My age is {}.",
         "I am {} years old."]
  animaltype = ["I am dog.",
                "Woof woof I am dog."]
  if (dog_data.AnimalSex == "Female"):
    gender = ["My gender is female.",
              "I am a girl."]
  else:
    gender = ["My gender is male.",
              "I am a boy."]
  weight = ["My weight is {} lbs.",
            "I weigh {} pounds."]
  color = ["my color is {}.",
           "I am {} in color."]
  breed = ["My breed is {}.",
           "I am a {}."]
  user.append("what is your name ?")
  dog.append(np.random.choice(name).format(dog_data["AnimalName"]))
  user.append("what are you ?")
  dog.append("i am a Dog")
  user.append("what is your gender ?")
  dog.append(np.random.choice(gender).format(dog_data["AnimalSex"]))
  user.append("what is your weight ?")
  dog.append(np.random.choice(weight).format(dog_data["AnimalCurrentWeightPounds"]))
  user.append("How old are you ?")
  dog.append(np.random.choice(age).format(dog_data["Age"]))
  user.append("what breed are you ?")
  dog.append(np.random.choice(breed).format(dog_data["Breed"]))
  user.append("what color are you ?")
  dog.append(np.random.choice(name).format(dog_data["Color"]))
  


  return user, dog


hist = generate_history(dog_data.iloc[0])
print(hist[0])
print(hist[1])

['hi , how are you ?', 'what is your name ?', 'what are you ?', 'what is your gender ?', 'what is your weight ?', 'How old are you ?', 'what breed are you ?', 'what color are you ?']
["woof woof . i'm feeling great!", "It's Emma. Woof Woof", 'i am a Dog', 'My gender is female.', 'My weight is 53.3 lbs.', 'I am 6 years old.', 'I am a Retriever.', "It's Blond. Woof Woof"]


In [20]:
# Create dataset
persona_dog_chat = []

for row in dog_data.iloc:
  persona = generate_persona(row)
  user, dog = generate_history(row)

  history = []
  utterances = []
  for i in range(len(user)):
    # randomly pick number of wrong answers range[5, 15]
    n_utter = np.random.randint(5,16)
    candidates = random.sample(wrong_answers, n_utter)
    candidates.append(dog[i])
    
    if (i > 0):
      history.append(dog[i-1])
    history.append(user[i])

    utterance = {"candidates":candidates,
                 "history":history.copy()}
    utterances.append(utterance)
  persona_dog_chat.append({
      "personality":persona,
      "utterances":utterances
  })

print(len(persona_dog_chat))

16885


In [21]:
print("personality:",persona_dog_chat[0]["personality"])
print("candidates:",persona_dog_chat[0]["utterances"][0]["candidates"])
print("history:",persona_dog_chat[0]["utterances"][0]["history"])

personality: ['My name is Emma.', 'My age is 6.', 'I am dog.', 'My gender is female.', 'I weigh 53.3 pounds.', 'I am Blond in color.', 'My breed is Retriever.']
candidates: ["i'm a fraud . you are the 1st person that i told . 15 , 000 .", 'i like all kinds of music , but am not very familiar with opera', "i'm not married and my kids are grown . how about you ?", "i'm doing well , how are you ?", 'i lover russian rock and roll . you ?', 'i like to color and i wear a helmet .', 'i sure hope so sounds like she ate something bad', 'yeah , its def not boring . any plans for the weekend ?', "woof woof . i'm feeling great!"]
history: ['hi , how are you ?']


In [22]:
persona_dog_chat[0]

{'personality': ['My name is Emma.',
  'My age is 6.',
  'I am dog.',
  'My gender is female.',
  'I weigh 53.3 pounds.',
  'I am Blond in color.',
  'My breed is Retriever.'],
 'utterances': [{'candidates': ["i'm a fraud . you are the 1st person that i told . 15 , 000 .",
    'i like all kinds of music , but am not very familiar with opera',
    "i'm not married and my kids are grown . how about you ?",
    "i'm doing well , how are you ?",
    'i lover russian rock and roll . you ?',
    'i like to color and i wear a helmet .',
    'i sure hope so sounds like she ate something bad',
    'yeah , its def not boring . any plans for the weekend ?',
    "woof woof . i'm feeling great!"],
   'history': ['hi , how are you ?']},
  {'candidates': ['oh . do you have any pets ?',
    'wow amazing do you have kids ?',
    'i make an awesome cheesecake one of my favorite desserts',
    'i like to make clothes by sewing',
    'i am trying to save up so i do not have to take out loans .',
    "oh a

In [23]:
with open("personadogchat.json", "w") as outfile:  
    json.dump(persona_dog_chat, outfile) 

In [24]:
# Push to cloud
!gsutil cp ./personadogchat.json gs://artifacts.ai5-c1-group1.appspot.com/data

Copying file://./personadogchat.json [Content-Type=application/json]...
\
Operation completed over 1 objects/102.2 MiB.                                    
