In [1]:
# Initialize drive
# Mounts Google Drive inside the Colab VM at /content/drive. This step is
# interactive: the recorded output shows an OAuth URL and an authorization-code
# prompt, so the notebook cannot run unattended from a fresh kernel.
# force_remount=True is passed so the mount is redone even if one already
# exists -- presumably to pick up files changed on Drive; confirm against the
# Colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Move to drive
# The path is relative: the recorded output resolves it to
# /content/drive/My Drive/..., i.e. the cell assumes the working directory is
# /content. Re-running this cell after the cwd has changed will fail.
# The project-local imports (models.*, utils.*) and the *.pth.tar checkpoints
# used later are looked up relative to this directory.
%cd 'drive/My Drive/Thesis/Data Experimentation/A4NT'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/Thesis/Data Experimentation/A4NT


In [2]:
# installs and imports
# import argparse

import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize
import numpy as np
import re
import torch
# from termcolor import colored

# from models.char_lstm import CharLstm
from models.char_translator import CharTranslator
from utils.data_provider import DataProvider

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
# General helper functions

# re.sub callback: keeps the matched characters, drops the whitespace around them
def strip_match(match):
  """Return the whole matched text with leading/trailing whitespace removed.

  Intended as the replacement callable for re.sub, so that a pattern such as
  '\\$\\s' collapses to just its non-whitespace part.
  """
  whole_match = match.group(0)
  return whole_match.strip()

# Joins together decimals
def fix_decimals(match):
  """Return the whole matched text with every whitespace character removed.

  Intended as the replacement callable for re.sub, e.g. to turn the match
  '3. 14' back into '3.14'.
  """
  # Raw string: '\s' in a normal string literal is an invalid escape sequence
  # (a warning today, slated to become an error in a future Python).
  return re.sub(r'\s', '', match.group(0))

# Cleans text by removing unnecessary whitespace and substituting back in some symbols
def clean_text(text):
    """Undo tokenization artifacts in a space-joined token string.

    Steps, in order:
      1. bracket placeholders (-lrb-, -rrb-, -lsb-, -rsb-, -lcb-, -rcb-) are
         turned back into ( ) [ ] { }, and '' into a double quote;
      2. a standalone lowercase "i" is capitalised;
      3. a standalone "na" is glued to the preceding token ("gon na" -> "gonna");
      4. whitespace after '$', '-' and '#', and before punctuation or "n't",
         is removed;
      5. decimals split as "3. 14" are rejoined as "3.14".

    Args:
      text: the string to clean.

    Returns:
      The cleaned string.
    """
    # Plain literal substitutions: no regex needed.
    literal_swaps = (
        ('-lrb-', '('), ('-rrb-', ')'),
        ('-lsb-', '['), ('-rsb-', ']'),
        ('-lcb-', '{'), ('-rcb-', '}'),
        ("''", '"'),
    )
    for placeholder, symbol in literal_swaps:
        text = text.replace(placeholder, symbol)

    text = re.sub(r'\si\s', ' I ', text)
    text = re.sub(r'^i\s', 'I ', text)
    text = re.sub(r'\sna\s', 'na ', text)
    text = re.sub(r'\$\s', strip_match, text)
    text = re.sub(r"[-#]\s|\s([-.!,':;?]|n't)", strip_match, text)
    # The dot must be escaped: an unescaped '.' matches any character, which
    # would also glue unrelated numbers together ("10 20" -> "1020").
    text = re.sub(r'\d+\. \d+', fix_decimals, text)
    return text

In [0]:
# Class to contain "translator"
class translator:
  """Wraps a saved CharTranslator checkpoint and translates text between two classes.

  Two translator types are supported: 'gender' (classes 'male'/'female') and
  'age' (classes 'teenager'/'adult', stored in the checkpoint as '<20'/'<50').
  Translation always goes from the given class to the other one, which is
  computed as `1 - index`, i.e. the class indices are assumed to be 0 and 1.

  Attributes set by __init__:
    type: the translator type passed in ('gender' or 'age').
    word_to_index / index_to_word: vocabulary mappings from the checkpoint.
    auth_to_index / index_to_auth: class-label mappings from the checkpoint.
    cp_params: architecture parameters from the checkpoint.
    model: CharTranslator built from cp_params, in eval mode, with loaded weights.
    startc / endc: the 'START' and 'END' marker tokens.
    append_tensor: (1, 1) int64 tensor holding the index of the START token.
    jc: token join string: '' when cp_params['atoms'] is 'char' (the
      default), otherwise ' '.
    maxlen: cp_params['max_seq_len']; caps input length and generation length.
  """

  # Checkpoint file for each translator type (relative to the working directory).
  _CHECKPOINTS = {'gender': 'gender_cpu.pth.tar', 'age': 'age_cpu.pth.tar'}
  # Public age class name -> label used as key in auth_to_index.
  _AGE_LABELS = {'teenager': '<20', 'adult': '<50'}
  # Literal substitutions applied to the raw sentence before tokenization.
  _LITERAL_SWAPS = (
      ('"', "' '"),
      ('(', ' -lrb- '), (')', ' -rrb- '),
      ('[', ' -lsb- '), (']', ' -rsb- '),
      ('{', ' -lcb- '), ('}', ' -rcb- '),
      ('/', ' / '),
  )
  # Placeholder that stands in for every run of digits.
  _NUM = 'NUM'

  # Initialize model
  def __init__(self, filename):
    """Load the checkpoint for translator type `filename` onto the CPU.

    Args:
      filename: 'gender' or 'age' (a type name, not a path).

    Raises:
      ValueError: if `filename` is not one of the supported types.
    """
    if filename not in self._CHECKPOINTS:
      raise ValueError("Invalid translator specified: try 'gender' or 'age'")
    self.type = filename

    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code; only load checkpoints from a trusted source.
    # NOTE(review): the checkpoint is a fully pickled object (attributes are
    # read from it below), not a plain state dict. Recent PyTorch releases
    # may refuse such files unless weights-only loading is disabled; confirm
    # against the installed version.
    saved_model = torch.load(self._CHECKPOINTS[filename],
                             map_location=torch.device('cpu'))

    self.word_to_index = saved_model.word_to_index
    self.auth_to_index = saved_model.auth_to_index
    self.index_to_word = saved_model.index_to_word
    self.index_to_auth = saved_model.index_to_auth
    self.cp_params = saved_model.cp_params

    self.model = CharTranslator(self.cp_params)
    self.model.eval()
    self.startc = 'START'
    self.endc = 'END'
    # Explicit 64-bit dtype: the `np.int` alias used historically for this
    # no longer exists in current NumPy releases.
    self.append_tensor = torch.tensor([[int(self.word_to_index[self.startc])]],
                                      dtype=torch.long)
    self.model.load_state_dict(saved_model.model.state_dict())
    self.model.init_hidden(1)
    self.jc = '' if self.cp_params.get('atoms', 'char') == 'char' else ' '
    self.maxlen = self.cp_params['max_seq_len']
    print("Loaded model")

  def _decode(self, forward):
    """Turn generated index tensors into tokens.

    Indices missing from index_to_word are skipped and one trailing END
    marker, if present, is removed. Returns a (possibly empty) list of str.
    """
    tokens = []
    for step in forward:
      ix = step.item()
      if ix in self.index_to_word:
        tokens.append(self.index_to_word[ix])
    if tokens and tokens[-1] == self.endc:
      tokens = tokens[:-1]
    return tokens

  @classmethod
  def _normalize(cls, sen):
    """Rewrite a raw sentence into the form fed to the tokenizer.

    Quotes, brackets and slashes are replaced by spaced placeholders, the
    text is lower-cased, and every run of digits becomes NUM; a decimal point
    between two NUMs is spaced out so it becomes its own token.
    """
    s = sen
    for raw, placeholder in cls._LITERAL_SWAPS:
      s = s.replace(raw, placeholder)
    s = s.lower()
    s = re.sub(r'\d+', cls._NUM, s)
    # Only a literal '.' between two NUMs is a decimal point. An unescaped
    # '.' would also rewrite "NUM NUM" or "NUM-NUM" and invent a period.
    # Lookarounds let chained cases such as "1.2.3" be handled in one pass.
    s = re.sub(r'(?<=NUM)\.(?=NUM)', ' . ', s)
    return s

  # Run a single random example as in original code
  def get_random_sentence(self):
    """Translate one sentence drawn from the 'val' split and print the result.

    A class is picked at random, one sentence of that class is fetched
    through DataProvider, translated to the opposite class, and both the
    cleaned input and the cleaned output are printed. Returns None.
    """
    dp = DataProvider(self.cp_params)

    # Get random sentence
    c_aid = np.random.choice(list(self.auth_to_index.values()))
    batch = dp.get_sentence_batch(1, split='val',
                                  atoms=self.cp_params.get('atoms', 'char'),
                                  aid=self.index_to_auth[c_aid])

    # Format sentence
    inps, targs, auths, lens = dp.prepare_data(batch, self.word_to_index,
                                               self.auth_to_index,
                                               maxlen=self.maxlen)
    # Target class is the opposite of the sentence's own class.
    auths_inp = 1 - auths

    # Do translation. The generator is conditioned on the *target* class,
    # matching translate_sentence and the "Out" label printed below.
    self.model.eval()
    forward = self.model.forward_gen(inps, end_c=self.word_to_index[self.endc],
                                     n_max=self.maxlen, auths=auths_inp)

    # Print results
    print('--------------------------------------------')
    print('Translate from %s to %s' % (batch[0]['author'], self.index_to_auth[auths_inp.item()]))

    # Get original sentence and clean it up a bit (first position is skipped,
    # presumably the START marker; confirm against DataProvider.prepare_data)
    input_list = [self.index_to_word[c.item()] for c in inps[1:]]
    input_string = clean_text(self.jc.join(input_list))

    # Get translated sentence and clean it up a bit
    output_string = clean_text(self.jc.join(self._decode(forward)))

    print('Inp %6s: ' % (self.index_to_auth[auths.item()]) + '%s' % input_string)
    print('Out %6s: ' % (self.index_to_auth[auths_inp.item()]) + '%s' % output_string)

  # Translate a single sentence (sen) from class (class_name) to the opposite class
  def translate_sentence(self, sen, class_name, verbose=True):
    """Translate `sen` from `class_name` to the opposite class.

    Args:
      sen: the raw input sentence.
      class_name: class of the input: 'teenager'/'adult' for the age
        translator, 'male'/'female' otherwise.
      verbose: when True, print the direction, the input and the translation.

    Returns:
      The translated sentence as a cleaned string ('' if nothing was generated).

    Raises:
      ValueError: if `class_name` is not valid for this translator type.
    """
    # Switch class names to fit original model
    if self.type == 'age':
      if class_name not in self._AGE_LABELS:
        raise ValueError("Invalid class specified: try 'teenager' or 'adult'")
      original = self._AGE_LABELS[class_name]
    elif class_name in ('male', 'female'):
      original = class_name
    else:
      raise ValueError("Invalid class specified: try 'male' or 'female'")

    # Get other class name (indices are assumed to be 0 and 1)
    n_class = 1 - self.auth_to_index[original]
    n_class_name = self.index_to_auth[n_class]
    for public_name, label in self._AGE_LABELS.items():
      if n_class_name == label:
        n_class_name = public_name
        break

    # Clean string before tokenization
    # TODO: replace emojis/special characters with ELIP
    # TODO: decide how to handle hashtags
    s = self._normalize(sen)

    # Tokenize and prepend the START marker; input is truncated to maxlen.
    inp = [self.startc] + word_tokenize(s)

    # Translate to appropriate index. Unknown tokens fall back to index 0.
    # TODO: check if OOV is working (index 0 may be a real vocabulary entry)
    inp_inds = [self.word_to_index.get(w, 0) for w in inp[:self.maxlen]]

    # Do translation: one sentence at a time, as a (seq_len, 1) int64 column.
    self.model.eval()
    inp_seq = np.array(inp_inds, dtype=np.int64).reshape(-1, 1)
    target = np.array([n_class], dtype=np.int64)
    forward = self.model.forward_gen(torch.from_numpy(inp_seq),
                                     end_c=self.word_to_index[self.endc],
                                     n_max=self.maxlen,
                                     auths=torch.from_numpy(target))

    # Format appropriately
    output_list = self._decode(forward)

    # Replace 'NUM' tags with the numbers of the original sentence, in order.
    # The model may emit more NUMs than the input had numbers; the surplus
    # ones are left as the literal placeholder instead of raising.
    remaining_nums = iter(re.findall(r'\d+', sen))
    output_list = [next(remaining_nums, w) if w == self._NUM else w
                   for w in output_list]

    # If the first word is capitalized in the sentence, capitalize it here
    # (guarded so that empty input or empty output does not raise).
    if output_list and sen[:1].isupper():
      output_list[0] = output_list[0].capitalize()

    translation = clean_text(self.jc.join(output_list))

    # Print if verbose
    if verbose:
      print("Translating from %s to %s" % (class_name, n_class_name))
      print("Input sentence: %s" % sen)
      print("Translated Sentence: %s" % translation)

    # Return translated sentence
    return translation

In [18]:
# NOTE(review): pickle is only referenced by the commented-out dump below;
# the import is currently unused.
import pickle
# NOTE(review): despite its name, this variable holds the *gender* translator
# ('gender' is passed here and "male" is used as the source class below).
# The name is kept because a later cell refers to it.
age_translator = translator('gender')
# gender_translator.get_random_sentence()
print("Model initialized")

# age_translator = translator('age')
# age_translator.get_random_sentence()

# Smoke test: translate one sentence from 'male' to the opposite class.
# In the recorded output the translation equals the input ("I love you").
a = age_translator.translate_sentence("I love you",
                                         "male")
# b = gender_translator.translate_sentence("So, I was talking to Sheryl the 
# other day and she said we should go out for dinner.", 'female')
# with open('./age_cpu.pth', 'wb') as ot:
#   pickle.dump(age_translator, ot, protocol=3)
# Export only the model weights (state dict) of the loaded gender model.
# This file does not contain the vocabularies or cp_params that
# translator.__init__ reads from the full checkpoint.
torch.save(age_translator.model.state_dict(), './gender_cpu_sd.pth')
print("Saved")



Loaded model
Model initialized
Translating from male to female
Input sentence: I love you
Translated Sentence: I love you
Saved


In [16]:
# Show the class-index -> label mapping of the loaded translator. The recorded
# output ({0: 'male', 1: 'female'}) shows this is the gender model.
# NOTE(review): this cell's execution count (16) is lower than the previous
# cell's (18), so the cells were run out of order; re-run top to bottom.
print(age_translator.index_to_auth)

{0: 'male', 1: 'female'}
