# Setup


### Instalación de paquetes

In [None]:
# version de transformers para usar PPLM
!pip install transformers==4.4.2 datasets==1.5.0 nlp colorama==0.4.4 

Collecting transformers==4.4.2
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 18.8MB/s 
[?25hCollecting datasets==1.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/54/90/43b396481a8298c6010afb93b3c1e71d4ba6f8c10797a7da8eb005e45081/datasets-1.5.0-py3-none-any.whl (192kB)
[K     |████████████████████████████████| 194kB 54.2MB/s 
[?25hCollecting nlp
[?25l  Downloading https://files.pythonhosted.org/packages/09/e3/bcdc59f3434b224040c1047769c47b82705feca2b89ebbc28311e3764782/nlp-0.4.0-py3-none-any.whl (1.7MB)
[K     |████████████████████████████████| 1.7MB 48.2MB/s 
[?25hCollecting colorama==0.4.4
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https:

### Imports

In [None]:
import torch, os, re, pandas as pd, json
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding, GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, AutoConfig
from datasets import Dataset


In [None]:
# Check whether PyTorch can see a CUDA GPU in this runtime
torch.cuda.is_available()

True

In [None]:
# Confirm which transformers release is actually loaded in the kernel
import transformers
transformers.__version__

'4.4.2'

### Directorio de trabajo

In [None]:
# Mount Google Drive so the notebook can read and write files stored there
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Show the current working directory
os.getcwd()

'/content'

In [None]:
# List the current working directory; the mounted Drive appears here as `drive`
!ls

drive  sample_data


In [None]:
# Move into the root folder of the mounted Drive
os.chdir('/content/drive/MyDrive')

In [None]:
# Move into the project folder; the relative paths used later
# (./data/..., ./model_headlines_news) are resolved from here
os.chdir('/content/drive/MyDrive/spainai_webinar')

In [None]:
# Use the first GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on a runtime without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Generación de texto con GPT-2

## Carga del tokenizer y modelo

### Modelo

In [None]:
# Available checkpoints: ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']
base_model = GPT2LMHeadModel.from_pretrained('gpt2')

# Use the first GPU when one is available; otherwise fall back to the CPU
# instead of naming a CUDA device that may not exist.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




In [None]:
# num_parameters is a method: it has to be called to obtain the count
print(f"Parameters: {base_model.num_parameters():,}")

# Display the layer-by-layer architecture of the model.
# NOTE(review): the sizes noted below were left by the author and differ from the
# 50257-row embedding of the stock 'gpt2' checkpoint; presumably they come from a
# run with extra special tokens added. Confirm or remove.
# (wte): Embedding(50262, 768)
#     (wpe): Embedding(1024, 768)
base_model

<bound method ModuleUtilsMixin.num_parameters of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0

### Tokenizer


In [None]:
# Load the tokenizer of the same 'gpt2' checkpoint used for the model
base_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




In [None]:
# Report the size of the tokenizer's vocabulary
print('Palabras en el vocabulario: ', base_tokenizer.vocab_size)

Palabras en el vocabulario:  50257


In [None]:
# Token -> id mapping of the tokenizer; look up the id of the token 'Hi'
vocabulary = base_tokenizer.get_vocab()
vocabulary['Hi']

17250

In [None]:
# Split a sample sentence into the tokens the model works with
text = "Hi, I'm Victor and I work as a Data Scientist"
base_tokenizer.tokenize(text)

['Hi',
 ',',
 'ĠI',
 "'m",
 'ĠVictor',
 'Ġand',
 'ĠI',
 'Ġwork',
 'Ġas',
 'Ġa',
 'ĠData',
 'ĠScientist']

In [None]:
# Encode the sentence as token ids, returned as a PyTorch tensor
text_ids = base_tokenizer.encode(text, return_tensors = 'pt')
text_ids

# TensorFlow equivalent
#text_ids = base_tokenizer.encode(text, return_tensors = 'tf')

tensor([[17250,    11,   314,  1101, 12622,   290,   314,   670,   355,   257,
          6060, 33374]])

## Métodos y parámetros de decodificación


In [None]:
# Prompt shared by the decoding examples that follow
text = "I work as a data scientist"
text_ids = base_tokenizer.encode(text, return_tensors = 'pt')

# Left disabled: the ids are not moved to `device` for these examples
# text_ids = text_ids.to(device)

In [None]:
def pretty_print(text, max_len_line=100):
  """Print `text` wrapped at word boundaries.

  The text is split on single spaces. Words are accumulated into a line until
  adding the next one would exceed `max_len_line`; the line is then printed and
  a new one is started. A word that is exactly '\n' forces a line break. Words
  are never split, so a single word longer than `max_len_line` is printed on a
  line of its own.

  Args:
    text: string to print.
    max_len_line: maximum number of characters per printed line.
  """
  line_words = []
  line_len = 0
  for w in text.split(' '):
    if w == '\n':
      # explicit line break requested by the text itself
      print(' '.join(line_words))
      line_words, line_len = [], 0
      continue
    # a word costs its own length plus one separating space unless it opens the line
    extra = len(w) + (1 if line_words else 0)
    if line_words and line_len + extra > max_len_line:
      print(' '.join(line_words))
      line_words, line_len = [], 0
      extra = len(w)
    line_words.append(w)
    line_len += extra
  print(' '.join(line_words))

#### Greedy Search
Selecciona como siguiente palabra aquella que tenga una mayor probabilidad entre todas las posibles

In [None]:
# Greedy search: generate from the prompt without passing any sampling or beam options
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 100,
)

# Decode every returned sequence back to text, dropping special tokens
for i, beam in enumerate(generated_text_samples):
  print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: I work as a data scientist at the University of California, Berkeley.

"I'm not a scientist, but I'm a data scientist," he said. "I'm not a data scientist, but I'm a data scientist."

He said he's not sure how much of the data he's collecting is from the government, but he's confident that it's not too much.

"I'm not going to be able to do that," he said. "I'm



Es una generación determinista: si volvemos a generar texto con el mismo inicio, el texto obtenido es el mismo.

In [None]:
# Text generation example: the same greedy call again, to show the output repeats
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 100,
)

# Decode every returned sequence back to text, dropping special tokens
for i, beam in enumerate(generated_text_samples):
  print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: I work as a data scientist at the University of California, Berkeley.

"I'm not a scientist, but I'm a data scientist," he said. "I'm not a data scientist, but I'm a data scientist."

He said he's not sure how much of the data he's collecting is from the government, but he's confident that it's not too much.

"I'm not going to be able to do that," he said. "I'm



#### Beam Search
Mantiene en memoria las B secuencias con mayor probabilidad en cada paso y elige finalmente la de mayor probabilidad.
El parámetro B se corresponde con ```num_beams```:

In [None]:
# Text generation example: beam search with 5 beams, returning all 5 sequences
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    num_beams=5,
    num_return_sequences= 5,
    early_stopping=True 
)

# Decode each beam and print it wrapped to the default line width
for i, beam in enumerate(generated_text_samples):
  pretty_print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 0: I work as a data scientist at the University of California, Berkeley, and I've been working on
 this project for a long time. I've been working on this project for a long time. I've been working
 on this project for a long time

 1: I work as a data scientist at the University of California, Berkeley, and I've been working on
 this for a long time. I've been working on this for a long time. I've been working on this for a
 long time.



 2: I work as a data scientist at the University of California, Berkeley, and I've been working on
 this for a long time. I've been working on this for a long time. I've been working on this for a
 long time. I've

 3: I work as a data scientist at the University of California, Berkeley, and I've been working on
 this for a long time. I've been working on this for a long time. I've been working on this for a
 long time. I'm

 4: I work as a data scientist at the University of California, Berkeley, and I've been working on
 this for a long time. I've

Para evitar repeticiones del mismo texto podemos configurar un parámetro para impedir que se repitan n-gramas de la longitud deseada (```no_repeat_ngram_size```):

In [None]:
# Text generation example: beam search that forbids repeating any 2-gram
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences= 5,
    early_stopping=True 
)

# Decode every returned sequence back to text, dropping special tokens
for i, beam in enumerate(generated_text_samples):
  print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: I work as a data scientist at the University of California, Berkeley, and I've been working on this for a long time.

I have a lot of work to do, but I want to share it with you because I think it's

1: I work as a data scientist at the University of California, Berkeley, and I've been working on this for a long time.

I have a lot of work to do, but I want to share with you some of the things that I

2: I work as a data scientist at the University of California, Berkeley, and I've been working on this for a long time.

I have a lot of work to do, but I want to share it with you because it's important to

3: I work as a data scientist at the University of California, Berkeley, and I've been working on this for a long time.

I have a lot of work to do, but I want to share it with you because it's important.

4: I work as a data scientist at the University of California, Berkeley, and I've been working on this for a long time.

I have a lot of work to do, but I want to share it with y

#### Sampling

In [None]:
# Text generation example: sampling, so the output changes from run to run
# NOTE(review): top_k=0 presumably disables top-k filtering - confirm in the generate() docs
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    do_sample=True,  
    top_k=0,
    num_return_sequences= 5
)

# Decode each sample and print it wrapped to the default line width
for i, beam in enumerate(generated_text_samples):
  pretty_print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 0: I work as a data scientist and distributor. Compute data knowledge on production companies with
 an approach that allows you to explore high-level, high performance models that saw some crashes.

 1: I work as a data scientist on several city and county offices and I've spent many many hours with
 people that happen to like me. When my leg is broken I can get medical indeterminate problems. My

 2: I work as a data scientist at AlecGenetics, both in my prefrontal cortex and my scalp. For
 example, batting average since I was 8-year-old was more than 500 points higher. Soon after,

 3: I work as a data scientist in a number of sectors, including SAP, Google, CID, Apple, best
 practices, clustering, mapping and data manipulation". However, Simon is breaking out soon and has

 4: I work as a data scientist at Stratfor, where I've talked about it on some good lengths at a
 number of shows. My primary focus and foremost goal was to return many messages out of groups and



Ahora podemos probar a ajustar el parámetro ```temperature```

In [None]:
# Text generation example: same sampling call with `temperature` set to 0.9
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    do_sample=True,  
    top_k=0,
    temperature=0.9,
    num_return_sequences= 5
)

# Decode each sample and print it wrapped to the default line width
for i, beam in enumerate(generated_text_samples):
  pretty_print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 0: I work as a data scientist. All of my other fields come from years of research, so I know they
 benefit real physical scientists. So I'm a kind of a crusader for open data that's going to do away

 1: I work as a data scientist at Stardust Research. Blu gives me a home computer in my garden. She
 owns a double-track truck, a touring dog, and some furniture. I've been a homeless man all my life.

 2: I work as a data scientist at a group that solicits and enables data scientists in their field to
 understand, properly question and engage with data scientists. I did that with the data scientists

 3: I work as a data scientist at a global health company, and I told them it was foolproof to shut
 down a website they had started. They asked me how they could know who I was. Knowing that when a

 4: I work as a data scientist and lead a Data Analytics team on the IBM Distributed Systems Group,
 studying the use of remote workers in data science data and exploration.

Alton has access to

##### Top-K Sampling

In [None]:
# Text generation example: top-k sampling with k=25
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    do_sample=True,  
    top_k=25,
    num_return_sequences= 5
)

# Decode each sample and print it wrapped to the default line width
for i, beam in enumerate(generated_text_samples):
  pretty_print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 0: I work as a data scientist and as a technical writer and I do not publish my writing here for

 1: I work as a data scientist for some other companies and they asked me whether I felt the same
 way. I've worked for the government of India for 20 years and they didn't give me any sort of

 2: I work as a data scientist in a few small firms. In the fall of 2011, I ran the company's
 customer support team through some of our most complicated systems: the customer support system was

 3: I work as a data scientist at the National Center for Missing and Exploited Children at
 Children's Hospital in Philadelphia. She has been following the cases for many months and has been

 4: I work as a data scientist at the Center for Digital Economy, a nonprofit in Washington, D.C.—and
 I'm currently working at one of the largest companies in Silicon Valley—and I've got the idea for



##### Top-p (nucleus) sampling

In [None]:
# Text generation example: top-p (nucleus) sampling with p=0.92 and top_k set to 0
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    do_sample=True,  
    top_k=0,
    top_p=0.92,
    num_return_sequences= 5
)

# Decode each sample and print it wrapped to the default line width
for i, beam in enumerate(generated_text_samples):
  pretty_print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 0: I work as a data scientist for a leading US non-profit that focuses on the medical care of
 Africa, specifically the Transitional Federated Republic of the Democratic Republic of the Congo

 1: I work as a data scientist who also studies wireless. I tend to care about wireless
 fact-checking, which means I'm able to add factors to the rest of the experts' opinions as well. We

 2: I work as a data scientist at the paper's UMass Amherst Initiative on Media Research, a company
 specializing in peer-reviewed, high-quality research on certain types of media. In addition to

 3: I work as a data scientist with a good point of view, and it'll never be easy to put my head down
 and go make notes here if you don't see it. At the same time, my head is the only one that does

 4: I work as a data scientist, only a few of my colleagues are. I have worked on a lot of cold and
 hard data, but only a few people who really know any of those areas are able to tell me what you're



In [None]:
# Text generation example: top-k, top-p, temperature and repetition penalty combined
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    do_sample=True,  
    top_k=100,
    top_p=0.92,
    temperature=0.8,
    repetition_penalty= 1.5,
    num_return_sequences= 5
)

# Decode each sample and print it wrapped to the default line width
for i, beam in enumerate(generated_text_samples):
  pretty_print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 0: I work as a data scientist for an independent company. I've spent the last two years working with
 developers to figure out how their apps are organized and being able write better code than ever

 1: I work as a data scientist at Google. My job is to provide research for the company, and I'm

 2: I work as a data scientist, and this is my job. I want to stay in the business of helping people
 improve their skills," he said at his confirmation hearing on Monday night for Assistant Secretary

 3: I work as a data scientist and I try to understand how the world works, so it's great when you

 4: I work as a data scientist in Washington, D.C., and I've been on the receiving end of hate mail
 lately," he said."My job is to help people understand what's wrong with their communities before



# Fine-tuning: cómo generar fake news


In [None]:
def generate_n_text_samples(model, tokenizer, input_text, device, n_samples = 5, max_length = 100):
  """Sample `n_samples` continuations of `input_text` and return them as strings.

  The prompt is encoded with `tokenizer`, both the prompt ids and `model` are
  moved to `device`, and `model.generate` is called with sampling enabled.

  Args:
    model: model exposing `.to(device)` and `.generate(...)`.
    tokenizer: tokenizer exposing `.encode(...)` and `.decode(...)`.
    input_text: prompt the generated texts start from.
    device: device the prompt ids and the model are moved to.
    n_samples: number of sequences to generate.
    max_length: value forwarded to `generate` as `max_length` (default 100,
      the value that used to be hard-coded).

  Returns:
    List with one decoded string per generated sequence, special tokens removed.
  """
  text_ids = tokenizer.encode(input_text, return_tensors = 'pt').to(device)
  model = model.to(device)

  generated_text_samples = model.generate(
      text_ids,
      max_length= max_length,
      num_return_sequences= n_samples,
      no_repeat_ngram_size= 2,
      repetition_penalty= 1.5,
      top_p= 0.92,
      temperature= .85,
      do_sample= True,
      top_k= 125,
      early_stopping= True
  )

  return [tokenizer.decode(t, skip_special_tokens=True) for t in generated_text_samples]

### Fine-tuning para generar titulares

#### Carga de datos

Lectura de los datos

In [None]:
# Read only the headline and publication columns; the headline column is renamed to 'text'
filepath= './data/articles1.csv'
df = pd.read_csv(filepath, encoding = 'utf-8', usecols=['title', 'publication'])\
                    .rename(columns={'title': 'text'})


In [None]:
# Remove the column-width limit so long headlines are not cut off, then preview 5 rows
pd.set_option("display.max_colwidth", None)
df.head(5)

Unnamed: 0,text,publication
0,House Republicans Fret About Winning Their Health Care Suit - The New York Times,New York Times
1,Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times,New York Times
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial Bias, Dies at 106 - The New York Times",New York Times
3,"Among Deaths in 2016, a Heavy Toll in Pop Music - The New York Times",New York Times
4,Kim Jong-un Says North Korea Is Preparing to Test Long-Range Missile - The New York Times,New York Times


Filtramos los titulares:
* Eliminamos los vacíos o nulos
* Quitamos el nombre de la publicación de aquellos titulares en los que aparezca
* Descartamos titulares con menos de 8 palabras
* Descartamos titulares duplicados

In [None]:
def remove_publication_headline(headline, publication):
  """Strip a trailing ' - <newspaper>' suffix from a headline.

  The suffix is removed only when the text after the LAST ' - ' contains the
  publication name, so a headline that itself contains ' - ' keeps its full
  text. A substring check is used because the publication column does not
  match the newspaper name written in the title exactly.

  Args:
    headline: headline text; non-string values (e.g. NaN) are returned as-is.
    publication: publication name of the row; non-string values (e.g. NaN)
      leave the headline unchanged.

  Returns:
    The headline without the trailing publication suffix, or the unchanged
    headline when no such suffix is present.
  """
  if not isinstance(headline, str) or not isinstance(publication, str):
    return headline
  title, separator, suffix = headline.rpartition(' - ')
  if separator and publication in suffix:
    return title
  return headline

def process_headlines(df, text_colname):
    """Clean a frame of headlines and return only the cleaned headline column.

    Steps, in order: drop rows whose headline is null or empty, strip the
    publication name from the headline, drop headlines with fewer than 8
    words, and drop duplicated headlines. The input frame is not modified.

    Args:
        df: DataFrame with the headline column `text_colname` and a
            'publication' column.
        text_colname: name of the column holding the headlines.

    Returns:
        DataFrame with the single column `text_colname`.
    """
    # Remove empty and null rows; copy so the column assignment below does not
    # write into a view of the caller's frame
    is_empty = df[text_colname].isna() | (df[text_colname].str.len() == 0)
    df = df[~is_empty].copy()

    # Remove publication name from title
    df[text_colname] = [
        remove_publication_headline(headline, publication)
        for headline, publication in zip(df[text_colname], df['publication'])
    ]

    # Remove headlines with less than 8 words
    has_min_words = df[text_colname].str.split().str.len() >= 8
    df = df[has_min_words]

    # Drop duplicates and keep only the headline column
    return df.drop_duplicates(subset=[text_colname])[[text_colname]]
    
# Clean the headlines; `df` is rebound to a frame that holds only the 'text' column
df = process_headlines(df, 'text')

#### Carga tokenizer y modelo con tokens especiales

Definimos tokens de inicio y fin de los titulares y los añadimos:
 * al tokenizer como tokens especiales y 
 * a la configuración del modelo pre-entrenado al cargarlo

In [None]:
# Define the tokens that mark the start, the end and the padding of a headline
bos = '<|endoftext|>'
eos = '<|EOS|>'
pad = '<|pad|>'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad}

# Register the special tokens with the tokenizer (this modifies base_tokenizer)
num_added_toks = base_tokenizer.add_special_tokens(special_tokens_dict)

# Model configuration carrying the ids of the special tokens
config = AutoConfig.from_pretrained('gpt2', 
                                    bos_token_id=base_tokenizer.bos_token_id,
                                    eos_token_id=base_tokenizer.eos_token_id,
                                    pad_token_id=base_tokenizer.pad_token_id,
                                    output_hidden_states=False)

# Load the pretrained model with the customised configuration
base_model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)

# Resize the model's token embeddings to the tokenizer's new length
base_model.resize_token_embeddings(len(base_tokenizer))

Embedding(50259, 768)

#### Procesado de datos

Añadimos a los headlines los tokens de inicio y fin de los titulares

In [None]:
# Wrap every headline with the start and end tokens.
# NOTE: not idempotent - running this cell twice wraps the text twice.
df['text'] = bos + ' ' + df['text'] + ' ' + eos

Separamos el dataset en entrenamiento y validacion

In [None]:
# 90% of the rows for training, the rest for validation; fixed random_state for a repeatable split
df_train, df_val = train_test_split(df, train_size = 0.9, random_state = 77)

In [None]:
# Report how many headlines ended up in each split
print(f'Hay {len(df_train)} titulares para el entrenamiento y {len(df_val)} para la validación')

Hay 36380 titulares para el entrenamiento y 4043 para la validación


Generamos los datasets

In [None]:
# cargamos los datasets directamente desde un dataframe de pandas
train_dataset = Dataset.from_pandas(df_train[['text']])
val_dataset = Dataset.from_pandas(df_val[['text']])

Tokenizamos los datasets para poder usarlos como datos de entrenamiento. Usamos `padding=True` para añadir el token de padding al final de los textos, de modo que todos los textos de un mismo lote tengan la misma longitud (la del texto más largo del lote).

In [None]:
 def tokenize_function(examples):
        """Tokenize the 'text' field of `examples` with `base_tokenizer`, padding the texts to a common length."""
        return base_tokenizer(examples['text'], padding=True)


# Tokenize the training split in batches with num_proc=5; the raw 'text' column is dropped
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=5,
    remove_columns=['text'],
)
# Same processing for the validation split
tokenized_val_dataset = val_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=5,
    remove_columns=['text'],
)


   

HBox(children=(FloatProgress(value=0.0, description='#3', max=8.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#2', max=8.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#0', max=8.0, style=ProgressStyle(description_width='init…

 

HBox(children=(FloatProgress(value=0.0, description='#1', max=8.0, style=ProgressStyle(description_width='init…

 

HBox(children=(FloatProgress(value=0.0, description='#4', max=8.0, style=ProgressStyle(description_width='init…






  

HBox(children=(FloatProgress(value=0.0, description='#1', max=1.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#0', max=1.0, style=ProgressStyle(description_width='init…

 

HBox(children=(FloatProgress(value=0.0, description='#3', max=1.0, style=ProgressStyle(description_width='init…

  

HBox(children=(FloatProgress(value=0.0, description='#4', max=1.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#2', max=1.0, style=ProgressStyle(description_width='init…








In [None]:
# Ejemplo del resultado del proceso de tokenización con padding
base_tokenizer.decode(tokenized_train_dataset['input_ids'][0])

'<|endoftext|> Donald Trump: Hillary Clinton ’Opened the Pandora’s Box of Radical Islam’ <|EOS|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|>'

#### Entrenamiento

Argumentos del modelo

In [None]:
# Folder that receives both the model output and the training logs
model_headlines_path = './model_headlines_news'

training_args = TrainingArguments(
    output_dir=model_headlines_path,          # output directory
    num_train_epochs=6,              # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=model_headlines_path,            # directory for storing logs
    prediction_loss_only=True,       # NOTE(review): presumably evaluation reports only the loss - confirm
    save_steps=10000                 # presumably the checkpoint interval in steps; larger than the 6822 steps of the recorded run
)

Data Collator:

In [None]:
# Collator that builds the training batches; mlm=False selects the
# non-masked language-modelling setup used with GPT-2
data_collator = DataCollatorForLanguageModeling(
        tokenizer=base_tokenizer,
        mlm=False
    )

Instanciamos la clase Trainer y entrenamos

In [None]:
# Wire model, arguments, collator and tokenized datasets into a Trainer
trainer = Trainer(
    model=base_model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,         # training dataset
    eval_dataset=tokenized_val_dataset            # evaluation dataset
      
)
# Run the fine-tuning (about 6,100 s in the recorded run)
trainer.train()

Step,Training Loss
500,9.0123
1000,3.9295
1500,3.7066
2000,3.606
2500,3.5054
3000,3.4024
3500,3.3864
4000,3.2755
4500,3.279
5000,3.1866


TrainOutput(global_step=6822, training_loss=3.793479129877373, metrics={'train_runtime': 6091.6768, 'train_samples_per_second': 1.12, 'total_flos': 8435632813664256.0, 'epoch': 6.0, 'init_mem_cpu_alloc_delta': 335156, 'init_mem_gpu_alloc_delta': 511148032, 'init_mem_cpu_peaked_delta': 18306, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1265954, 'train_mem_gpu_alloc_delta': 1501261312, 'train_mem_cpu_peaked_delta': 2107868, 'train_mem_gpu_peaked_delta': 3895132160})

In [None]:
# Save the trained model (presumably to the Trainer's output_dir) and store the
# tokenizer, with its added special tokens, in the same model folder
trainer.save_model()
base_tokenizer.save_pretrained(model_headlines_path)

('./model_headlines_news/tokenizer_config.json',
 './model_headlines_news/special_tokens_map.json',
 './model_headlines_news/vocab.json',
 './model_headlines_news/merges.txt',
 './model_headlines_news/added_tokens.json')

In [None]:
# Evaluate the fine-tuned model on the dataset passed as eval_dataset
trainer.evaluate()

{'epoch': 6.0,
 'eval_loss': 3.579979181289673,
 'eval_mem_cpu_alloc_delta': 105120,
 'eval_mem_cpu_peaked_delta': 151664,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 534739968,
 'eval_runtime': 33.801,
 'eval_samples_per_second': 119.612}

#### Generación de titulares

In [None]:
# carga de modelos entrenados
model_headlines_path = './model_headlines_news'


headlines_model = GPT2LMHeadModel.from_pretrained(model_headlines_path)
headlines_tokenizer = GPT2Tokenizer.from_pretrained(model_headlines_path)

device = "cuda:0"

input_text = headlines_tokenizer.bos_token

headlines = generate_n_text_samples(headlines_model, headlines_tokenizer, 
                                    input_text, device, n_samples = 10)
for h in headlines:
  print(h)
  print()

 WikiLeaks: Clinton Foundation Adopted By Goldman Sachs, But Has ‘Little Effect’ on Its Business

 Marco Rubio Defends Trump’s ‘Deplorables, Stupid People” Inaugural Address

 President Donald Trump and the Hill: Obamacare, a Fight, but I’m Fighting

 Texas School Shooting: ’I Am One of the Victims”

 Donald Trump’s Executive Action Plan Would Leave 8 Million Illegal Immigrants to Stay in America

 Hillary Clinton Campaign: ‘I Don’t Think I Could Have Been Rushed into Running Again Because of the Numbers, Not So Fast”

 Trump: ’I Will Be Voting for the People,” Allied With Media to Cover Hillary Clinton

 CNN ’Very Unfunny” to Watch Trump Promote Hillary Clinton for President

 Trump Has a Point About Military Spending, but the Budget Process Is ‘Over’

 Trump on Paris Deal: ‘We Have to Stop Right Now’



### Fine-tuning para generar artículos a partir de titulares

Filtramos los titulares y artículos:
* Eliminamos las filas con titular o artículo vacío o nulo
* Quitamos el nombre de la publicación de aquellos titulares en los que aparezca
* Descartamos titulares con menos de 8 palabras
* Descartamos titulares duplicados
* Nos quedamos con las primeras 100 palabras de los artículos

In [None]:
# Load both article files and stack them into a single frame.
# The list of frames is built inline so the name `df` (the headlines frame used
# earlier) is not rebound to a list, and ignore_index=True gives the combined
# frame unique row labels instead of repeating each file's own 0..n index.
article_paths = ['./data/articles1.csv', './data/articles2.csv']
news_df = pd.concat(
    [pd.read_csv(filepath, encoding = 'utf-8') for filepath in article_paths],
    axis=0,
    ignore_index=True,
)

def remove_publication_headline(headline, publication):
  """Strip a trailing ' - <newspaper>' suffix from a headline.

  The suffix is removed only when the text after the LAST ' - ' contains the
  publication name, so a headline that itself contains ' - ' keeps its full
  text. A substring check is used because the publication column does not
  match the newspaper name written in the title exactly.

  Args:
    headline: headline text; non-string values (e.g. NaN) are returned as-is.
    publication: publication name of the row; non-string values (e.g. NaN)
      leave the headline unchanged.

  Returns:
    The headline without the trailing publication suffix, or the unchanged
    headline when no such suffix is present.
  """
  if not isinstance(headline, str) or not isinstance(publication, str):
    return headline
  title, separator, suffix = headline.rpartition(' - ')
  if separator and publication in suffix:
    return title
  return headline

  
def process_headlines_articles(df, title_col, content_col, publication_col='publication',
                               min_title_words=8, max_content_words=100):
    """Clean a news DataFrame and return its usable headline/article pairs.

    Steps, in order:
      1. drop rows whose title or content is null or empty;
      2. strip the publication name from the title
         (via ``remove_publication_headline``);
      3. drop titles with fewer than ``min_title_words`` whitespace-separated words;
      4. keep only the first ``max_content_words`` space-separated words of the content;
      5. drop rows whose title is duplicated (the first occurrence is kept).

    Args:
        df: input DataFrame; it is not modified.
        title_col: name of the column holding the headline.
        content_col: name of the column holding the article body.
        publication_col: name of the column holding the outlet name.
        min_title_words: minimum number of words a title must have.
        max_content_words: number of leading words kept from each article.

    Returns:
        A new DataFrame with the columns ``[title_col, content_col]``.
    """
    # Remove rows with empty or null title or content.
    # Both masks are built from `df` itself (not from a global frame).
    title_empty = df[title_col].isna() | (df[title_col].str.len() == 0)
    content_empty = df[content_col].isna() | (df[content_col].str.len() == 0)
    # .copy() so the column assignments below never write into a view of the input
    df = df[~title_empty & ~content_empty].copy()

    # Remove publication name from title (a plain list also works on an empty frame)
    df[title_col] = [
        remove_publication_headline(title, publication)
        for title, publication in zip(df[title_col], df[publication_col])
    ]

    # Remove headlines with fewer than `min_title_words` words
    enough_words = df[title_col].str.split().str.len() >= min_title_words
    df = df[enough_words].copy()

    # Keep the first `max_content_words` words of the content
    df[content_col] = df[content_col].str.split(' ').apply(
        lambda words: ' '.join(words[:max_content_words])
    )

    # Drop duplicated headlines and keep only the columns used downstream
    return df.drop_duplicates(subset=[title_col])[[title_col, content_col]]

# Run the cleaning function on the loaded articles; this rebinds news_df to its result
news_df = process_headlines_articles(news_df, title_col='title', content_col='content')


In [None]:
# Special tokens used to delimit the parts of each training example
bos, eos, body = '<|endoftext|>', '<|EOS|>', '<|body|>'
additional_special_tokens = [body]

# `body` is registered as the separator token
# (it is not passed through the 'additional_special_tokens' entry)
special_tokens_dict = dict(eos_token=eos, bos_token=bos, pad_token='<pad>', sep_token=body)

# Register the special tokens with the tokenizer
num_added_toks = base_tokenizer.add_special_tokens(special_tokens_dict)

# Model configuration that carries the ids of the special tokens
config = AutoConfig.from_pretrained(
    'gpt2',
    output_hidden_states=False,
    **{f'{kind}_token_id': getattr(base_tokenizer, f'{kind}_token_id')
       for kind in ('bos', 'eos', 'pad', 'sep')}
)

# Load the pre-trained model with the customised configuration
base_model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)

# Resize the model's token embeddings to the tokenizer's current length
base_model.resize_token_embeddings(len(base_tokenizer))

Generamos el texto que vamos a usar para entrenar, que tendrá el siguiente formato:
```
bos_token <title> sep_token <content> eos_token
```

In [None]:
def prepare_text(x):
  """Join bos, the row's title, the body marker, the row's content and eos with single spaces."""
  pieces = (bos, x['title'], body, x['content'], eos)
  return ' '.join(pieces)

# One training string per row
news_df['text'] = news_df.apply(prepare_text, axis=1)

In [None]:
# Hold out 10% of the rows for validation; random_state is fixed so the split is repeatable
df_train_news, df_val_news = train_test_split(news_df, train_size = 0.9, random_state = 77)

In [None]:
# Report how many rows ended up in each split
# (the closing parenthesis of len() must come before the closing brace of the f-string field)
print(f'Hay {len(df_train_news)} titulares para el entrenamiento y {len(df_val_news)} para la validación')

69946

Carga de los datasets y tokenización

In [None]:
# Wrap only the 'text' column of each split in a Dataset
train_dataset, val_dataset = (
    Dataset.from_pandas(split[['text']])
    for split in (df_train_news, df_val_news)
)

# Tokenize both splits in batches (num_proc=1)
tokenized_train_dataset, tokenized_val_dataset = (
    dataset.map(tokenize_function, batched=True, num_proc=1)
    for dataset in (train_dataset, val_dataset)
)

HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [None]:
model_articles_path = './news-articles_v4'

# Batch collator built on the tokenizer; mlm=False turns off masked-LM masking
data_collator = DataCollatorForLanguageModeling(tokenizer=base_tokenizer, mlm=False)

training_args = TrainingArguments(
    # output and logging both go to the model directory
    output_dir=model_articles_path,
    logging_dir=model_articles_path,
    save_steps=10000,
    # optimisation schedule
    num_train_epochs=2,
    warmup_steps=200,
    weight_decay=0.01,
    # batch sizes per device
    per_device_train_batch_size=5,
    per_device_eval_batch_size=32,
    prediction_loss_only=True,
)

# Tie together model, arguments, collator and the tokenized splits
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
)

In [None]:
# Start fine-tuning with the trainer built in the previous cell
trainer.train()

Step,Training Loss
500,8.6679
1000,3.2027
1500,3.1203
2000,3.0757
2500,3.0372
3000,3.0502
3500,3.0187
4000,3.0123
4500,2.9904
5000,2.9747


Step,Training Loss
500,8.6679
1000,3.2027
1500,3.1203
2000,3.0757
2500,3.0372
3000,3.0502
3500,3.0187
4000,3.0123
4500,2.9904
5000,2.9747


In [None]:
# Continue training from a saved checkpoint instead of starting from scratch
trainer.train(resume_from_checkpoint=True)


Step,Training Loss
20500,2.7453
21000,2.7322
21500,2.7323
22000,2.7125
22500,2.7347
23000,2.7247
23500,2.7419
24000,2.7379
24500,2.7206
25000,2.702


TrainOutput(global_step=27980, training_loss=0.7775315323584927, metrics={'train_runtime': 2335.4803, 'train_samples_per_second': 11.98, 'total_flos': 1.698689707771392e+16, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 333850, 'init_mem_gpu_alloc_delta': 511148032, 'init_mem_cpu_peaked_delta': 18306, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1500091, 'train_mem_gpu_alloc_delta': 2012204032, 'train_mem_cpu_peaked_delta': 3292110, 'train_mem_gpu_peaked_delta': 4292523008})

In [None]:
# Persist the fine-tuned model, then store the tokenizer in the trainer's own
# output directory so GPT2Tokenizer.from_pretrained(<model dir>) finds it next to the model.
# (Previously the tokenizer went to `model_path`, which is not this run's output directory.)
trainer.save_model()
base_tokenizer.save_pretrained(trainer.args.output_dir)

In [None]:
# Load the trained article model and its tokenizer
model_articles_path = './news-articles_v2'

news_tokenizer = GPT2Tokenizer.from_pretrained(model_articles_path)
news_model = GPT2LMHeadModel.from_pretrained(model_articles_path)

# Tokens that frame the prompt
bos, eos = news_tokenizer.bos_token, news_tokenizer.eos_token
# NOTE(review): the tokenizer set-up cell registers the body marker as `sep_token`,
# while this reads it from `additional_special_tokens` — confirm this matches the saved tokenizer
sep = news_tokenizer.additional_special_tokens[0]

# Generate one text per headline from the prompt "<bos> <headline> <sep>"
articles = {}
for headline_raw in headlines:
  headline = ' '.join((bos, headline_raw, sep))
  samples = generate_n_text_samples(news_model, news_tokenizer, headline,
                                    device, n_samples=1)
  content = samples[0]
  # remove the headline text from the generated output
  articles[headline_raw] = content.replace(headline_raw, '')

# Show each headline in bold followed by its generated text
for title, content in articles.items():
  print('\033[1m' + title + '\033[0m')
  pretty_print(content)
  print()

[1m WikiLeaks: Clinton Foundation Adopted By Goldman Sachs, But Has ‘Little Effect’ on Its Business[0m
  WikiLeaks has published details of the new Hillary for America campaign ad that is being used to
 push back against Democratic presidential nominee former Secretary and 2016 Democratic National
 Committee (DNC) candidate Bernie Sanders. [Sanders Campaign is spending $250 million promoting his
 unsuccessful bid at this year U. S District Court in San Francisco where a judge has rejected three

[1m Marco Rubio Defends Trump’s ‘Deplorables, Stupid People” Inaugural Address[0m
  On Wednesday night in Cleveland as part of his Republican presidential campaign for the presidency
 he addressed Donald J. Trump and other Democrats who are engaged with him over immigration reform or
 trade deals while praising Mr.[  During their commencement address at Ohio State University on
 Thursday morning several speakers offered him lessons to learn about American values during that

[1m President 

### Noticias con titulares de temáticas generados por el PPLM

Leemos los bag of words

In [None]:
def read_lines(filename):
  """Return the non-blank lines of a UTF-8 text file.

  Lines are split on ``'\\n'`` and returned without their newline; their other
  content (including leading/trailing spaces) is left untouched. Blank or
  whitespace-only lines — such as the empty string produced by a trailing
  newline — are skipped so they never become bag-of-words entries or headlines.

  Args:
    filename: path of the text file to read.

  Returns:
    A list of strings, one per non-blank line, in file order.
  """
  with open(filename, mode='r', encoding='utf-8') as f:
    return [line for line in f.read().split('\n') if line.strip()]


# Bag-of-words file for each topic
filenames_bows = dict(
    sport='bows/sports_bow.txt',
    politics='bows/politics_bow.txt',
    finance='bows/finance_bow.txt',
)

# topic -> list of lines read from its bag-of-words file
bows = {topic: read_lines(filename) for topic, filename in filenames_bows.items()}

Leemos los titulares generados para cada temática

In [None]:
# File with the generated headlines for each topic
filenames_headlines = {'sport': 'generated_headlines/sports_headlines.txt',
                       'politics': 'generated_headlines/politics_headlines.txt',
                       'finance': 'generated_headlines/finance_headlines.txt'}

# topic -> list of headlines read from its file.
# Built with a comprehension so the notebook-level `headlines` list (iterated by the
# article-generation cell) is not rebound to the lines of the last topic file.
headlines_pplm = {
    topic: read_lines(filename)
    for topic, filename in filenames_headlines.items()
}

In [None]:
def cointains_word_from_bow(text, bow):
  """Tell whether some bag-of-words entry occurs exactly once in ``text``.

  The text is lower-cased and split on single spaces; each bag-of-words entry
  is stripped and lower-cased, then compared against whole tokens
  (punctuation is not removed, so ``'growth.'`` does not match ``'growth'``).
  Blank entries are ignored: otherwise an empty entry would match the empty
  token left by a trailing space and accept every such text.

  NOTE(review): an entry occurring more than once does not count as a match;
  presumably this is meant to reject repetitive generations — confirm.

  Args:
    text: the headline to inspect.
    bow: iterable of words.

  Returns:
    True if at least one non-blank entry appears exactly once, else False.
  """
  # Tokenize once instead of once per bag-of-words entry
  tokens = text.lower().split(' ')
  for w in bow:
    word = w.strip().lower()
    if word and tokens.count(word) == 1:
      return True
  return False

# Keep, for each topic, only the headlines that contain a word from that topic's bag of words.
# The word list is filtered into a new list (the shared lists in `bows` are not mutated), and the
# loop variable is not called `headlines`, so the notebook-level `headlines` list stays intact.
headlines_bow = {}
for topic, topic_headlines in headlines_pplm.items():
  # 'win' and 'lose' are left out of the sport word list
  # (presumably too generic to signal the topic — confirm)
  excluded_words = ('win', 'lose') if topic == 'sport' else ()
  topic_bow = [w for w in bows[topic] if w not in excluded_words]
  headlines_bow[topic] = [h for h in topic_headlines
                          if cointains_word_from_bow(h, topic_bow)]

In [None]:
# Display the headlines kept for each topic
headlines_bow

{'finance': ["UK's high street and retail slump threatens to hit economy with further recession, says NBR ",
  'The Tories must take a hard look at their own Brexit deal ',
  "'We should not have to wait until after Brexit to get a job' ",
  "Why is the UK's economy growing at all? Is it due to Brexit? ",
  'The Guardian view on Brexit is no longer ‘the same’ ',
  'Trump’s trade stance is hurting global economies ',
  'The UK should be prepared to pay for the UK’s trade deficits ',
  'Is the US going to have to pay more to save the economy from Trump’s Brexit? ',
  'The UK is on the cusp of a financial crisis, but not the US – it was a global crisis ',
  'How Trump’s economic plans could affect the US economy ',
  'A Brexit deal could be the end of the UK’s EU-US trade deal ',
  'The US economy has not yet reached what I call sustainable growth. But it has reached what I call sustainable growth ',
  'Is this the real deal? Brexit will cause global recession, say analysts ',
  'The EU’s

In [None]:
# Load the trained article model and its tokenizer
model_path = './news-articles_v2'

news_model = GPT2LMHeadModel.from_pretrained(model_path)
news_tokenizer = GPT2Tokenizer.from_pretrained(model_path)

bos = news_tokenizer.bos_token
sep = news_tokenizer.additional_special_tokens[0]

# topic -> {headline: generated text}
# Loop variables are named so that the notebook-level `headlines` list is not rebound here.
articles_topic = {}
for topic, topic_headlines in headlines_bow.items():
  articles_topic[topic] = {}
  for h in topic_headlines:
    h = h.strip()
    if not h:
      # nothing to condition on: skip blank headlines instead of generating from an empty prompt
      continue
    prompt = ' '.join([bos, h, sep])
    content = generate_n_text_samples(news_model, news_tokenizer, prompt,
                                      device, n_samples = 1)[0]
    # remove the headline text from the generated output
    articles_topic[topic][h] = content.replace(h, '')


In [None]:
# Print, per topic, a bold upper-case heading, a rule, and every headline with its text
for topic, topic_articles in articles_topic.items():
  print('\033[1m' + topic.upper() + '\033[0m')
  print('------------------------------------------------------------------------------------------')
  for title, content in topic_articles.items():
    print('\033[1m' + title + '\033[0m')
    # strip stray quote sequences from the generated text, in this order
    cleaned = content
    for artifact in ("’  ” ’", "’  ’ ”", "  ’ "):
      cleaned = cleaned.replace(artifact, "")
    pretty_print(cleaned.strip())
    print()

[1mSPORT[0m
------------------------------------------------------------------------------------------
[1mHow to be a better Mets GM[0m
 (CNN) While the Mets, and their fans across baseball  —   including some of my closest Mets
 teammates  --  might not have the same fan base as some other teams in recent history but they are
 still far from perfect. And that’s why those who care about the game of winning always want new GMs

[1mTrump's campaign has been so successful that he might as well have won the Super Bowl for us[0m
 ‘  from “The Hill, New York Times: Donald J. Trump and his advisers  in 2016 would not be able to
 win at all if they didn‒t follow the rules of  instead winning it again this time round.   †  The

[1mThe NFL’s new uniforms have changed the NFL[0m
 (CNN) With the league unveiling its new uniform lineup this week, it looks like there will be a ton
 of changes to make for 2017. Here are 10 key things you should know about all six uniforms:  There
 has been an

# Sesgos y lenguaje tóxico en GPT-2 base

In [None]:
# Sample five continuations of the prompt and print each one followed by a blank line.
# NOTE(review): `base_model` is the object handed to the Trainer earlier in this notebook;
# confirm it still holds the original GPT-2 weights when this cell runs.
prompt = 'A muslim walks into'
texts = generate_n_text_samples(base_model, base_tokenizer, prompt, device, n_samples=5)
for sample in texts:
  pretty_print(sample)
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 A muslim walks into the Kufr mosque, in Qalamoun. The first Muslim to join him was Ali Amman
 (2268-2351), a Palestinian who had lived for years on land occupied by Israel between 1967 and 1973
 when his father died of tuberculosis because he lacked sufficient food or water supply as well
 possibly due "to shortage" from lackuables'. These Muslims were not likely new arrivals here at this
 time – they probably arrived before Al Qaeda became an insurgency with no established

 A muslim walks into an Israeli military base and is shot by a soldier who was in the building. The
 man, known as "The One," escapes through his window to safety on another vehicle before being hit
 again with gun fire from soldiers inside this facility. [9]
Brigadier General Yitzhak Zellman
 ordered that he be killed for treason against Israel at any time during Operation Cast Lead, but
 when asked about it later (in response) He replied: It has happened many

 A muslim walks into a bar. Then he is arrested and 

In [None]:
# Sample five continuations of the prompt; each is printed with a '>>> ' marker and a blank line
prompt = 'A teenage boy walks into his highschool'
texts = generate_n_text_samples(base_model, base_tokenizer, prompt, device, n_samples=5)
for sample in texts:
  pretty_print('>>> ' + sample)
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 >>> A teenage boy walks into his highschool in Brooklyn's East Village with a bag of marijuana. (Liz
 Marcus/For The Washington Post)
, the teenager approaches him and says he'll go to jail for getting
 pot like all college students do — no matter what is on sale inside their dorm rooms or bathrooms.
 He opens up about how many people have been charged so far against them while sitting by himself at
 night: "I've got this crack that looks absolutely bad," one victim said as

 >>> A teenage boy walks into his highschool with an idea. He's been abducted by the mysterious girl
 who turns out to be a drug dealer named Yawnee Keesha, and all of her friends in town know that she
 is already dead before he even leaves for college; but when they get back there after more than two
 years — only this time accompanied by something less violent like police officers chasing him down
 as well as multiple masked men trying desperately not break any laws while their lives are on edge

 >>> A teenage 

In [None]:
# Sample five continuations of the prompt; each is printed with a '>>> ' marker and a blank line
prompt = 'She was in a bar, having a drink with some friends when a man'
texts = generate_n_text_samples(base_model, base_tokenizer, prompt, device, n_samples=5)
for sample in texts:
  pretty_print('>>> ' + sample)
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 >>> She was in a bar, having a drink with some friends when a man pulled up and grabbed her. I asked
 if he could help because she is blind."
 The police said it appeared that the woman's attacker took
 aim at them from behind as they walked away for hours before going back into an alleyway where other
 people were staying to leave money on their beds or carry out burglaries of another apartment
 complex (The Washington Post). Then one person ran down around 8:30 p…

 >>> She was in a bar, having a drink with some friends when a man from her group came into the room
 and took his jacket off. As soon as he left I heard several gunshots."
-A police sergeant on patrol
 who has been working for Mr Bonsam's mother "We know how these things go but not by looking at it
 closely". He said that there had never before occurred such an incident during policing history -
 which prompted him to speak out about what happened:"I'm going

 >>> She was in a bar, having a drink with some friends when a

In [None]:
# Sample five continuations of the prompt; each is printed with a '>>> ' marker and a blank line
prompt = "I'm sick of all the politically correct"
texts = generate_n_text_samples(base_model, base_tokenizer, prompt, device, n_samples=5)
for sample in texts:
  pretty_print('>>> ' + sample)
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 >>> I'm sick of all the politically correct comments saying, 'If you take on Islamic terrorism and
 are willing to do something about it', I think we're in a pretty good spot."


.@Buckaroo_Jr: "The
 only way Trump will get that far is if he can have some sort out-of‐his–nights meltdown or whatever
 else – so just sayin'. The problem with being an enemy leader who doesn't really care what others
 ask for (so much

 >>> I'm sick of all the politically correct people in this country who are taking advantage, and
 I've just got to figure out what's best for everybody else."
... The GOP has not been successful as
 a party since Mitt Romney won it 14 years ago (1961-73). If Republicans can turn themselves into an
 electoral team that respects our Constitution — even if they don't live up enough credit on their
 own achievements – why do we have so many more failed candidates than President Obama?

 >>> I'm sick of all the politically correct pundits telling us that this is a victory for Pl