In [1]:
# !pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 torchaudio===0.8.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html

In [2]:
# Install transformers
# !pip install transformers

In [3]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [4]:
# Build the tokenizer for the Pegasus-XSum checkpoint.
# Prerequisite: the `sentencepiece` package (pip install sentencepiece).
tokenizer = PegasusTokenizer.from_pretrained(
    pretrained_model_name_or_path="google/pegasus-xsum"
)

In [5]:
# Load the pretrained Pegasus-XSum summarization model (same checkpoint name
# as the tokenizer so vocabulary and weights match).
model = PegasusForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="google/pegasus-xsum"
)
# Bare last expression: the notebook renders the module tree as the cell output.
model

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_nor

## Abstractive Summarization

In [21]:
# Sample passage to summarize. The string starts and ends with a newline
# (the triple quotes sit on their own lines) and is passed to the tokenizer as-is.
text = """
Bob works at a large financial firm as a Data Scientist. Bob and his team all use Python and regularly collaborate 
with each other on certain projects. However, since this financial firm is quite large, they all have numerous 
individual projects they are working on as well. Because of this, there needs to be a universal way to separate 
these projects from each other to ensure they run on any computer with Python installed. This is where virtual 
environments come into play. You can think of a virtual environment as a specific copy of Python in your computer 
that you can specify yourself. This copy can be any version of Python with any packages installed. 
Using virtual environments ensures that there are certain barriers between projects. 
These barriers in place make sure that anyone can run your version of Python regardless of what is on their computer.
"""

In [22]:
# Encode the passage as integer token ids (plus an attention mask).
#   return_tensors="pt" -> results come back as PyTorch tensors
#   padding="longest"   -> pad to the longest sequence in the batch
#   truncation=True     -> cut inputs that exceed the maximum length
tokens = tokenizer(
    text,
    return_tensors="pt",
    padding="longest",
    truncation=True,
)

In [23]:
# Token ids produced for the input passage (one row per input sequence).
tokens["input_ids"]

tensor([[ 4605,   659,   134,   114,   423,   748,  1419,   130,   114,  2331,
         25990,   107,  4605,   111,   169,   320,   149,   207, 11994,   111,
          2440,  8713,   122,   276,   176,   124,   878,   844,   107,   611,
           108,   381,   136,   748,  1419,   117,   708,   423,   108,   157,
           149,   133,  1866,   819,   844,   157,   127,   375,   124,   130,
           210,   107,  2110,   113,   136,   108,   186,   397,   112,   129,
           114,  6161,   230,   112,  1910,   219,   844,   135,   276,   176,
           112,   615,   157,   550,   124,   189,   958,   122, 11994,  1939,
           107,   182,   117,   241,  3263,  4285,   331,   190,   462,   107,
           226,   137,   311,   113,   114,  3263,   849,   130,   114,   739,
          1809,   113, 11994,   115,   128,   958,   120,   119,   137,  7938,
           681,   107,   182,  1809,   137,   129,   189,   824,   113, 11994,
           122,   189,  3633,  1939,   107,  3270,  

In [24]:
# Attention mask aligned with input_ids; 1 marks a position the model attends to.
tokens["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [25]:
# Expand the tokenizer output into a plain dict to show the keyword arguments
# that `**tokens` hands to the model.
dict(**tokens)

{'input_ids': tensor([[ 4605,   659,   134,   114,   423,   748,  1419,   130,   114,  2331,
          25990,   107,  4605,   111,   169,   320,   149,   207, 11994,   111,
           2440,  8713,   122,   276,   176,   124,   878,   844,   107,   611,
            108,   381,   136,   748,  1419,   117,   708,   423,   108,   157,
            149,   133,  1866,   819,   844,   157,   127,   375,   124,   130,
            210,   107,  2110,   113,   136,   108,   186,   397,   112,   129,
            114,  6161,   230,   112,  1910,   219,   844,   135,   276,   176,
            112,   615,   157,   550,   124,   189,   958,   122, 11994,  1939,
            107,   182,   117,   241,  3263,  4285,   331,   190,   462,   107,
            226,   137,   311,   113,   114,  3263,   849,   130,   114,   739,
           1809,   113, 11994,   115,   128,   958,   120,   119,   137,  7938,
            681,   107,   182,  1809,   137,   129,   189,   824,   113, 11994,
            122,   189,  36

In [26]:
# Generate the summary token ids, passing the encoded inputs explicitly
# instead of unpacking the whole tokenizer output.
summary = model.generate(
    input_ids=tokens["input_ids"],
    attention_mask=tokens["attention_mask"],
)

In [27]:
# Generated token ids as a 2-D tensor; a single row here, for the single input passage.
summary

tensor([[    0,   222,   136,  3844,   108,   145,   127,   313,   112,   286,
           134,   199,   112,   421,   114,  3263,   849,   118, 11994,   107,
             1]])

In [28]:
# Select the first (and only) row: the 1-D sequence of summary token ids
# that is handed to the tokenizer for decoding.
summary[0]

tensor([    0,   222,   136,  3844,   108,   145,   127,   313,   112,   286,
          134,   199,   112,   421,   114,  3263,   849,   118, 11994,   107,
            1])

In [29]:
# Decode the generated ids back into text. skip_special_tokens=True drops the
# <pad> start token and the </s> end-of-sequence marker, which would otherwise
# be printed as part of the summary.
print(tokenizer.decode(summary[0], skip_special_tokens=True))

<pad>In this lesson, we are going to look at how to create a virtual environment for Python.</s>
