# Transformer block from scratch 

Encoder

In [None]:
def FeedForwardBlock(d_model, d_ff, dropout, dropout_shared_axes, mode, activation):
    """Build the layer list of a feed-forward sub-block.

    Args:
        d_model: number of units of the second (output) dense layer.
        d_ff: number of units of the first (hidden) dense layer.
        dropout: rate handed to both dropout layers.
        dropout_shared_axes: `shared_axes` handed to both dropout layers.
        mode: mode string handed to both dropout layers.
        activation: zero-argument callable returning the activation layer
            (it is called here, so pass it uncalled, e.g. `tl.Relu`).

    Returns:
        A plain Python list of six layers, in order: LayerNorm, Dense(d_ff),
        activation, dropout, Dense(d_model), dropout.
    """
    def new_dropout():
        # A fresh layer per call, so the two dropouts are distinct objects.
        return tl.Dropout(rate=dropout,
                          shared_axes=dropout_shared_axes,
                          mode=mode)

    after_activation_dropout = new_dropout()
    output_dropout = new_dropout()

    return [
        tl.LayerNorm(),            # normalize the block input
        tl.Dense(d_ff),            # expand to the hidden width
        activation(),              # instantiate the activation layer
        after_activation_dropout,
        tl.Dense(d_model),         # project back to the model width
        output_dropout,
    ]

In [None]:
def EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                  mode, ff_activation, FeedForwardBlock=FeedForwardBlock):
    """Build one encoder block as a list of two residual layers.

    Args:
        d_model: feature depth given to the attention layer and forwarded to
            the feed-forward builder.
        d_ff: hidden width forwarded to the feed-forward builder.
        n_heads: number of attention heads.
        dropout: rate used by the attention layer, the post-attention dropout,
            and forwarded to the feed-forward builder.
        dropout_shared_axes: `shared_axes` for the post-attention dropout and
            the feed-forward builder.
        mode: mode string given to the attention and dropout layers and
            forwarded to the feed-forward builder.
        ff_activation: uncalled activation layer constructor, forwarded to the
            feed-forward builder.
        FeedForwardBlock: callable that builds the feed-forward layer list;
            defaults to the `FeedForwardBlock` defined in this file.

    Returns:
        A two-element list: Residual(LayerNorm, attention, dropout) followed by
        Residual(feed-forward layers).
    """
    # Multi-head attention over d_model features.
    self_attention = tl.Attention(d_feature=d_model,
                                  n_heads=n_heads,
                                  dropout=dropout,
                                  mode=mode)

    # Feed-forward layer list; arguments are passed positionally, in this order.
    ff_layers = FeedForwardBlock(d_model, d_ff, dropout,
                                 dropout_shared_axes, mode, ff_activation)

    # Dropout applied right after attention, inside the first residual.
    post_attention_dropout = tl.Dropout(rate=dropout,
                                        shared_axes=dropout_shared_axes,
                                        mode=mode)

    attention_residual = tl.Residual(tl.LayerNorm(),
                                     self_attention,
                                     post_attention_dropout)
    feed_forward_residual = tl.Residual(ff_layers)

    return [attention_residual, feed_forward_residual]

In [None]:
def TransformerEncoder(vocab_size=32000,
                       n_classes=10,
                       d_model=512,
                       d_ff=2048,
                       n_layers=6,
                       n_heads=8,
                       dropout=0.1,
                       dropout_shared_axes=None,
                       max_len=2048,
                       mode='train',
                       ff_activation=tl.Relu,
                      EncoderBlock=EncoderBlock):
    """Assemble a Transformer encoder with a classification head.

    Args:
        vocab_size: vocabulary size of the embedding layer.
        n_classes: number of units of the final dense layer.
        d_model: embedding depth, also forwarded to every encoder block.
        d_ff: feed-forward hidden width forwarded to every encoder block.
        n_layers: how many encoder blocks to stack.
        n_heads: attention heads per encoder block.
        dropout: dropout rate for the embedding dropout and the encoder blocks.
        dropout_shared_axes: `shared_axes` for the dropout layers.
        max_len: `max_len` of the positional encoding layer.
        mode: mode string for the dropout layers and the encoder blocks.
        ff_activation: uncalled activation layer constructor for the blocks.
        EncoderBlock: callable that builds one encoder block; defaults to the
            `EncoderBlock` defined in this file.

    Returns:
        A `tl.Serial` model ending in `tl.LogSoftmax()` over `n_classes` units.
    """
    # Token embedding -> dropout -> positional encoding.
    embed_and_position = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode),
        tl.PositionalEncoding(max_len=max_len),
    ]

    # One independently constructed encoder block per layer.
    encoder_stack = []
    for _ in range(n_layers):
        encoder_stack.append(
            EncoderBlock(d_model, d_ff, n_heads, dropout,
                         dropout_shared_axes, mode, ff_activation)
        )

    # Two branches from the same input: the embedded tokens and the padding mask.
    embed_with_mask = tl.Branch(embed_and_position, tl.PaddingMask())

    classifier_head = [
        tl.Select([0], n_in=2),   # keep only the first of the two branch outputs
        tl.LayerNorm(),
        tl.Mean(axis=1),          # average over axis 1
        tl.Dense(n_classes),      # one unit per output category
        tl.LogSoftmax(),
    ]

    return tl.Serial(embed_with_mask, encoder_stack, *classifier_head)


Decoder

In [None]:
def CausalAttention(d_feature, 
                    n_heads, 
                    compute_attention_heads_closure=compute_attention_heads_closure,
                    dot_product_self_attention=dot_product_self_attention,
                    compute_attention_output_closure=compute_attention_output_closure,
                    mode='train'):
    """Assemble a multi-head attention layer from the supplied helper functions.

    Args:
        d_feature: feature depth of the query/key/value projections and of the
            output projection; must be divisible by `n_heads`.
        n_heads: number of attention heads.
        compute_attention_heads_closure: called as `(n_heads, d_head)`; its
            return value is wrapped in the 'AttnHeads' `tl.Fn` layer.
        dot_product_self_attention: function wrapped (uncalled) in the
            'DotProductAttn' `tl.Fn` layer.
        compute_attention_output_closure: called as `(n_heads, d_head)`; its
            return value is wrapped in the 'AttnOutput' `tl.Fn` layer.
        mode: accepted for interface compatibility; not read in this function.

    Returns:
        A `tl.Serial` layer: three projection towers, dot-product attention,
        head recombination, and a final dense layer.

    Raises:
        AssertionError: if `d_feature` is not a multiple of `n_heads`.
    """
    assert d_feature % n_heads == 0
    d_head = d_feature // n_heads

    # The closure is called here; tl.Fn receives the function it returns.
    split_into_heads = tl.Fn('AttnHeads',
                             compute_attention_heads_closure(n_heads, d_head),
                             n_out=1)

    # Three towers (queries, keys, values): each has its own Dense layer, and all
    # three reuse the same head-splitting layer instance.
    qkv_towers = [[tl.Dense(d_feature), split_into_heads] for _ in range(3)]

    attend = tl.Fn('DotProductAttn', dot_product_self_attention, n_out=1)
    merge_heads = tl.Fn('AttnOutput',
                        compute_attention_output_closure(n_heads, d_head),
                        n_out=1)

    return tl.Serial(
        tl.Branch(*qkv_towers),
        attend,
        merge_heads,
        tl.Dense(d_feature),
    )



In [None]:
def DecoderBlock(d_model, d_ff, n_heads,
                 dropout, mode, ff_activation):
    """Build one decoder block as a list of two residual layers.

    Args:
        d_model: feature depth passed to `CausalAttention` and used as the
            width of the second dense layer.
        d_ff: width of the first dense layer of the feed-forward part.
        n_heads: number of attention heads passed to `CausalAttention`.
        dropout: rate of every dropout layer in the block.
        mode: mode string passed to `CausalAttention` and every dropout layer.
        ff_activation: uncalled activation layer constructor; it is called here.

    Returns:
        A two-element list: Residual(LayerNorm, causal attention, dropout)
        followed by Residual(feed-forward layer list).
    """
    def new_dropout():
        # A fresh layer per call; the block uses three separate dropout layers.
        return tl.Dropout(rate=dropout, mode=mode)

    masked_attention = CausalAttention(d_model, n_heads=n_heads, mode=mode)

    # Normalized input -> Dense(d_ff) -> activation -> dropout -> Dense(d_model) -> dropout.
    ff_layers = [
        tl.LayerNorm(),
        tl.Dense(d_ff),
        ff_activation(),
        new_dropout(),
        tl.Dense(d_model),
        new_dropout(),
    ]

    attention_residual = tl.Residual(tl.LayerNorm(), masked_attention, new_dropout())
    # The feed-forward list is handed to Residual as a single argument.
    feed_forward_residual = tl.Residual(ff_layers)

    return [attention_residual, feed_forward_residual]


Transformer language model

In [None]:
def TransformerLM(vocab_size=33300,
                  d_model=512,
                  d_ff=2048,
                  n_layers=6,
                  n_heads=8,
                  dropout=0.1,
                  max_len=4096,
                  mode='train',
                  ff_activation=tl.Relu):
    """Assemble a Transformer language model from a stack of decoder blocks.

    Args:
        vocab_size: vocabulary size of the embedding and of the output layer.
        d_model: embedding depth, also forwarded to every decoder block.
        d_ff: feed-forward hidden width forwarded to every decoder block.
        n_layers: how many decoder blocks to stack.
        n_heads: attention heads per decoder block.
        dropout: dropout rate for the embedding dropout and the decoder blocks.
        max_len: `max_len` of the positional encoding layer.
        mode: mode string passed to ShiftRight, Dropout, PositionalEncoding and
            the decoder blocks.
        ff_activation: uncalled activation layer constructor for the blocks.

    Returns:
        A `tl.Serial` model ending in `tl.LogSoftmax()` over `vocab_size` units.
    """
    # Token embedding -> dropout -> positional encoding.
    embedding_stack = [
        tl.Embedding(vocab_size, d_model),
        tl.Dropout(rate=dropout, mode=mode),
        tl.PositionalEncoding(max_len=max_len, mode=mode),
    ]

    # One independently constructed decoder block per layer.
    decoder_stack = []
    for _ in range(n_layers):
        decoder_stack.append(
            DecoderBlock(d_model, d_ff, n_heads, dropout, mode, ff_activation)
        )

    # Final normalization, projection to vocabulary-sized logits, log-probabilities.
    output_head = [
        tl.LayerNorm(),
        tl.Dense(vocab_size),
        tl.LogSoftmax(),
    ]

    return tl.Serial(
        tl.ShiftRight(mode=mode),   # shift inputs right for teacher forcing
        embedding_stack,
        decoder_stack,
        *output_head,
    )



# Question Answering with BERT and HuggingFace


In practice, we rarely train a transformer model from scratch. Transformers tend to be very large, so they take time, money, and lots of data to train fully. Instead, we start with a pre-trained model and, if needed, fine-tune it on our own dataset.

[Hugging Face](https://huggingface.co/) (🤗) is the best resource for pre-trained transformers. Their open-source libraries simplify downloading and using transformer models like BERT, T5, and GPT-2. Best of all, you can use them with TensorFlow, PyTorch, or Flax.

In this notebook, we'll use the DistilBERT model for question answering. 



In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.26.1


## Pipelines



In [None]:
from transformers import pipeline

In [None]:
# Passing the "question-answering" task makes `pipeline` return a
# QuestionAnsweringPipeline; the model is selected by its checkpoint name.
question_answerer = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

### Context




In [None]:

# Passage about modern football tactics; it is passed as `context=` to every
# `question_answerer` call in the cells that follow.
context = """
Modern football has evolved significantly in terms of tactics over the years. There has been a shift towards more possession-based football,
 as well as the use of pressing and counter-pressing.

One of the most significant tactical innovations in modern football is the use of a back-three or back-five formation. This formation allows
 for more flexibility in attack, as the wing-backs can push up and create overloads in wide areas. At the same time, the three or five
  defenders provide greater defensive solidity.

Another key tactical trend in modern football is the emphasis on pressing and counter-pressing. This involves players aggressively pressing
 the opposition when they have the ball, in order to win it back quickly and launch a counter-attack. This requires a high level of fitness
  and discipline from the players, as well as good teamwork and coordination.

In addition, modern football also places a lot of importance on possession-based football. This involves keeping the ball for extended
 periods of time and playing short, intricate passes to move up the field. The goal is to tire out the opposition and create openings
  for attacking opportunities.

To defend as an offensive team, it is recommended to use the high pressure tactic which refers to a defensive strategy in
 which a team attempts to regain possession of the ball quickly by putting intense pressure on the opposing team's players when they are in 
 possession of the ball, often in their own half of the pitch. This is typically done by the team's forwards and midfielders, who work 
 together to cut off passing options, close down space, and force mistakes.The objective of high pressure tactic is to disrupt the opposing
  team's rhythm, limit their time on the ball, and force turnovers in dangerous areas of the pitch, which can lead to scoring opportunities.
  However, high pressure tactics require a high level of fitness, coordination, and discipline from the players, as it can be physically
   demanding and requires a lot of teamwork to execute effectively.

teams with defensive identity, the italian football, generally use counter-attacking tactic to attack which is a tactical strategy used by teams
 that prefer to defend deep and quickly transition to attack when they win the ball back.The idea behind the counter-attack tactic is to
  catch the opposition off-guard and take advantage of the open spaces that are left behind when the opposing team commits players forward.
   When a team wins the ball back in their own half, the defenders or midfielders quickly play a long ball or pass to their forwards, who
    will look to sprint forward and exploit the space that the opposition has left behind.The counter-attack tactic requires fast and 
    skillful forwards who can create chances out of nothing, as well as defenders and midfielders who are disciplined and good at intercepting 
    the ball. 
Overall, modern football tactics are constantly evolving and changing, as coaches and players continue to look for new and innovative 
ways to win games.    
"""

In [None]:
# Ask one question against the football passage and print the answer text.
result = question_answerer(
    question="The modern tactic in football?",
    context=context,
)
print(result["answer"])

There has been a shift towards more possession-based football


In [None]:
# Same passage, different question; only the 'answer' field is shown.
result = question_answerer(
    question="How to defend as an offensive team?",
    context=context,
)
print(result["answer"])

high pressure tactic


In [None]:
# Query what the passage says the high pressure tactic refers to.
result = question_answerer(
    question="To what the high pressure tactic refers?",
    context=context,
)
print(result["answer"])

a defensive strategy


In [None]:
# Query the stated goal of the high pressure tactic.
result = question_answerer(
    question="What is the goal of high pressure tactic?",
    context=context,
)
print(result["answer"])

to disrupt the opposing
  team's rhythm


In [None]:
# Query how defensively minded teams attack, according to the passage.
result = question_answerer(
    question="How do teams with defensive identity attack?",
    context=context,
)
print(result["answer"])

counter-attacking tactic


In [None]:
# Query the idea behind the counter-attacking tactic.
result = question_answerer(
    question="What is the idea behind the counter-attacking tactic?",
    context=context,
)
print(result["answer"])

to
  catch the opposition off-guard


You can also pass multiple questions to your pipeline within a list so that you can ask, for example:

*   "The modern tactic in football?"
*   "How to defend as an offensive team?"
*   "To what the high pressure tactic refers?"

at the same time, and your `question_answerer` will return all the answers. 

In [None]:
# All questions asked individually above, now sent in a single call.
questions = [
    "The modern tactic in football?",
    "How to defend as an offensive team?",
    "To what the high pressure tactic refers?",
    "What is the goal of high pressure tactic?",
    "How do teams with defensive identity attack?",
    "What is the idea behind the counter-attacking tactic?",
]

# One shared context for the whole list of questions.
results = question_answerer(question=questions, context=context)

# Pair each question with its result and print the answer underneath it.
for query, reply in zip(questions, results):
    print(query, "\n>> " + reply['answer'])

The modern tactic in football? 
>> There has been a shift towards more possession-based football
How to defend as an offensive team? 
>> high pressure tactic
To what the high pressure tactic refers? 
>> a defensive strategy
What is the goal of high pressure tactic? 
>> to disrupt the opposing
  team's rhythm
How do teams with defensive identity attack? 
>> counter-attacking tactic
What is the idea behind the counter-attacking tactic? 
>> to
  catch the opposition off-guard
