In [10]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd

In [2]:
# Load the tokenizer and the encoder from the same pretrained checkpoint id,
# so the vocabulary always matches the model weights.
CHECKPOINT = 'facebook/contriever-msmarco'
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModel.from_pretrained(CHECKPOINT)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [3]:
# Toy inputs: one question followed by two statements about the Curies
# (presumably a query plus two candidate passages -- confirm intended use).
sentences = [
    "Where was Marie Curie born?",
    "Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.",
    "Born in Paris on 15 May 1859, Pierre Curie was the son of Eugène Curie, a doctor of French Catholic origin from Alsace."
]

In [4]:
# Tokenize all sentences as a single batch. Padding gives every row the same
# length, truncation is enabled, and the result is returned as PyTorch tensors.

inputs = tokenizer(
    sentences,
    return_tensors='pt',
    truncation=True,
    padding=True,
)

In [6]:
# Compute token embeddings.
# This is inference only, so run the forward pass under torch.no_grad():
# autograd then does not record a graph for the outputs, which saves memory
# and keeps grad_fn off everything derived from them.

with torch.no_grad():
    outputs = model(**inputs)

In [27]:
# Inspect the raw model output object returned by the forward pass.
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1066,  0.0164,  0.0547,  ..., -0.0065, -0.0631, -0.0280],
         [ 0.0192, -0.0602,  0.0502,  ...,  0.1185, -0.0204, -0.0111],
         [ 0.1103, -0.0207,  0.0044,  ...,  0.0599, -0.0432,  0.0183],
         ...,
         [ 0.0305, -0.2994, -0.0895,  ...,  0.2829,  0.0329,  0.0951],
         [ 0.0298, -0.3350, -0.0862,  ...,  0.2983,  0.0351,  0.0803],
         [ 0.0299, -0.2928, -0.0951,  ...,  0.2991,  0.0265,  0.0623]],

        [[ 0.0799,  0.0201,  0.0418,  ...,  0.0752,  0.0130,  0.0336],
         [-0.0250,  0.0173,  0.0594,  ...,  0.1173, -0.1175,  0.0543],
         [ 0.0280,  0.0259, -0.0916,  ..., -0.1142, -0.0608,  0.1254],
         ...,
         [ 0.0315, -0.1623, -0.0734,  ...,  0.1735, -0.0453,  0.0775],
         [ 0.0674, -0.1745, -0.0764,  ...,  0.1819, -0.0478,  0.0687],
         [ 0.0397, -0.1430, -0.0823,  ...,  0.1684, -0.0414,  0.0401]],

        [[ 0.0364, -0.0647,  0.0651,  ...,  0.0903,  

In [7]:
# Mean pooling

def mean_pooling(token_embeddings, mask):
    """Average token embeddings over the kept (non-masked) positions.

    Args:
        token_embeddings: Tensor expected as (batch, tokens, features); dim 1
            is the axis that is averaged away.
        mask: Tensor expected as (batch, tokens); nonzero/True marks a token
            to keep, zero/False marks one to ignore (e.g. padding).

    Returns:
        Tensor with one mean embedding per sequence (dim 1 removed). A
        sequence whose mask is entirely zero yields a zero vector instead of
        NaN.
    """
    keep = mask[..., None].bool()
    # Zero out ignored positions so they contribute nothing to the sum.
    summed = token_embeddings.masked_fill(~keep, 0.).sum(dim=1)
    counts = mask.sum(dim=1)
    # A fully masked row would otherwise compute 0 / 0 = NaN; divide by 1
    # instead. Rows with a nonzero count are unaffected.
    counts = counts.masked_fill(counts == 0, 1)
    sentence_embeddings = summed / counts[..., None]
    return sentence_embeddings

In [8]:
# Pool the per-token vectors into one vector per sentence, skipping padding.
# outputs.last_hidden_state is the same tensor as outputs[0].
embeddings = mean_pooling(
    token_embeddings=outputs.last_hidden_state,
    mask=inputs['attention_mask'],
)

In [9]:
# Display the pooled sentence embeddings (one row per input sentence).
embeddings

tensor([[ 0.0161,  0.0055,  0.0199,  ...,  0.0372, -0.0831, -0.0112],
        [ 0.0037,  0.0346, -0.0131,  ...,  0.0247, -0.1021, -0.0303],
        [-0.0146, -0.0235, -0.0338,  ...,  0.0277, -0.0025, -0.0092]],
       grad_fn=<DivBackward0>)

In [25]:
# Inspect the token ids; the trailing 0s in the shorter rows are presumably
# the padding id added so every row has the same length.
inputs['input_ids']

tensor([[  101,  2073,  2001,  5032, 12731,  7373,  2141,  1029,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  3814, 15315,  4135,  3527,  9333,  2912,  1010,  2101,  2124,
          2004,  5032, 12731,  7373,  1010,  2001,  2141,  2006,  2281,  1021,
          1010,  7517,  1012,   102,     0,     0,     0,     0,     0,     0],
        [  101,  2141,  1999,  3000,  2006,  2321,  2089,  8165,  1010,  5578,
         12731,  7373,  2001,  1996,  2365,  1997,  8207, 12731,  7373,  1010,
          1037,  3460,  1997,  2413,  3234,  4761,  2013, 24922,  1012,   102]])

Work on generated data

In [13]:
# Load the generated examples from a JSON Lines file (one JSON record per line).
data = pd.read_json(
    path_or_buf="all_data.jl",
    lines=True,
)

In [28]:
# Preview the first rows to see which columns were loaded.
data.head()

Unnamed: 0,gpt-3,gpt-3_paragraphs,end2end,fine_tuned_data
0,[{'text': 'Title: Finite State Machine Design ...,[{'text': 'Title: The Halting Problem P0 Tex...,[{'text': 'Title: Finite State Machine Design ...,[{'text': 'Title: Finite State Machine Design ...


In [36]:
# Print (rather than echo) the text so its newlines render. This is the
# second entry ([1]) of the list stored in row 0 of 'gpt-3_paragraphs'.
print(data['gpt-3_paragraphs'][0][1]['text'])

Title: The Halting Problem P1 
 Text: In order to make these notes more useful as a reference, definitions are
highlighted with boldface, and italicization emphasizes pitfalls or other
important points. 
 Question: 
Q. What is the difference between a pitfall and an important point? 
 Answer: 

A. A pitfall is a potential problem or danger that could occur, while an important point is something that is noteworthy or deserves attention.


In [41]:
# Same inspection for 'fine_tuned_data': second entry ([1]) of the list in
# row 0, printed so its newlines render.
print(data['fine_tuned_data'][0][1]['text'])

Title: Steps in the Design Process 
 Text: {Steps in the Design Process}

Before we begin exploring designs, let's talk briefly about the general
approach that we take when designing an FSM.  We follow a six-step
process:{-8pt}

{{}{}
{}{}{}
{develop an abstract model}{step-abs}
{specify I/O behavior}{step-io}
{complete the specification}{step-complete}
{choose a state representation}{step-repn}
{calculate logic expressions}{step-logic}
{implement with flip-flops and gates}{step-gates}
}
{-8pt}

In Step {step-abs}, we translate our description in human language
into a model with states and desired behavior.  At this stage, we 
simply try to capture the intent of the description and are not
particularly thorough nor exact.

Step {step-io} begins to formalize the model, starting with its
input and output behavior.  If we eventually plan to develop an
implementation of our FSM as a digital system (which is not the 
only choice, of course!), all input and output
must consist of bits.  Ofte

In [44]:
# Same inspection for 'end2end': second entry ([1]) of the list in row 0,
# printed so its newlines render.
print(data['end2end'][0][1]['text'])

Title: Steps in the Design Process 
 Text: {Steps in the Design Process}

Before we begin exploring designs, let's talk briefly about the general
approach that we take when designing an FSM.  We follow a six-step
process:{-8pt}

{{}{}
{}{}{}
{develop an abstract model}{step-abs}
{specify I/O behavior}{step-io}
{complete the specification}{step-complete}
{choose a state representation}{step-repn}
{calculate logic expressions}{step-logic}
{implement with flip-flops and gates}{step-gates}
}
{-8pt}

In Step {step-abs}, we translate our description in human language
into a model with states and desired behavior.  At this stage, we 
simply try to capture the intent of the description and are not
particularly thorough nor exact.

Step {step-io} begins to formalize the model, starting with its
input and output behavior.  If we eventually plan to develop an
implementation of our FSM as a digital system (which is not the 
only choice, of course!), all input and output
must consist of bits.  Ofte

In [45]:
# Echo the raw string (no print) to see its repr, including the literal \n
# escapes and leftover brace markup that would need cleaning.
data['end2end'][0][1]['text']

"Title: Steps in the Design Process \n Text: {Steps in the Design Process}\n\nBefore we begin exploring designs, let's talk briefly about the general\napproach that we take when designing an FSM.  We follow a six-step\nprocess:{-8pt}\n\n{{}{}\n{}{}{}\n{develop an abstract model}{step-abs}\n{specify I/O behavior}{step-io}\n{complete the specification}{step-complete}\n{choose a state representation}{step-repn}\n{calculate logic expressions}{step-logic}\n{implement with flip-flops and gates}{step-gates}\n}\n{-8pt}\n\nIn Step {step-abs}, we translate our description in human language\ninto a model with states and desired behavior.  At this stage, we \nsimply try to capture the intent of the description and are not\nparticularly thorough nor exact.\n\nStep {step-io} begins to formalize the model, starting with its\ninput and output behavior.  If we eventually plan to develop an\nimplementation of our FSM as a digital system (which is not the \nonly choice, of course!), all input and output\

In [None]:
# TODO: clean the data and organize it into a proper DataFrame?

In [None]:
# TODO: how should the results be verified?