In [13]:
import matplotlib.pyplot as plt
from numba import jit, cuda 
import pandas as pd
from sklearn.manifold import TSNE
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.decomposition import PCA
import umap.umap_ as umap
import Dataloader as dl

In [15]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

# Loads data and returns a list of sentences as specfies in Dataloader
sentences = dl.loaddata()

# Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("KennethEnevoldsen/dfm-sentence-encoder-large")
model = AutoModel.from_pretrained("KennethEnevoldsen/dfm-sentence-encoder-large")

# Tokenize sentences
encoded_input = tokenizer(
    sentences, padding=True, truncation=True, max_length=128, return_tensors="pt"
)

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])

sentence_embeddings = sentence_embeddings.detach().numpy()

# Perform t-SNE
# tsne = TSNE(n_components=2, random_state=0)
# tsne_results = tsne.fit_transform(sentence_embeddings)

# PCA Prep
pca = PCA(n_components=2)
X_pca = pca.fit_transform(sentence_embeddings)

# Perform UMAP Prep
reducer = umap.UMAP()

X_umap = reducer.fit_transform(sentence_embeddings)

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Index sentences along with their embeddings
sentence_index = {}
for sentence, embedding in zip(sentences, sentence_embeddings):
    sentence_index[sentence] = embedding

def retrieve_answer(question, threshold=0.7):
    # Tokenize the question and compute its embedding
    encoded_question = tokenizer(question, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        question_output = model(**encoded_question)
    question_embedding = mean_pooling(question_output, encoded_question["attention_mask"]).numpy()

    # Reshape the question embedding to 2D array
    question_embedding = np.reshape(question_embedding, (1, -1))

    # Calculate cosine similarity between the question embedding and all sentence embeddings
    similarities = {}
    for sentence, embedding in sentence_index.items():
        # Reshape the sentence embedding to 2D array
        embedding = np.reshape(embedding, (1, -1))
        similarity = cosine_similarity(question_embedding, embedding)[0][0]
        similarities[sentence] = similarity

    # Find the most similar sentence
    most_similar_sentence = max(similarities, key=similarities.get)
    similarity_score = similarities[most_similar_sentence]

    # If similarity score is above the threshold, return the answer associated with the sentence
    if similarity_score >= threshold:
        return most_similar_sentence
    else:
        return "I'm sorry, I don't have an answer to that question."

# Example usage:
question = "Men forsikringsoplysningen, Anja, hvad er det, I tilbyder der?"
answer = retrieve_answer(question)
print(answer)

Some weights of the model checkpoint at KennethEnevoldsen/dfm-sentence-encoder-large were not used when initializing BertModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.transform.LayerNorm.bias', 'lm_head.transform.LayerNorm.weight', 'lm_head.transform.dense.bias', 'lm_head.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


('Men forsikringsoplysningen, Anja, hvad er det, I tilbyder der?', ' Vi er en del af Forsikring- og pensionsselskabernes brancheorganisation, og vi er en rådgivningstjeneste, hvor man kan ringe ind og stille spørgsmål til sine forsikringer. F.eks. hvis man har været ude for et uheld, eller hvis man står over for at købe en ny forsikring.  Men det er faktisk også et sted, hvor man kan få hjælp til at sammenligne priser og tægning på forskellige forsikringer. Vi har et værktøj, der hedder forsikringsguiden.dk, og der kan man simpelthen sammenligne en lang række forsikringer. Og man kan også få hjælp til ligesom at afklare, hvad er det for forsikringer, jeg reelt har behov for. Så vi er ikke et forsikringssatskab, men vi er en tjeneste, som ligesom hjælper dig til at træffe nogle valg.')


In [None]:
import Dataloader as dl

sentences = dl.loaddata()

In [None]:
tokenizer = AutoTokenizer.from_pretrained("KennethEnevoldsen/dfm-sentence-encoder-large")
model = AutoModel.from_pretrained("KennethEnevoldsen/dfm-sentence-encoder-large")

In [None]:
# Plot the results
# plt.scatter(tsne_results[:,0], tsne_results[:,1])
# plt.title('t-SNE of Sentence Embeddings')
# plt.xlabel('Dimension 1')
# plt.ylabel('Dimension 2')
# plt.show()

# Plot PCA
plt.figure(figsize=(10, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Plot of Question-Answer Pairs')
plt.grid(True)
plt.show()

# Plot UMAP
plt.subplot(1, 2, 2)
plt.scatter(X_umap[:, 0], X_umap[:, 1])
plt.title('UMAP plot')

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(np.arange(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_, marker='o', linestyle='-')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Scree Plot')
plt.xticks(np.arange(1, len(pca.explained_variance_ratio_) + 1))
plt.grid(True)
plt.show()

In [None]:
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


In [None]:
# #VERY OUT DATED
# def answer_question(question, context):
#     # Tokenize the input
#     inputs = tokenizer.encode_plus(question, context, return_tensors='pt', max_length=512, truncation=True)

#     # Get the model's predictions
#     outputs = model(**inputs)

#     # Get the start and end scores
#     start_scores, end_scores = outputs.start_logits, outputs.end_logits

#     # Get the start and end positions
#     start_idx = torch.argmax(start_scores)
#     end_idx = torch.argmax(end_scores)

#     # Get the answer
#     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx+1]))

#     return answer

# context = 'insurance is a means of protection from financial loss. It is a form of risk management, primarily used to hedge against the risk of a contingent or uncertain loss. An entity which provides insurance is known as an insurer, an insurance company, an insurance carrier or an underwriter. A person or entity who buys insurance is known as an insured or as a policyholder. The insurance transaction involves the insured assuming a guaranteed and known relatively small loss in the form of payment to the insurer in exchange for the insurer is a promise to compensate the insured in the case of a financial loss. The insured receives a contract, called the insurance policy, which details the conditions and circumstances under which the insurer will compensate the insured. The amount of money charged by the insurer to the policyholder for the coverage set forth in the insurance policy is called the premium. If the insured experiences a loss which is potentially covered by the insurance policy, the insured submits a claim to the insurer for processing by a claims adjuster.'
# question = "What is insurance?"
# print(answer_question(question, context))

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Index sentences along with their embeddings
sentence_index = {}
for sentence, embedding in zip(sentences, sentence_embeddings):
    sentence_index[sentence] = embedding

def retrieve_answer(question, threshold=0.7):
    # Tokenize the question and compute its embedding
    encoded_question = tokenizer(question, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        question_output = model(**encoded_question)
    question_embedding = mean_pooling(question_output, encoded_question["attention_mask"]).numpy()

    # Reshape the question embedding to 2D array
    question_embedding = np.reshape(question_embedding, (1, -1))

    # Calculate cosine similarity between the question embedding and all sentence embeddings
    similarities = {}
    for sentence, embedding in sentence_index.items():
        # Reshape the sentence embedding to 2D array
        embedding = np.reshape(embedding, (1, -1))
        similarity = cosine_similarity(question_embedding, embedding)[0][0]
        similarities[sentence] = similarity

    # Find the most similar sentence
    most_similar_sentence = max(similarities, key=similarities.get)
    similarity_score = similarities[most_similar_sentence]

    # If similarity score is above the threshold, return the answer associated with the sentence
    if similarity_score >= threshold:
        return most_similar_sentence
    else:
        return "I'm sorry, I don't have an answer to that question."

# Example usage:
question = "I tilbyder der, hvad?"
answer = retrieve_answer(question)
print(answer)

In [None]:
import openai

openai.api_key = 'your-api-key'

def ask_gpt3(question, context):
    response = openai.Completion.create(
      engine="text-davinci-002",
      prompt=f"{context}\n{question}",
      temperature=0.5,
      max_tokens=100
    )
    return response.choices[0].text.strip()

context = "The sky is blue during the day because of the way Earth's atmosphere scatters sunlight in all directions. More blue light is scattered than other colors because it travels in smaller, shorter waves. This is known as Rayleigh scattering."

question = "Why is the sky blue?"

print(ask_gpt3(question, context))

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

def ask_gpt2(question):
    inputs = tokenizer.encode_plus(question, return_tensors="pt")
    outputs = model.generate(
        input_ids=inputs["input_ids"], 
        attention_mask=inputs["attention_mask"],
        max_length=100, 
        num_return_sequences=1, 
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
    answer = tokenizer.decode(outputs[0])
    return answer

question = "who is isaac newton?"

print(ask_gpt2(question))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


who is isaac newton?

I'm not sure if you can say that.

I'm not sure if you can say that.

I'm not sure if you can say that.

I'm not sure if you can say that.

I'm not sure if you can say that.

I'm not sure if you can say that.

I'm not sure if you can say that.

I'm not sure if you


In [None]:
# from transformers import BertTokenizer, EncoderDecoderModel
# import torch

# # Initialize the tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Initialize the model
# model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')

# # Define the training data
# sentences = [(question, answers) for question, answers in sentences]

# # Separate questions and answers
# questions, answers = zip(*sentences)

# # Tokenize the training data
# inputs = tokenizer(list(questions), return_tensors="pt", padding=True, truncation=True)
# outputs = tokenizer(list(answers), return_tensors="pt", padding=True, truncation=True)

# # Define the model's parameters
# model.config.decoder_start_token_id = tokenizer.cls_token_id
# model.config.eos_token_id = tokenizer.sep_token_id
# model.config.pad_token_id = tokenizer.pad_token_id
# model.config.vocab_size = model.config.encoder.vocab_size

# # Train the model
# model.train()
# outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=outputs["input_ids"], labels=outputs["input_ids"])

# # Define the optimizer
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# # Perform backpropagation
# loss = outputs.loss
# loss.backward()

# # Update the weights
# optimizer.step()