# Install and import libraries

!pip install transformers

In [1]:
import nltk
from nltk.corpus import stopwords
import string
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import textwrap
from nltk.stem import PorterStemmer
# Fetch the NLTK data packages used by the preprocessing helpers below
# (the 'punkt' package and the 'stopwords' corpus read by stpword_func).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adnane\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adnane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Prétraitement du texte

In [2]:
def tok_func(data):
    """Split a raw text string into word tokens.

    Args:
        data: Text to tokenize.

    Returns:
        The token list produced by ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(data)

In [3]:
def stpword_func(data):
    """Remove English stop words from a list of tokens.

    The NLTK English stop-word list is lowercase, so every token is
    lowercased for the membership test only. This way capitalised stop words
    such as "The" or "He" are filtered too, while the tokens that are kept
    retain their original casing.

    Args:
        data: Iterable of string tokens.

    Returns:
        A new list containing only the tokens that are not stop words.
    """
    stopwords_en = set(stopwords.words('english'))
    return [w for w in data if w.lower() not in stopwords_en]

In [4]:
def ponc_fonc(data):
    """Remove tokens that consist solely of punctuation characters.

    Membership is checked character by character against a set built from
    ``string.punctuation``. Testing the whole token against the punctuation
    *string* would be a substring test, which keeps multi-character tokens
    such as "..." or "--" and the quote tokens "``" / "''".

    Args:
        data: Iterable of string tokens.

    Returns:
        A new list without the punctuation-only tokens. Tokens that merely
        contain punctuation (e.g. "U.S.", "co-founded") are kept. Empty
        tokens are dropped.
    """
    punct_chars = set(string.punctuation)
    # all() over an empty token is True, so empty strings are discarded too.
    return [w for w in data if not all(ch in punct_chars for ch in w)]

In [5]:
def min_fonc(data):
    """Return a new list with every token converted to lowercase.

    Args:
        data: Iterable of string tokens.

    Returns:
        List of the lowercased tokens, in the same order.
    """
    lowered = []
    for token in data:
        lowered.append(token.lower())
    return lowered

In [6]:
def stem_func(data):
    """Reduce every token to its stem with NLTK's ``PorterStemmer``.

    Args:
        data: Iterable of string tokens.

    Returns:
        List of stemmed tokens, in the same order.
    """
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, data))

In [7]:
def clean_func(data):
    """Run the full preprocessing pipeline on a raw text string.

    Steps, in order: tokenize, lowercase, remove stop words, remove
    punctuation tokens, stem, then re-join the tokens with single spaces.

    Lowercasing happens right after tokenization (on the original-case text)
    and before stop-word removal, so that capitalised stop words such as
    "The" or "After" are also caught by the stop-word filter.

    Args:
        data: Raw text to clean.

    Returns:
        The cleaned text as a single space-separated string.
    """
    tokens = tok_func(data)
    tokens = min_fonc(tokens)
    tokens = stpword_func(tokens)
    tokens = ponc_fonc(tokens)
    tokens = stem_func(tokens)
    return " ".join(tokens)

# Q&A model

In [None]:
# Load the question-answering model from the SQuAD fine-tuned BERT checkpoint.
model = BertForQuestionAnswering.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad'
)

In [None]:
# Load the tokenizer from the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad'
)

In [None]:
def answer_question(question, answer_text):
    """Print the span of ``answer_text`` that the model picks as the answer.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        question: The question string.
        answer_text: The passage in which to look for the answer.

    Returns:
        None. The answer is printed to standard output.
    """
    # Encode the question and the passage together as one id sequence.
    input_ids = tokenizer.encode(question, answer_text)

    # Tokens up to and including the first [SEP] form segment A (ids 0);
    # all remaining tokens form segment B (ids 1).
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # Internal invariant: exactly one segment id per input token.
    assert len(segment_ids) == len(input_ids)

    # Inference only: skip autograd bookkeeping during the forward pass.
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]),
                        token_type_ids=torch.tensor([segment_ids]),
                        return_dict=True)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # Convert to plain ints so they can be used for slicing below.
    answer_start = int(torch.argmax(start_scores))
    # Choose the best end position at or after the start, so the span can
    # never be reversed (which would yield only the lone start token).
    answer_end = answer_start + int(
        torch.argmax(end_scores.flatten()[answer_start:])
    )

    # Map the ids back to their token strings.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Rebuild the answer: '##' continuation pieces are glued to the previous
    # token, every other token is preceded by a space.
    pieces = [tokens[answer_start]]
    for token in tokens[answer_start + 1:answer_end + 1]:
        if token.startswith('##'):
            pieces.append(token[2:])
        else:
            pieces.append(' ' + token)
    answer = ''.join(pieces)

    # Display the answer.
    print('Réponse : "' + answer + '"')


# Tester le modèle

## Answer_text 

In [None]:
# Passage used as the answer context; it is cleaned with clean_func and then
# printed wrapped at 80 columns.
# NOTE(review): the passage is stop-word-filtered and stemmed before being
# given to the model - confirm this preprocessing is intended for a BERT QA input.
wrapper = textwrap.TextWrapper(width=80)
bert_abstract = clean_func(
    "Musk was born in Pretoria, South Africa, and briefly attended at the University of Pretoria before moving to Canada at age 18, acquiring citizenship through his Canadian-born mother. Two years later, he matriculated at Queen's University and transferred to the University of Pennsylvania, where he received bachelor's degrees in economics and physics. He moved to California in 1995 to attend Stanford University. After two days, he dropped out and, with his brother Kimbal, co-founded the online city guide software company Zip2. In 1999, Zip2 was acquired by Compaq for $307 million and Musk co-founded X.com, a direct bank. X.com merged with Confinity in 2000 to form PayPal, which eBay acquired for $1.5 billion in 2002. Musk received an EB-5 investor green card in 1997, which led to his U.S. citizenship in 2002.[8]"
)
print(wrapper.fill(bert_abstract))

## Question_text

In [None]:
# Clean the question with the same pipeline as the passage, then ask the model.
question = clean_func("where musk is born?")
answer_question(question, bert_abstract)