In [171]:
from typing import Optional

import spacy

# Modelle installieren und Text verarbeiten
#python3 -m spacy download en_core_web_sm
#python3 -m spacy download de_core_news_sm

# Ähnlichkeitsmetrik anwenden (similarity metric)

In [172]:
# Requires the large English model:
#   python3 -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")
doc = nlp("Mango and ananas are similar. Pizza and dogs aren't.")

# Pick the four words of interest out of the parsed text by token position.
mango, ananas, pizza, dogs = doc[0], doc[2], doc[6], doc[8]

# Compare the two word pairs named in the example sentences.
print("mango vs. ananas", mango.similarity(ananas))
print("pizza vs. dogs", pizza.similarity(dogs))

# Show whether each token has a vector, then dump one raw vector.
print(*(token.has_vector for token in (mango, ananas, pizza, dogs)))
print(mango.vector)

mango vs. ananas 0.524490237236023
pizza vs. dogs 0.0301766786724329
True True True True
[ 1.6968e-01 -1.1406e+00  4.2686e-01 -1.1662e-01  3.1422e+00 -1.7256e+00
  8.5684e-01 -1.8575e+00 -3.0863e+00  9.7959e-01 -1.8544e+00 -7.3673e-02
  1.7760e+00  1.4921e+00 -1.4639e+00 -4.2563e+00  3.9893e-01 -1.4180e+00
  2.1238e+00 -2.6686e+00 -6.1075e-01  3.6439e+00  8.2714e-01  1.4939e-01
  5.1963e-01  5.0651e-02 -1.0765e+00 -1.1264e+00 -1.1290e+00 -4.1344e+00
  1.0955e+00 -8.9723e-01 -1.2512e+00  2.8614e-01  1.9688e+00 -9.2044e-01
 -1.5579e+00 -2.3330e+00 -2.5618e+00 -3.9875e+00 -4.6655e-01  1.8044e-01
 -9.9386e-01  2.3489e+00  2.4804e+00  4.1089e-01  5.7436e-01  2.2858e+00
  1.3014e+00 -1.5357e+00  2.9267e-01 -1.4692e+00 -1.4985e+00 -5.0963e-01
  3.2040e+00 -5.7984e-01 -1.5260e+00 -1.1930e+00  1.5599e+00 -1.3757e+00
  2.0683e-01  1.4810e+00  1.5973e+00 -7.8030e-01 -8.6978e-01  2.1282e-01
 -9.3034e-01 -3.2458e+00  5.2884e+00 -7.2662e-01 -4.1247e+00  3.5894e-01
 -1.3810e+00  3.0313e-01  6.7794e-0

# Fortgeschrittenes NLP: Sätze parsen

In [173]:
from spacy import displacy

# Parse a short example sentence and render it with displacy's "dep" style.
# Note: this rebinds the module-level name `doc` used by the previous cell.
doc = nlp("This is a sentence.")
displacy.render(doc, style="dep")

In [174]:
import pandas as pd

# Load the question-answering train and test sets from CSV.
# Later cells read the 'Question' and 'Answer' columns of these frames.
train_data = pd.read_csv('data/NLP_Question_Answering/QA_train_data.csv')
test_data = pd.read_csv('data/NLP_Question_Answering/QA_test_data.csv')

# Vorhersagelogik

In [175]:
def predict_answer(question: str, min_similarity: float = 0.0) -> Optional[str]:
    """
    Predicts an answer to a given question.

    The question is compared with every question in ``train_data`` using
    spaCy's ``Doc.similarity``; the answer belonging to the most similar
    training question is returned.

    :param question: question string
    :param min_similarity: similarity a training question must exceed to be
        accepted as a match (default 0.0)
    :return: answer string, or None if no training question has a
        similarity greater than ``min_similarity``
    """
    # Parse the incoming question once: it is the same for every comparison.
    question_doc = nlp(question)

    best_match = None
    best_similarity = min_similarity

    # Walk through all training questions and keep the best match so far.
    for known_question, known_answer in zip(train_data['Question'], train_data['Answer']):
        similarity = question_doc.similarity(nlp(known_question))

        # Strict comparison: on ties the earliest training row wins.
        if similarity > best_similarity:
            best_similarity = similarity
            best_match = known_answer

    return best_match


# Vorhersagen

In [176]:
# Predict: run the (slow) predictor exactly once per test question and reuse
# the resulting Series both for evaluation and for the display column.
predicted = test_data['Question'].apply(predict_answer)
actual = test_data['Answer']

test_data['Predicted'] = predicted
test_data[['Question', 'Answer', 'Predicted']].head(100)

Unnamed: 0,Question,Answer,Predicted
0,From which country is the film A Beautiful Mind?,America,America
1,Where was the film A Beautiful Mind made?,America,2001
2,In which country was the film A Beautiful Mind...,America,2001
3,Where was the movie A Beautiful Mind made?,America,2001
4,From which country is the film?,America,"Russell Crowe, along with Ed Harris, Jennifer ..."
5,From which country is this film?,America,"Russell Crowe, along with Ed Harris, Jennifer ..."
6,When was the film A beautiful Mind released?,2001,2001
7,Of which genre is the film A Beautiful Mind?,biographical drama,biographical drama
8,Which kind of film is A Beautiful Mind?,biographical drama,biographical drama
9,Which type of movie is A Beautiful Mind?,biographical drama,biographical drama


# Evaluate

In [177]:
def evaluate_result(predicted: str, actual: str) -> int:
    """
    Scores a single predicted answer against the expected answer.

    :param predicted: answer produced by the predictor, or None if it gave no answer
    :param actual: correct answer according to the data set
    :return: 0 if no answer is predicted (None),
             +1 if the predicted answer is correct,
             -1 if the answer is wrong
    """
    # A missing prediction is neutral: neither rewarded nor penalised.
    if predicted is None:
        return 0
    return 1 if predicted == actual else -1


def evaluate_results(predicted: pd.Series, actual: pd.Series) -> float:
    """
    Evaluates a series of answer pairs.

    Each prediction is scored with ``evaluate_result`` and the scores are
    averaged over the number of predictions.

    :param predicted: Series of predicted answers
    :param actual: Series of correct answers, looked up with the index
        labels of ``predicted``
    :return: Float value between -1 (worst) and +1 (best); 0.0 if
        ``predicted`` is empty
    """
    # Nothing to score; also avoids dividing by zero below.
    if predicted.size == 0:
        return 0.0

    total = 0
    for index, value in predicted.items():
        total += evaluate_result(value, actual[index])

    # Average once, after all individual scores have been accumulated.
    return total / predicted.size

# Ergebnisse evaluieren

In [178]:
# Score the predictions against the expected answers; the result lies
# between -1 (worst) and +1 (best) according to evaluate_results' docstring.
score = evaluate_results(predicted, actual)
print(f"Score: {score}")

Score: 0.22727272727272727
