In [None]:
import json
import numpy as np 
import pandas as pd
import re
import random
import os
# Print the full path of every file found under the competition input folder.
for root, _subdirs, names in os.walk('/kaggle/input/tensorflow2-question-answering/'):
    file_paths = [os.path.join(root, name) for name in names]
    for file_path in file_paths:
        print(file_path)
        
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm_notebook as tqdm
from Levenshtein import ratio as levenshtein_distance

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

from scipy import spatial

In [None]:
# Words and tags that will be removed from the document text and the question

In [None]:
# Words that `predict` looks for at the start of a question to answer 'YES'.
question_words = [
    'is', 'are', 'do', 'does', 'did',
    'was', 'were', 'will', 'can',
]

# Everything `clean` strips out: the question openers above, a few common
# function / wh- words, and markup tags.
tags = question_words + [
    # articles, prepositions and conjunctions
    'the', 'a', 'of', 'in', 'and', 'on',
    # wh- words
    'what', 'where', 'when', 'which',
    # markup tags
    '</P>',
    '<Table>', '</Table>',
    '<Tr>', '</Tr>',
    '<Ul>', '<Ol>', '<Dl>',
    '</Ul>', '</Ol>', '</Dl>',
    '<Li>', '<Dd>', '<Dt>',
    '</Li>', '</Dd>', '</Dt>',
]

In [None]:
# Text-cleaning function

In [None]:
def clean(x, remove=None):
    """Lower-case ``x`` and drop every unwanted token from it.

    The text is split on single spaces and each token is compared, case-
    insensitively, against the removal list. Only whole tokens are dropped,
    so removing ``'a'`` or ``'is'`` no longer eats letters out of other words
    (the previous substring replacement turned "this" into "th"), and
    mixed-case entries such as ``'</P>'`` now match the lower-cased text.

    Args:
        x: Text to clean; tokens are expected to be separated by spaces.
        remove: Optional iterable of tokens to drop. Defaults to the
            module-level ``tags`` list.

    Returns:
        The remaining lower-cased tokens joined by single spaces. Runs of
        spaces are collapsed and leading/trailing spaces are dropped.
    """
    if remove is None:
        remove = tags
    # Compare in lower case because the text itself is lower-cased below.
    unwanted = {token.lower() for token in remove}
    # `if token` discards the empty strings produced by consecutive spaces.
    kept = [token for token in x.lower().split(' ') if token and token not in unwanted]
    return ' '.join(kept)

In [None]:
# Prediction function

In [None]:
def predict(json_data, min_score=0.15, paragraph_bonus=0.25):
    """Choose a long answer by TF-IDF similarity and guess a yes/no short answer.

    A TF-IDF model (1- to 3-grams, English stop words plus "book") is fitted
    on the document itself. Every top-level long-answer candidate is scored by
    the cosine similarity between its text and the question. Candidates whose
    first token is ``'<P>'`` get a decaying bonus: ``paragraph_bonus ** k``
    for the k-th such candidate.

    Args:
        json_data: Mapping with the keys ``'document_text'``,
            ``'question_text'`` and ``'long_answer_candidates'``; each
            candidate is a mapping with ``'top_level'``, ``'start_token'``
            and ``'end_token'``.
        min_score: Best score below which no long answer is returned.
        paragraph_bonus: Base of the decaying bonus for ``'<P>'`` candidates.

    Returns:
        A ``(long, short)`` tuple. ``long`` is a one-element list holding
        ``'start:end'`` for the best candidate, or ``'-1:-1'`` when there is
        no top-level candidate or the best score is below ``min_score``.
        ``short`` is ``'YES'`` when the first question word is in
        ``question_words`` and ``''`` otherwise.
    """
    # Only top-level candidates compete for the long answer.
    candidates = [c for c in json_data['long_answer_candidates'] if c['top_level']]
    doc_tokenized = json_data['document_text'].split(' ')
    question = json_data['question_text'].split(' ')

    # Questions opening with an auxiliary verb are answered 'YES'.
    short = 'YES' if question[0] in question_words else ''

    # NOTE(review): '-1:-1' is used as the "no long answer" marker; confirm
    # that this is what the consumer of the predictions expects.
    no_answer = ['-1:-1']

    # Without candidates there is nothing to rank (np.max/argsort would fail).
    if not candidates:
        return no_answer, short

    # Fit TF-IDF on the document. `stop_words` must be a list: recent
    # scikit-learn releases reject a frozenset.
    stop_words = sorted(text.ENGLISH_STOP_WORDS.union(["book"]))
    tfidf = TfidfVectorizer(ngram_range=(1, 3), stop_words=stop_words, norm='l2')
    tfidf.fit([json_data['document_text']])

    # Vectorise the question and all candidate spans in one batch.
    q_vec = tfidf.transform([json_data['question_text']])
    candidate_texts = [
        ' '.join(doc_tokenized[c['start_token']:c['end_token']]) for c in candidates
    ]
    candidate_vecs = tfidf.transform(candidate_texts)

    # Rows are L2-normalised, so the dot product is the cosine similarity.
    # An all-zero vector (no in-vocabulary terms) yields 0 instead of the NaN
    # that a division by a zero norm would produce.
    similarities = np.asarray(candidate_vecs.dot(q_vec.T).toarray()).ravel()

    scores = []
    paragraphs_seen = 0
    for candidate, similarity in zip(candidates, similarities):
        score = float(similarity)
        if doc_tokenized[candidate['start_token']] == '<P>':
            # Earlier paragraphs get a larger bonus: 0.25, 0.0625, ...
            paragraphs_seen += 1
            score += paragraph_bonus ** paragraphs_seen
        scores.append(score)

    # Select the highest-scoring candidate.
    best = int(np.argsort(scores)[-1])
    if scores[best] < min_score:
        return no_answer, short

    winner = candidates[best]
    long = [str(winner['start_token']) + ':' + str(winner['end_token'])]
    return long, short

In [None]:
# Run `predict` on every test example and write the submission file.
# Each example contributes one '<example_id>_long' row per long answer and
# one '<example_id>_short' row.
ids = []
preds = []

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default encoding.
with open('/kaggle/input/tensorflow2-question-answering/simplified-nq-test.jsonl', 'r', encoding='utf-8') as json_file:
    for line in tqdm(json_file):
        if not line.strip():
            # Skip blank lines instead of crashing in json.loads.
            continue
        json_data = json.loads(line)
        l_ans, s_ans = predict(json_data)
        example_id = str(json_data['example_id'])
        ids.extend([example_id + '_long'] * len(l_ans))
        preds.extend(l_ans)
        ids.append(example_id + '_short')
        preds.append(s_ans)

df = pd.DataFrame({'example_id': ids, 'PredictionString': preds})

# Merge multiple predictions for the same id into one space-separated string.
# ' '.join on a single value returns that value, so one-row groups need no
# special case.
sub = (
    df.groupby('example_id')['PredictionString']
    .agg(lambda parts: ' '.join(parts))
    .reset_index()
)
sub.to_csv('submission.csv', index=False)

sub.head(10)