In [55]:
import csv
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import re
from collections import defaultdict


In [20]:
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [32]:
lemmatizer = WordNetLemmatizer()
with open("stopwords.txt", "r") as f:
    stopwords = set(map(lambda w:w.strip(), f.readlines()))

In [None]:
sentences = nltk.sent_tokenize(textsample)  
words = nltk.word_tokenize(textsample)  

In [51]:
pos_tag_map = {"J": "a", "V": "v", "R":"r"}
reg_exp = re.compile("[^a-z0-9]+")

def parser(passage):
    passage = passage.lower()
    sentences = sent_tokenize(passage)
    result = []
    for s in sentences:
        text = reg_exp.sub(" ", s)
        words = [word for word in nltk.word_tokenize(text)]
        words = [lemmatizer.lemmatize(words[i], pos=pos_tag_map.get(nltk.pos_tag([words[i]])[0][1][0], "n")) for i in range(len(words))]
        words = [word for word in words if word not in stopwords]
        result += words
    return result


In [101]:
passage = dict()
with open("collection.tsv", encoding='utf-8', mode="r") as f:
    r = csv.reader(f, delimiter="\t")
    for i, row in enumerate(r):
        pid, passage = row[0], row[1]
        passage[pid] = passage #parser(passage)

In [87]:
def read_query(path):
    query_dict = dict()
    with open(path, encoding='utf-8', mode="r") as f:
        r = csv.reader(f, delimiter="\t")
        for i, row in enumerate(r):
            qid, query = row[0], row[1]
            query_dict[qid] = query #parser(query)
    return query_dict

train_query = read_query("queries.train.tsv")
print(len(train_query))
dev_query = read_query("queries.dev.tsv")
print(len(dev_query))
eval_query = read_query("queries.eval.tsv")
print(len(eval_query))

808731
101093
101092


In [74]:
def read_qrels(path):
    query_to_passage_dict = defaultdict(set)
    passage_to_query_dict = defaultdict(set)
    with open(path, encoding='utf-8', mode="r") as f:
        r = csv.reader(f, delimiter="\t")
        for row in r:
            qid, pid = row[0], row[2]
            query_to_passage_dict[qid].add(pid)
            passage_to_query_dict[pid].add(qid)
    return query_to_passage_dict, passage_to_query_dict
train_qrels_qtp, train_qrels_ptq = read_qrels("qrels.train.tsv")
print(len(train_qrels_qtp), len(train_qrels_ptq))
dev_qrels_qtp, dev_qrels_ptq = read_qrels("qrels.dev.tsv")
print(len(dev_qrels_qtp), len(dev_qrels_ptq))

502939 516472
55578 59096


In [86]:
for pid in train_qrels_ptq:
    if len(train_qrels_ptq[pid]) > 2:
        print(pid, train_qrels_ptq[pid])
        break

14520 {'605763', '982399', '605764'}


In [113]:
print(train_query["605763"])
print(train_query["982399"])
print(train_query["605764"])

what county is exeter new hampshire im
where is exeter nh
what county is exeter nh


In [121]:
# query term recall
def passage_qtr(pid):
    qd = train_qrels_ptq[pid]
    passage_terms = set(parser(passage[pid]))
    print(passage_terms)
    qtr = defaultdict(int)
    for qid in qd:
        query_terms = set(parser(train_query[qid]))
        print(query_terms)
        for term in passage_terms:
            if term in query_terms:
                qtr[term] += 1/len(qd)
    return qtr

passage_qtr("14520")

{'1997', 'population', '306', 'hampshire', 'new', 'county', '14', 'rockingham', 'office', 'seat', 'move', 's', 'neighbor', 'town', 'census', 'state', 'exeter', '2010', 'brentwood', 'united'}
{'new', 'hampshire', 'county', 'im', 'exeter'}
{'exeter', 'nh'}
{'exeter', 'county', 'nh'}


defaultdict(int,
            {'county': 0.6666666666666666,
             'exeter': 1.0,
             'hampshire': 0.3333333333333333,
             'new': 0.3333333333333333})

In [122]:
def query_tr(qid):
    dq = train_qrels_qtp[qid]
    query_terms = set(parser(train_query[qid]))
    print(query_terms)
    tr = defaultdict(int)
    for pid in dq:
        passage_terms = set(parser(passage[pid]))
        print(passage_terms)
        for term in query_terms:
            if term in passage_terms:
                tr[term] += 1/len(dq)
    return tr
query_tr("982399")

{'exeter', 'nh'}
{'1997', 'population', '306', 'hampshire', 'new', 'county', '14', 'rockingham', 'office', 'seat', 'move', 's', 'neighbor', 'town', 'census', 'state', 'exeter', '2010', 'brentwood', 'united'}


defaultdict(int, {'exeter': 1.0})

In [129]:
for qid in train_qrels_qtp:
    if len(train_qrels_qtp[qid]) > 2:
        print(qid, train_qrels_qtp[qid])
        break

88585 {'17276', '17272', '17277'}


In [137]:
print(train_query["17276"])
print(passage["17276"])
print(passage["17272"])
print(passage["17277"])

amount of protein allowed daily for low protein diet
Swollen ankles and feet. Swollen ankles and swollen feet are common and are often caused by fluid retention, or oedema. The cause of the swelling can range from an injury to medical conditions. Seek medical advice if you are concerned about swollen feet or ankles.
Painless swelling of the feet and ankles is a common problem, especially among older people. Abnormal buildup of fluid in the ankles, feet, and legs can cause swelling. This fluid buildup and swelling is called edema.
Injury or surgery involving the leg, ankle, or foot can also cause swelling. Swelling may also occur after pelvic surgery, especially for cancer. Long airplane flights or car rides, as well as standing for long periods of time, often lead to some swelling in the feet and ankles.
{'1068701', '587682', '505939', '88585', '587837'}


In [131]:
query_tr("88585")

{'ankle', 'cause', 'foot', 'swollen'}
{'fluid', 'oedema', 'medical', 'condition', 'concerned', 'advice', 'injury', 'common', 'foot', 'cause', 'seek', 'ankle', 'range', 'retention', 'swollen', 'swell'}
{'fluid', 'leg', 'buildup', 'painless', 'especially', 'edema', 'common', 'foot', 'abnormal', 'cause', 'problem', 'old', 'ankle', 'people', 'call', 'swell'}
{'involve', 'swell', 'flight', 'leg', 'especially', 'cause', 'foot', 'occur', 'ride', 'ankle', 'pelvic', 'stand', 'injury', 'period', 'lead', 'cancer', 'airplane', 'car', 'long', 'time', 'surgery'}


defaultdict(int,
            {'ankle': 1.0,
             'cause': 1.0,
             'foot': 1.0,
             'swollen': 0.3333333333333333})