# Assignment 2B: Feature computation

The purpose of this notebook is to compute the features that are used in the learning step.

Note that some features might be expensive to compute, so you should avoid re-computing them. Instead, aim to write a set of relatively simple feature extractors, each computing one or more features, and save their output to separate files. Then, load the pre-computed features from these files in the learning step (in the [ranking notebook](2_Ranking.ipynb)).

## Feature extractors

Example feature extractors.

## Feature computation

Computes features for document-query pairs and saves them to a file.

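Specifically, we will save features to a file using a nested map structure, with queries on the first level, documents on the second level, and individual features on the third level. (The assignment suggests JSON; the cells at the end of this notebook serialize the same structure with `pickle` to `data/train_features.p` and `data/test_features.p`.)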

```python
  features = {
      'query_i': {
          'doc_j': {
              'feature_1': 0,  # value of feature_1 for (query_i, doc_j) pair
              'feature_2': 0,  # value of feature_2 for (query_i, doc_j) pair
              ...
          }
          ...
      }
      ...
  }
```

**Note**: The set of documents for a query (for which you want to compute features) should be a combination of the documents for which you have relevance labels and the top-100 documents retrieved in first-pass retrieval.
You can then decide in the learning part if/how you want to deal with class imbalance.

In [1]:
import urllib
import requests
import json
import math
from pprint import pprint
import pickle

import pandas as pd

from IPython.display import clear_output # Using IPython.display.clear_output to clear the output of a cell.

# Base URL of the search API; the helper functions below build every request URL from it.
API = "http://gustav1.ux.uis.no:5002"

# Index names passed to the API helpers. get_features uses ANCHORS_INDEX for the
# "anchors" field and MAIN_INDEX for every other field.
MAIN_INDEX = "clueweb12b"
ANCHORS_INDEX = "clueweb12b_anchors"

In [2]:
def load_queries(query_file):
    """Load queries from a text file holding one "<query id> <query text>" pair per line.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of aborting the load.

    Args:
        query_file: Path of the query file to read.

    Returns:
        Dict mapping each query ID (str) to its query text (str).

    Raises:
        ValueError: If a non-blank line contains an ID but no query text.
    """
    queries = {}
    with open(query_file, "r") as fin:
        # Iterate the file lazily rather than materialising all lines first.
        for line in fin:
            line = line.strip()
            if not line:
                continue
            # Split on the first space only: the query text itself contains spaces.
            qid, query = line.split(" ", 1)
            queries[qid] = query
    return queries

# Functions for the API.
def tokenize_query(indexname, query):
    """Tokenize a query using the ``_analyze`` endpoint of the given index.

    Args:
        indexname: Name of the index whose analyzer should be used.
        query: Raw query text.

    Returns:
        List of token strings reported by the API. If the response cannot be
        decoded as JSON or does not have the expected ``tokens`` structure, the
        raw response is printed and the query split on whitespace is returned
        instead.
    """
    url = "/".join([API, indexname, "_analyze"]) + "?" \
          + urllib.parse.urlencode({"text": query})
    response = requests.get(url).text
    try:
        r = json.loads(response)
        return [t["token"] for t in r["tokens"]]
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not valid JSON; KeyError/TypeError: JSON without
        # the expected {"tokens": [{"token": ...}]} shape. Anything else
        # (e.g. KeyboardInterrupt) is deliberately not swallowed.
        print("Error in analyze query: \n", response)
        return query.split()

def search(indexname, query, field, size=10):
    """Run a query against one field of an index via the ``_search`` endpoint.

    Args:
        indexname: Name of the index to search.
        query: Query string, sent as the ``q`` parameter.
        field: Field to search in, sent as the ``df`` parameter.
        size: Number of hits requested, sent as the ``size`` parameter.

    Returns:
        The JSON-decoded response body.
    """
    endpoint = "/".join((API, indexname, "_search"))
    query_string = urllib.parse.urlencode({"q": query, "df": field, "size": size})
    body = requests.get(endpoint + "?" + query_string).text
    return json.loads(body)

def exists(indexname, doc_id):
    """Ask the ``_exists`` endpoint whether a document is present in an index.

    Args:
        indexname: Name of the index to check.
        doc_id: ID of the document.

    Returns:
        The value of the ``exists`` entry of the JSON-decoded response.
    """
    endpoint = "/".join((API, indexname, doc_id, "_exists"))
    payload = json.loads(requests.get(endpoint).text)
    return payload['exists']

def analyze_query(indexname, query):
    """Return the tokens the ``_analyze`` endpoint of an index produces for a query.

    Unlike tokenize_query, this variant has no fallback: an undecodable or
    unexpected response raises.

    Args:
        indexname: Name of the index whose analyzer should be used.
        query: Raw query text, sent as the ``text`` parameter.

    Returns:
        List with the ``token`` value of every entry in the response's ``tokens``.
    """
    query_string = urllib.parse.urlencode({"text": query})
    endpoint = "/".join((API, indexname, "_analyze"))
    analysis = json.loads(requests.get(endpoint + "?" + query_string).text)
    tokens = []
    for entry in analysis["tokens"]:
        tokens.append(entry["token"])
    return tokens

def term_vectors(indexname, doc_id, term_statistics=False):
    """Fetch the term vectors of a document via the ``_termvectors`` endpoint.

    Args:
        indexname: Name of the index holding the document.
        doc_id: ID of the document.
        term_statistics: Sent to the API as ``term_statistics=true|false``.

    Returns:
        The JSON-decoded response, or an empty dict if the response body is
        not valid JSON (the raw body is printed in that case).
    """
    ret = {}
    url = "/".join([API, indexname, doc_id, "_termvectors"]) + "?" \
          + urllib.parse.urlencode({"term_statistics": str(term_statistics).lower()})
    response = requests.get(url).text
    try:
        ret = json.loads(response)
    except ValueError:
        # json.JSONDecodeError is a ValueError; catching only that keeps
        # KeyboardInterrupt and unrelated errors visible.
        print("Failed to json-decode this response:\n{}".format(response))

    return ret

In [3]:
def feature_bm25(tokens, doc, field, index, k1=1.2, b=0.75):
    """Feature: BM25 retrieval score on a given field.

    For every query token that occurs in the document field, adds

        idf * f_td * (k1 + 1) / (f_td + k1 * (1 - b + b * doc_len / avgdl))

    where ``idf = log(doc_count / doc_freq)``, ``f_td`` is the term frequency
    in the field, ``doc_len`` is the sum of all term frequencies in the field
    and ``avgdl = sum_ttf / doc_count``. All statistics come from the term
    vector returned by ``term_vectors(..., term_statistics=True)``.

    Args:
        tokens: Analyzed query tokens; a token that appears several times in
            the query contributes once per occurrence.
        doc: Document ID.
        field: Field to score.
        index: Index holding the field.
        k1: Term-frequency saturation parameter.
        b: Length-normalisation parameter.

    Returns:
        The BM25 score, or 0 when no term vector is available for the field
        or none of the tokens occur in it.
    """
    try:
        term_vector = term_vectors(index, doc, term_statistics=True)['term_vectors'][field]
    except Exception:
        # Best effort, as before: a document without a term vector for this
        # field scores 0. `Exception` (not a bare except) so that interrupting
        # a long feature run still works.
        return 0

    field_stats = term_vector['field_statistics']
    terms = term_vector['terms']
    doc_count = field_stats['doc_count']
    avgdl = field_stats['sum_ttf'] / doc_count
    doc_len = sum(stats['term_freq'] for stats in terms.values())
    # Loop-invariant length normalisation component.
    length_norm = 1 - b + b * (doc_len / avgdl)

    score = 0
    for term in tokens:
        stats = terms.get(term)
        if stats is None:
            continue
        idf = math.log(doc_count / stats['doc_freq'])
        f_td = stats['term_freq']
        # The denominator must ADD f_td to the k1-scaled normalisation;
        # multiplying by f_td cancels the term frequency out of the score.
        score += idf * (f_td * (k1 + 1)) / (f_td + k1 * length_norm)

    return score
    

def feature_lm(tokens, doc, field, index, lmbda=0.8):
    """Feature: LM retrieval score on a given field.

    For every query token occurrence that is found in the document field, adds

        log((1 - lmbda) * P(t|d) + lmbda * P(t|C) + 1)

    with ``P(t|d) = term_freq / doc_len`` and ``P(t|C) = ttf / sum_ttf``, both
    taken from the term vector returned by
    ``term_vectors(..., term_statistics=True)``. The ``+ 1`` inside the
    logarithm is kept from the original formulation; it makes every term
    contribution non-negative. Tokens absent from the field contribute 0.

    Args:
        tokens: Analyzed query tokens. A token repeated n times in the query
            contributes n times (once per occurrence).
        doc: Document ID.
        field: Field to score.
        index: Index holding the field.
        lmbda: Weight of the collection model in the mixture.

    Returns:
        The score, or 0 when no term vector is available for the field or
        none of the tokens occur in it.
    """
    try:
        term_vector = term_vectors(index, doc, term_statistics=True)['term_vectors'][field]
    except Exception:
        # Best effort, as before: no term vector for this field -> score 0.
        # `Exception` (not a bare except) keeps KeyboardInterrupt working.
        return 0

    terms = term_vector['terms']
    doc_len = sum(stats['term_freq'] for stats in terms.values())

    score = 0
    # Iterating over every occurrence already weights a term by its query
    # frequency; multiplying by tokens.count(term) as well would square it.
    for term in tokens:
        stats = terms.get(term)
        if stats is None:
            continue
        p_td = stats['term_freq'] / doc_len
        p_tC = stats['ttf'] / term_vector['field_statistics']['sum_ttf']
        score += math.log(((1 - lmbda) * p_td) + (lmbda * p_tC) + 1)
    return score

In [None]:
# Load the PageRank file as a space-separated table with columns `doc_id` and `rank`
# (column names are supplied here rather than read from the file).
# get_features looks up each document's `rank` by `doc_id` in this frame.
pagerank = pd.read_csv("data/pagerank.docNameOrder", sep = " ", names = ['doc_id', 'rank'])

In [None]:
# Building the feature dictionary
def get_features(queries):
    """Compute the feature dictionary for every query and its candidate documents.

    For each query the candidate documents are the union of the top-100 hits
    of a first-pass search on each field ("content" and "title" on MAIN_INDEX,
    "anchors" on ANCHORS_INDEX), restricted to documents that exist in both
    indices. For every (query, document) pair the following features are stored:

    * ``q_len``, ``q_token_len``: whitespace-split and analyzed query length;
    * ``bm25_<field>``, ``lm_<field>``: retrieval scores per field;
    * ``doc_pagerank``: the document's ``rank`` in the global ``pagerank``
      frame (0 if the document is not listed there);
    * ``doc_main_indx_length``: ``_source.length`` of the document in MAIN_INDEX;
    * ``doc_anchors_indx_length``: whitespace token count of the first
      ``_source`` value of the document in ANCHORS_INDEX.

    NOTE(review): documents that only have relevance labels (but are not in
    the first-pass results) are not added to the candidate set here.

    Args:
        queries: Dict mapping query ID to query text.

    Returns:
        Nested dict ``features[query_id][doc_id][feature_name] -> value``.
    """
    fields = ["content", "title", "anchors"]
    # The anchors field lives in its own index; every other field is in the main one.
    index_of = {field: ANCHORS_INDEX if field == 'anchors' else MAIN_INDEX
                for field in fields}
    features = {}

    for q, query in queries.items():
        features[q] = {}

        print("Working with {} - {}".format(q, query))

        tokens = tokenize_query(MAIN_INDEX, query)
        print("Query to API: ", ' '.join(tokens))

        # Build the candidate list from the first-pass results of all fields.
        print("Making list of docs", end = " - ")

        candidates = set()
        for field in fields:
            try:
                search_res = search(index_of[field], ' '.join(tokens), field, size=100)['hits']['hits']
            except Exception:
                # Best effort: a failed search on one field must not lose the
                # other fields. Not a bare except, so Ctrl-C / kernel
                # interrupt still stops the run.
                continue

            for doc in search_res:
                if exists(MAIN_INDEX, doc['_id']) and exists(ANCHORS_INDEX, doc['_id']):
                    candidates.add(doc['_id'])

            print(field, 'done', end = " - ")

        # Sorted so that repeated runs visit documents in the same order.
        docs = sorted(candidates)
        print("\nNo. of documents:", len(docs))

        # Scan the PageRank table once per query instead of once per document,
        # keeping the first row of each doc_id.
        pagerank_rows = pagerank[pagerank['doc_id'].isin(docs)]
        pagerank_of = (pagerank_rows.drop_duplicates('doc_id')
                                    .set_index('doc_id')['rank']
                                    .to_dict())

        percent_done = 0
        for i, d in enumerate(docs, start=1):
            doc_features = features[q][d] = {}

            # Query features
            doc_features['q_len'] = len(query.split())
            doc_features['q_token_len'] = len(tokens)

            # Query-document features
            for field in fields:
                doc_features['bm25_' + field] = feature_bm25(tokens, d, field, index_of[field])
                doc_features['lm_' + field] = feature_lm(tokens, d, field, index_of[field])

            # Document features
            # PageRank: a document missing from the table gets 0, like the
            # other features do when their source data is unavailable.
            doc_features['doc_pagerank'] = pagerank_of.get(d, 0)

            response = requests.get("/".join([API, MAIN_INDEX, d, "_get"])).text
            try:
                doc_features['doc_main_indx_length'] = json.loads(response)['_source']['length']
            except Exception:
                doc_features['doc_main_indx_length'] = 0

            response = requests.get("/".join([API, ANCHORS_INDEX, d, "_get"])).text
            try:
                anchor_source = json.loads(response)['_source']
                doc_features['doc_anchors_indx_length'] = len(list(anchor_source.values())[0].split())
            except Exception:
                doc_features['doc_anchors_indx_length'] = 0

            # Show progress; each '-' represents 1%.
            if (i / len(docs)) * 100 >= (percent_done + 1):
                percent_done = percent_done + 1
                print("-", end="")

        print("\n")
    clear_output()
    print("Features Collected.")
    return features

In [6]:
# Compute the features for the training queries and pickle them for the learning step.
train_features = get_features(load_queries("data/queries.txt"))
# `with` closes the file handle (and flushes the pickle) even if dumping fails.
with open("data/train_features.p", "wb") as fout:
    pickle.dump(train_features, fout)

Features Collected.


In [7]:
# Compute the features for the test queries and pickle them for the learning step.
test_features = get_features(load_queries("data/queries2.txt"))
# `with` closes the file handle (and flushes the pickle) even if dumping fails.
with open("data/test_features.p", "wb") as fout:
    pickle.dump(test_features, fout)

Features Collected.
