# Information Retrieval

# Indexing a dataset

Basic configuration for indexing.

In [1]:
# basic configuration for indexing
basic_settings = {
    # Field mappings for the single mapping type "doc".
    "mappings": {
        "doc": {
            "properties": {
                # Kept with the document but not indexed ("index": False).
                "filename": {"type": "keyword", "index": False},
                "path": {"type": "keyword", "index": False},
                # Document body; analysed with "my_analyzer" both when
                # indexing and when searching.
                "text": {
                    "type": "text",
                    "similarity": "boolean",
                    "analyzer": "my_analyzer",
                    "search_analyzer": "my_analyzer",
                },
            },
        },
    },
    # Index-level settings: definition of the custom analyzer used above.
    "settings": {
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "filter": ["stop"],
                    "char_filter": ["html_strip"],
                    "type": "custom",
                    "tokenizer": "whitespace",
                },
            },
        },
    },
}

In [2]:
# Indexes a collection of government documents; change DOCS_PATH to reuse this code for another document set.
import os, io
from collections import namedtuple
from elasticsearch import Elasticsearch

# Connection and corpus settings shared by the cells below.
ES_HOSTS = ["http://localhost:9200"]  # hosts handed to the Elasticsearch client
DOCS_PATH = "gov/documents"           # directory tree holding the documents to index
INDEX_NAME = "gov"                    # name of the index that is (re)built
DOC_TYPE = "doc"                      # same name as the mapping type in the settings dicts

def build_gov_index(es_conn, index_name, doc_path, settings):
    '''
        (re)creates an index and loads every file found under doc_path into it
        input:
            - es_conn: elasticsearch connection
            - index_name: name of the index to (re)create; an existing index
              with this name is deleted first
            - doc_path: root directory that is walked recursively for documents
            - settings: index body (mappings + settings) passed to index creation
        output:
            - None; progress and a final summary are printed

        Each file is read as ISO-8859-1 text and indexed under its file name,
        so files with the same name in different directories share one id.
    '''
    Doc = namedtuple('Doc', 'filename path text')

    # Read the whole collection before touching the index, so that a read
    # error cannot leave us with a deleted or half-built index.
    dataset = []
    for root, _dirs, files in os.walk(doc_path):
        for name in files:
            full_path = os.path.join(root, name)
            with io.open(full_path, 'r', encoding='ISO-8859-1') as fp:
                text = fp.read().strip()
            dataset.append(Doc(filename=name, path=full_path, text=text))

    # Start from a clean index.
    if es_conn.indices.exists(index=index_name):
        es_conn.indices.delete(index=index_name)
        print('index `{}` deleted'.format(index_name))
    # NOTE(review): ignore=400 is kept from the original call; it presumably
    # also hides a rejected `settings` body -- confirm this is intended.
    es_conn.indices.create(index=index_name, ignore=400, body=settings)
    print('index `{}` created'.format(index_name))

    counter_read, counter_idx_failed = 0, 0  # counters

    # index the documents one by one
    for doc in dataset:
        res = es_conn.index(
            index=index_name,
            id=doc.filename,
            doc_type=DOC_TYPE,
            body=doc._asdict())
        counter_read += 1

        # Anything other than a fresh 'created' result is counted as a failure.
        if res['result'] != 'created':
            counter_idx_failed += 1
        else:
            print('indexed {} documents'.format(counter_read))

    print('indexed {} docs to index `{}`, failed to index {} docs'.format(
        counter_read,
        index_name,
        counter_idx_failed
    ))

    # refresh after indexing
    es_conn.indices.refresh(index=index_name)



# Connect to Elasticsearch and build the baseline index.
es_conn = Elasticsearch(hosts=ES_HOSTS)
build_gov_index(
    es_conn,
    index_name=INDEX_NAME,
    doc_path=DOCS_PATH,
    settings=basic_settings,
)

## Search and measure performance

In [10]:
# Search for documents matching a query string; results come back ranked by similarity score.

def search(query_string, es_conn, index_name):
    '''
        runs query_string as a `query_string` query against an index
        input:
            - query_string: a query
            - es_conn: elasticsearch connection
            - index_name: name of index
        output:
            - the value stored under ['hits']['hits'] in the search
              response; only the `filename` field is requested in
              each hit's `_source`

    '''
    request_body = {
        "_source": ["filename"],
        "query": {"query_string": {"query": query_string}},
    }
    response = es_conn.search(index=index_name, body=request_body)
    return response['hits']['hits']

def read_search_write_output(search_fn, query_path, output_file, top_k=10):
    '''
        runs every query of a topics file and writes the top hits to a CSV file
        input:
            - search_fn: callable (query_string, es_conn, index_name) returning
              an iterable of hits; each hit must provide
              hit['_source']['filename'] and may provide hit['_score']
            - query_path: topics file, read as ISO-8859-1, one query per line
              (assumed format: `<query id><whitespace><query text>`)
            - output_file: CSV file to (over)write
            - top_k: maximum number of hits written per query (default 10)
        output:
            - None; rows are written in the order search_fn returns them

        The module-level `es_conn` and `INDEX_NAME` are passed to search_fn.
    '''
    with io.open(query_path, 'r', encoding='ISO-8859-1') as topics:
        lines = topics.read().splitlines()

    with open(output_file, 'w') as output:
        output.write('QueryId,DocumentNumber,Similarity,Iteration,RunId,Rank\n')  #for your convenience

        for line in lines:
            # Split on the first run of whitespace so ids of any width work
            # and no stray space ends up in the QueryId column.
            parts = line.split(None, 1)
            if len(parts) < 2:
                continue  # blank line, or an id without any query text
            query_id, query_text = parts

            hits = search_fn(query_text.strip(), es_conn, INDEX_NAME)
            for rank, hit in enumerate(hits, start=1):
                if rank > top_k:
                    break
                score = hit.get('_score')
                if score is None:
                    score = 0  # hit carries no score
                output.write('{},{},{},0,0,{}\n'.format(
                    query_id, hit['_source']['filename'], score, rank))

                                
                        
                    
                    
        

# Run the baseline search over all topics and dump the results to CSV.
query_path = "gov/topics/gov.topics"
output_file = "output.csv"
read_search_write_output(
    search,
    query_path=query_path,
    output_file=output_file,
)

## Improve indexer

In [4]:
# configure settings to define your own analyzer for indexing
q3_settings = {
    # Field mappings for the single mapping type "doc".
    "mappings": {
        "doc": {
            "properties": {
                # Kept with the document but not indexed ("index": False).
                "filename": {"type": "keyword", "index": False},
                "path": {"type": "keyword", "index": False},
                "text": {
                    "type": "text",
                    "similarity": "boolean",
                    "analyzer": "my_analyzer",
                    "search_analyzer": "my_analyzer",
                },
            },
        },
    },
    "settings": {
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    # Token filters run in list order. `lowercase` must come
                    # before `stop`: the stop filter is case-sensitive by
                    # default, so "The"/"AND" would otherwise be kept.
                    "filter": ["lowercase", "stop", "porter_stem"],
                    "char_filter": ["html_strip"],
                    "type": "custom",
                    "tokenizer": "standard",
                },
            },
        },
    },
}

In [1]:

# Rebuild the index with the improved analyzer and re-run the baseline search.
build_gov_index(
    es_conn,
    index_name=INDEX_NAME,
    doc_path=DOCS_PATH,
    settings=q3_settings,
)
read_search_write_output(
    search,
    query_path=query_path,
    output_file=output_file,
)

NameError: name 'build_gov_index' is not defined

## Improve search algorithm

*Elasticsearch* also provides multiple configurable scoring algorithms. For this task, you will be asked to find a better similarity module to improve the search performance. Please refer to the [similarity module documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/similarity.html) for a better understanding of the configurable Elasticsearch similarity modules.

You can also change the `search` function to improve performance. Please refer to the [Query DSL documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) for a better understanding of the Query DSL used in *Elasticsearch*.

In [12]:
# define your own analyzer for indexing and searching
q4_settings = {
    # Field mappings for the single mapping type "doc".
    "mappings": {
        "doc": {
            "properties": {
                # Kept with the document but not indexed ("index": False).
                "filename": {"type": "keyword", "index": False},
                "path": {"type": "keyword", "index": False},
                "text": {
                    # BM25 scoring replaces the "boolean" similarity used
                    # by the earlier configurations.
                    "type": "text",
                    "similarity": "BM25",
                    "analyzer": "my_analyzer",
                    "search_analyzer": "my_analyzer",
                },
            },
        },
    },
    "settings": {
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    # Token filters run in list order. `lowercase` must come
                    # before `stop`: the stop filter is case-sensitive by
                    # default, so "The"/"AND" would otherwise be kept.
                    "filter": ["lowercase", "stop", "porter_stem"],
                    "char_filter": ["html_strip"],
                    "type": "custom",
                    "tokenizer": "standard",
                },
            },
        },
    },
}


# change search algorithm to improve the search results, the return type should be the same as that of `search` function
def my_search(query_string, es_conn, index_name):
    '''
        searches index_name for query_string using a `query_string` query
        input:
            - query_string: a query
            - es_conn: elasticsearch connection
            - index_name: name of index
        output:
            - the ['hits']['hits'] entry of the search response, with only
              `filename` requested in each hit's `_source`
    '''
    query_clause = {"query_string": {"query": query_string}}
    hits = es_conn.search(
        index=index_name,
        body={"_source": ["filename"], "query": query_clause},
    )['hits']['hits']
    return hits



In [1]:
# Run this cell to rebuild the index from q4_settings and evaluate my_search on it.
build_gov_index(
    es_conn,
    index_name=INDEX_NAME,
    doc_path=DOCS_PATH,
    settings=q4_settings,
)
read_search_write_output(
    my_search,
    query_path=query_path,
    output_file=output_file,
)

NameError: name 'build_gov_index' is not defined

## Find the best combination

Now it's time to explore the best configuration of indexer and search algorithm. Each combination will yield a different search outcome.

In [None]:
# find the best combination of indexer configuration and search algorithm to maximise the performance of search result.
best_settings = {
    # Field mappings for the single mapping type "doc".
    "mappings": {
        "doc": {
            "properties": {
                # Kept with the document but not indexed ("index": False).
                "filename": {"type": "keyword", "index": False},
                "path": {"type": "keyword", "index": False},
                "text": {
                    "type": "text",
                    "similarity": "BM25",
                    "analyzer": "my_analyzer",
                    "search_analyzer": "my_analyzer",
                },
            },
        },
    },
    "settings": {
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    # Token filters run in list order. `lowercase` must come
                    # before `stop`: the stop filter is case-sensitive by
                    # default, so "The"/"AND" would otherwise be kept.
                    # `shingle` stays last so it combines the already
                    # normalised, stemmed tokens.
                    "filter": ["lowercase", "stop", "porter_stem", "shingle"],
                    "char_filter": ["html_strip"],
                    "type": "custom",
                    "tokenizer": "standard",
                },
            },
        },
    },
}



def best_search(query_string, es_conn, index_name):
    '''
        issues a `query_string` query for query_string against index_name
        input:
            - query_string: a query
            - es_conn: elasticsearch connection
            - index_name: name of index
        output:
            - whatever the search response holds under ['hits']['hits'];
              each hit's `_source` is restricted to `filename`
    '''
    body = {}
    body["_source"] = ["filename"]
    body["query"] = {"query_string": {"query": query_string}}

    result = es_conn.search(index=index_name, body=body)
    hits_section = result['hits']
    return hits_section['hits']


In [None]:
# Run this cell to rebuild the index from best_settings and generate the output.
build_gov_index(es_conn, INDEX_NAME, DOCS_PATH, best_settings)
# Evaluate with best_search, the search function defined for this
# configuration (the original call passed my_search from the previous task).
read_search_write_output(best_search, query_path, output_file)