# Search Regression Testing

In [19]:
import abc

class Request(abc.ABC):
    def __init__(self, host, uri):
        self._host = host
        self._uri = uri
        
    def target(self, query, page):
        return self._host + self._uri + "?q=" + query + "&page=%d" % page
        
    def search(self, query, page=1):
        import requests
        
        r = requests.get(self.target(query, page))
        return r.json()
    
    @abc.abstractmethod
    def get_hits(self, query):
        pass
    
class BabbageRequest(Request):
    host = "http://localhost:20000/"
    uri = "search/data"
    def __init__(self):
        super(BabbageRequest, self).__init__(self.host, self.uri)
        
    def target(self, query, page):
        return self._host + self._uri + "?q=" + query + "&page=%d" % page + "&searchTarget=internal"
        
    def get_hits(self, query, page):
        response = self.search(query, page=page)
        hits = []
        for hit in response['result']['results']:
            hits.append(hit.get('uri'))
            
        return hits
        
class SearchRequest(Request):
    host = "http://localhost:5000/"
    uri = "search/ons/content"
    def __init__(self):
        super(SearchRequest, self).__init__(self.host, self.uri)
        
    def get_hits(self, query, page):
        response = self.search(query, page=page)
        hits = []
        for hit in response['results']:
            hits.append(hit.get('uri'))
            
        return hits
    
class ConceptualSearchRequest(SearchRequest):
    uri = "search/conceptual/ons/content"

br = BabbageRequest()
sr = SearchRequest()

In [20]:
import pymongo
import numpy as np

class SearchStats(object):
    db = "local"
    collection = "searchstats"
    def __init__(self):
        self._client = pymongo.MongoClient()
        self._db = self._client.get_database(self.db)
        self._collection = self._db.get_collection(self.collection)
        
        self._docs = []
        
        self._load()
        
    def _load(self):
        if len(self._docs) == 0:
            for doc in self._collection.find():
                self._docs.append(doc)
                
    def __len__(self):
        return len(self._docs)
                
    def __iter__(self):
        for doc in self._docs:
            yield doc
            
    def __getitem__(self, item):
        return self._docs[item]
    
    def group_by_search_term(self):
        grouped = {}
        
        for doc in self._docs:
            term = doc.get("term")
            
            if term not in grouped:
                grouped[term] = []
            grouped[term].append(doc)
            
        return grouped
    
    def judgements(self, max_judgement=4.):
        """
        Groups searchStats by search term
        """
        judgements = {}
        for doc in self._docs:
            rank = doc.get("linkindex") + ((doc.get("pageindex") - 1) * doc.get("pagesize"))
            term = doc.get("term")
            if term not in judgements:
                judgements[term] = {}
                
            url = doc.get('url')
            if url not in judgements[term]:
                judgements[term][url] = {"count": 1, "rank": rank}
            else:
                judgements[term][url]["count"] += 1
                
        # Normalise
        for key in judgements:
            # Sort
            sorted_values = sorted(judgements[key].items(), key=lambda kv: kv[1]["count"])
            
            # Normalise
            j = np.linspace(0, max_judgement, len(sorted_values))
            for i, item in enumerate(sorted_values):
                k, v = item
                judgements[key][k]["judgement"] = j[i]
            
            
        return judgements
    
search_stats = SearchStats()

In [21]:
MAX_SCORE = 4.0

def idealJudgement(num):
    i = 0
    incremenet = (1.0 / (float(num) - 1.0)) * num
    
    iJ = np.zeros(num)
    val = len(iJ)
    while (val > 0):
        iJ[i] = (val / float(num)) * MAX_SCORE
        i += 1
        val -= incremenet
        
    return iJ

def idealDiscountedCumulativeGain(num):
    idealGain = idealJudgement(num)
    iDCG = np.zeros(num)
    
    total = 0.0
    for i in range(num):
        total += idealGain[i] / float(i+1)
        iDCG[i] = total
    return iDCG

class NDCG(object):
    def __init__(self, judgements):
        self.judgements = judgements
        
    def dcg(self):        
        
        dcg_dict = {}
        for key in self.judgements:
            dcg_dict[key] = {"dcg": [], "urls": []}
            
        for key in dcg_dict:
            total = 0.0
            judgements = self.judgements[key]
            for url in judgements:
                judgement = judgements[url]
                total += judgement["judgement"] / float(judgement["rank"])
                dcg_dict[key]["dcg"].append(total)
                dcg_dict[key]["urls"].append(url)
            
        return dcg_dict
    
    def ndcg(self):
        
        dcg_dict = self.dcg()
        ndcg_dict = {}
        
        for key in dcg_dict.keys():
            dcg_data = dcg_dict[key]
            dcg = dcg_data["dcg"]
            
            idcg = idealDiscountedCumulativeGain(len(dcg))

            ndcg = np.zeros(len(dcg))

            for i in range(len(ndcg)):
                ndcg[i] = min(1.0, dcg[i] / idcg[i])
                
            ndcg_dict[key] = {}
            ndcg_dict[key]["ndcg"] = ndcg
            ndcg_dict[key]["urls"] = dcg_data["urls"]
        return ndcg_dict
    
    def __iter__(self):
        return self.judgements.__iter__()
    
    def __getitem__(self, i):
        return self.judgements[i]
    
    def __len__(self):
        return len(self.judgements)
        
ndcg = NDCG(search_stats.judgements())

In [22]:
docs_by_terms = search_stats.group_by_search_term()

In [23]:
class RegressionTest(object):
    """
    Tests that babbage and external search (not conceptual API) give the same results
    """
    def __init__(self, search_terms):
        self.search_terms = search_terms
        
    def run(self):
        from json import JSONDecodeError
        
        br = BabbageRequest()
        sr = SearchRequest()
        
        for search_term in self.search_terms:
            page = 1
            while True:
                try:
                    babbage_results = br.get_hits(search_term, page)
                    search_results = sr.get_hits(search_term, page)

                    assert babbage_results == search_results, "Regression test failed for query '%s' on page %d" % (search_term, page)
                    page += 1
                except JSONDecodeError:
                    # No more pages available
                    print(search_term + ":", "PASS (total pages=%d)" % page)
                    break
        print("TEST COMPLETE")

In [24]:
search_terms = search_stats.group_by_search_term().keys()

regression_tester = RegressionTest(search_terms)
regression_tester.run()

rpi: PASS (total pages=42)
gender pay gap: PASS (total pages=25)
cpi: PASS (total pages=162)
gdp: PASS (total pages=53)
inflation: PASS (total pages=183)
crime: PASS (total pages=14)
unemployment: PASS (total pages=53)
population: PASS (total pages=83)
immigration: PASS (total pages=20)
mental health: PASS (total pages=64)
london: PASS (total pages=47)
london population: PASS (total pages=120)
retail price index: PASS (total pages=482)
life expectancy: PASS (total pages=38)
obesity: PASS (total pages=2)
religion: PASS (total pages=8)
migration: PASS (total pages=24)
poverty: PASS (total pages=3)
social media: PASS (total pages=97)
employment: PASS (total pages=138)
TEST COMPLETE
