In [623]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from collections import namedtuple, defaultdict

# Data Pull

In [624]:
# Each line of raw_data.txt is one record whose fields are separated by tabs.
# The encoding is pinned instead of relying on the platform default, so
# non-ASCII characters decode identically on every machine.
# NOTE(review): assumes the file is UTF-8 - confirm against the data source.
with open('raw_data.txt', encoding='utf-8') as f:
    # strip() removes surrounding whitespace (including the newline and any
    # leading/trailing tabs) before the record is split into its fields.
    lines = [x.strip().split('\t') for x in f]

In [625]:
# Two-way registry between names and dense integer ids ("cids").
_map_ids = {}    # name -> cid
_map_names = {}  # cid -> name

def name_to_cid(name):
    '''Return the cid of `name`, registering it on first sight.

    New cids are handed out consecutively, starting at 0, and both lookup
    tables are updated together.
    '''
    cid = _map_ids.get(name)
    if cid is None:
        cid = len(_map_ids)
        _map_ids[name] = cid
        _map_names[cid] = name
    return cid

def num_cids():
    '''Return how many distinct names have been assigned a cid so far.'''
    registered = _map_ids
    return len(registered)

def cid_to_name(cid):
    '''Return the name registered for `cid`.

    Args:
        cid: an id previously returned by name_to_cid

    Raises:
        KeyError: if `cid` has never been assigned. An unknown id cannot be
            registered here because there is no name to pair it with.
    '''
    if cid not in _map_names:
        raise KeyError('unknown cid: %r' % (cid,))
    return _map_names[cid]

# Graph of all the ids, where G[x] is the x'th student's resume (a 1-D array of cids).
# Rows may have different lengths, so G is filled element by element as a 1-D
# object array: calling np.asarray on a ragged list of arrays raises
# ValueError on NumPy >= 1.24 unless dtype=object is requested.
G = np.empty(len(lines), dtype=object)
for _x, _row in enumerate(lines):
    G[_x] = np.asarray([name_to_cid(c) for c in _row])

# Ranking

In [626]:
def _dcg(r, variant):
    if variant == 1:
        return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
    else:
        return np.sum((np.exp2(r) - 1.0) / np.log2(np.arange(2, r.size + 2)))

def _ndcg(r, variant):
    # O(R log R)
    sorted_r = np.sort(r)[::-1]
    return _dcg(r, variant) / _dcg(sorted_r, variant)
    
def ndcg(resume, relevance_model, variant=1):
    r'''Normalized DCG of one resume under a relevance model.

    Args:
        resume: cids in chronological order
        relevance_model: map of cid to its ranking
        variant: which DCG formula to use
            variant=1 is  \mathrm{DCG_{p}} = rel_1 + \sum_{i=2}^{p} \frac{rel_{i}}{\log_{2}(i)}
            variant=2 is  \mathrm{DCG_{p}} = \sum_{i=1}^{p} \frac{ 2^{rel_{i}} - 1 }{ \log_{2}(i+1)}

    Returns:
        The value computed by _ndcg for the resume's relevances.

    Raises:
        AssertionError: if `variant` is not 1 or 2.
    '''
    # O(R log R)
    assert variant in [1, 2], '[1, 2] only valid options'
    # The resume is scored in the reverse of the order given, so its last
    # entry occupies DCG position 1.
    relevances = np.asarray([relevance_model[cid] for cid in reversed(resume)],
                            dtype=float)
    return _ndcg(relevances, variant)

In [627]:
# Start from a random permutation of every cid. The generator is seeded so a
# fresh kernel reproduces the same starting point.
RANKING_SEED = 0
ranking = np.random.default_rng(RANKING_SEED).permutation(num_cids())
# Keep the checkpoint as an independent copy: optimize() swaps entries of its
# argument in place, so an alias would change together with `ranking`.
_save_point = ranking.copy()

In [638]:
def get_ranking(ranking, descending=True):
    '''Translate a ranking of cids into the corresponding list of names.

    With descending=True (the default) the list is returned in reverse
    order, i.e. the last entry of `ranking` comes first.
    '''
    # O(C)
    resolved = [cid_to_name(cid) for cid in ranking]
    return resolved[::-1] if descending else resolved

def get_relevance_model(ranking):
    '''Build the cid -> relevance map for a ranking.

    A cid's relevance is its index in `ranking`, so entries further along
    the ranking receive larger relevances.
    '''
    # O(C)
    return {cid: position for position, cid in enumerate(ranking)}

def score(ranking):
    '''Sum of the NDCG of every resume in G under the given ranking.'''
    # O(C + G * R log R)
    relevance_by_cid = get_relevance_model(ranking)
    return sum(ndcg(resume, relevance_by_cid) for resume in G)

def optimize(ranking, iterations=1, print_every=20):
    '''Hill-climb `ranking` in place with pairwise swaps to increase score().

    Each pass tries swapping every pair of positions (x < y) and keeps a swap
    only when it strictly raises the score. Every trial calls score() once,
    so a pass costs len(ranking) * (len(ranking) - 1) / 2 score evaluations.

    Args:
        ranking: mutable sequence of cids with a .copy() method; modified in place
        iterations: maximum number of full passes
        print_every: print the top 20 names on every print_every'th accepted swap

    Side effects:
        After each accepted swap a copy of the ranking is stored in the global
        `_save_point`. If scoring is interrupted (e.g. KeyboardInterrupt), the
        trial swap is undone before the exception propagates, so `ranking`
        always holds the last accepted state.
    '''
    global _save_point
    moves_counter = 0
    for it in range(iterations):
        print("Iteration", it)
        old_score = score(ranking)
        improved = False
        # pairwise swaps
        for x in range(0, len(ranking)-1):
            for y in range(x+1, len(ranking)):
                ranking[x], ranking[y] = ranking[y], ranking[x]
                try:
                    test_score = score(ranking)
                except BaseException:
                    # undo the unevaluated trial swap before propagating
                    ranking[x], ranking[y] = ranking[y], ranking[x]
                    raise
                if test_score > old_score:
                    old_score = test_score
                    improved = True
                    _save_point = ranking.copy()
                    if moves_counter % print_every == 0:
                        print(get_ranking(ranking)[:20])
                    moves_counter += 1
                else:
                    ranking[x], ranking[y] = ranking[y], ranking[x]
        if not improved:
            # a pass without any accepted swap means later passes would
            # repeat exactly the same rejected trials
            break

In [641]:
# Sanity check: the number of unique cids in the checkpoint should equal the
# number of registered cids; the third value is the checkpoint's score.
len(set(_save_point)), num_cids(), score(_save_point)

(2370, 2370, 1208.3856970294469)

In [640]:
# Resume the search from the checkpoint. Every trial swap rescores all of G,
# so this cell runs for a long time; the saved output below ends in a
# KeyboardInterrupt. optimize() stores a copy of the ranking in _save_point
# after each accepted swap.
optimize(_save_point, iterations=10, print_every=20)

Iteration 0
['Sterling Investment Group LP', 'City of Toronto Parks and Recreation', 'IBI Group', 'Shiq Technologies', 'ZeMind Studios', 'Well.ca', 'Cultures Restaurant', 'Malroz Engineering Inc.', 'Self-Employed Tutor', 'Wired Messenger Inc.', 'Epicor Software', 'Arroware Industries Inc.', 'GREE Canada', 'Tungsten Properties', 'Avon Engineering', 'TLS Tautenburg', 'Emerson', "Walker's Landing Pub & Eatery", 'Yellow Pages Group / Groupe Pages Jaunes – Canada', 'Integrated Development Society (IDS) Nepal']




KeyboardInterrupt: 