In [102]:
from gensim.utils import tokenize
from langdetect import detect
import string
from itertools import islice
import numpy as np
from scipy.sparse import csr_matrix
import smart_open
import faiss
import pandas as pd

import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.ml.linalg import Vectors, VectorUDT
from collections import defaultdict

import pickle
import graphene

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
class Company(graphene.ObjectType):
    id = graphene.ID()
    name = graphene.String()
    country = graphene.String()
    sector = graphene.String()
    revenue = graphene.Float()
    ebitda = graphene.Float()
    num_employees = graphene.Int()
    score = graphene.String()
    region = graphene.String()
    description = graphene.String()
    company_type = graphene.String()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
model_path = "s3://onai-ml-dev-eu-west-1/company2vec/model/svd_desc_only"
word2id_path = "s3://onai-ml-dev-eu-west-1/company2vec/common"
data_path = "s3://onai-ml-dev-eu-west-1/company2vec/data"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
word2id = {}
id2word = {}
with smart_open.open(f"{word2id_path}/bow/word2id.csv", "r") as f:
    for line in f:
        word,idd = line.strip().split(",")
        word2id[word] = int(idd)
        id2word[int(idd)] = word

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
with smart_open.open(f"{model_path}/V/mat.np", "rb") as f:
    V = np.load(f)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
len(word2id)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

42829

In [9]:
with smart_open.open(f"{model_path}/X/X_pca.np", "rb") as f:
    X_pca = np.load(f)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [71]:
description = pd.read_parquet(f"{model_path}/X/company_info.pd").fillna(0.0)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
n,d = X_pca.shape

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
index = faiss.IndexFlatL2(d)
index.add(X_pca)
print(index.ntotal)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

3385668

In [16]:
company2id = {}
id2company = {}
for i,desc in description.loc[:, ["company_id"]].iterrows():
    company2id[desc["company_id"]] = i
    id2company[i] = desc["company_id"]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
public_labels = {
    2444485: [9429145, 93196, 9870522, 7914436],
    645782: [380011, 392154, 5523392],
    875173: [237655379, 931146, 418171],
    813607: [100231, 357076],
    380011: [357076, 324490, 93339],
    882889: [127202, 5478606, 1025390, 645782],
    46329052: [189096, 915379, 46895276, 877008, 325290136, 20524024, 271958947, 21852987, 26363560, 110104150],
    106335: [319676, 377732, 61206100],
    877769: [874042, 780678, 953488, 883809, 875295, 874186, 874119],
    34049884: [882155, 30428758, 315394, 23037669, 27561],
    254287477: [883300, 30614595],
    5600108: [285880, 5433540, 878697, 35650, 688262, 226852452],
    876031: [410182, 874470, 874191, 879732, 5395336],
    883752: [880697, 65340486, 26320074, 883327, 1034090, 257501324],
    5920885: [1494039, 268074105, 34534627, 20385800, 23000545, 124640],
    628413: [272054403, 91192, 309779, 140283, 138644],
    364040: [381388, 184945, 874170, 42751952, 874183, 314896, 5126590, 841504],
    257501324: [35000, 47320264, 253748612, 85076655, 32053, 12144785, 8186273, 9934160, 557267859],
    695204: [35303, 274561, 683719, 370857, 561001, 874022, 387473, 394038, 8274485, 12188205],
    30614595: [883300, 254287477, 9956099, 380011, 27868703, 2386697, 126857],
    28224119: [26824144, 35023689, 386639, 393661],
    32449506: [875260, 27169270, 5629762, 26014489, 286119],
    233324810: [874864, 159230, 27860587, 35806, 876981],
    879554: [5487000, 236715563, 412090459, 875192, 278679, 180871, 22516334, 30274893, 5478907],
    5580060: [118474533, 1779941, 265154, 10405454],
    23335317: [7885406, 277444, 278933, 8983678, 874143, 409119],
    381865672: [874842, 410366, 873649, 275789, 882473],
    937352: [876758, 879422, 128861678, 6461781, 1859063],
    874119: [881803, 875849, 231533, 877769, 780678, 953488, 875295, 874042, 775001, 874186],
    680934: [135398, 882299, 668578, 4481676, 32012],
    2248076: [141249, 4975204, 98876, 21828553],
    3606442: [882547, 4509042, 20703565, 7435035, 94799, 288033, 359868],
    877235: [295170, 175265, 874520, 410366, 873649, 874977, 167945, 8090046],
    84148802: [275789, 30339992, 5533238, 5718736],
    5523392: [645782, 11809880, 1353107, 962864],
    413744: [409932, 875491, 109303666, 91638],
    314896: [330589, 34768, 184945],
    5126590: [874855, 631781, 364040, 831357, 874170],
    377732: [319676, 106772, 106335, 704634, 320105, 874828, 873861, 1519242, 533853947],
    874652: [377732, 319676, 106772, 704634, 312375, 278933, 874828, 4863668]
}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [98]:
def find_closest_revenue(candidate, revenue, ebitda):
    if candidate.revenue == 0 or candidate.ebitda == 0:
        return candidate.score * 100

    if candidate.revenue < revenue:
        score_revenue = 1 - candidate.revenue/revenue
    else:
        score_revenue = 1 - revenue/candidate.revenue

    if candidate.ebitda < ebitda:
        score_ebitda = 1 - candidate.ebitda/ebitda
    else:
        score_ebitda = 1 - ebitda/candidate.ebitda

    return (score_ebitda + score_revenue) / 2


def revenue_reranker(results, topn=100):
    query = results[0]
    return sorted(results[1:(topn+1)],
                  key=lambda x: find_closest_revenue(x,
                                                     query.revenue,
                                                     query.ebitda
                                                     )
                  )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [104]:
def query_peers(query_company, result_size=100, reranker=lambda res: res):
    query_vector = X_pca[np.newaxis, company2id[query_company]]

    desc_info = description.loc[company2id[query_company]]
    industry = desc_info["sic_code"]
    country = desc_info["country"]
    
    D, I = index.search(query_vector, result_size)

    ret = []
    added = set()
    for distance,idx in zip(D[0],I[0]):
        id = id2company[idx]
        
        # TODO: fix this to use the pandas dataframe instead
        
        score = distance
        cname = description.loc[idx, "company_name"]
        sector = description.loc[idx, "sic_code_desc"]
        country = description.loc[idx, "country"]
        company_type = description.loc[idx, "company_type"]

        revenue = description.loc[idx, "latest_revenue"]
        num_employees = description.loc[idx, "number_of_employees"]

        region = description.loc[idx, "region"]

        company_description = description.loc[idx, "company_description"]

        ebitda = description.loc[idx, "latest_ebitda"]

        ret.append(
            Company(
                id=id,
                name=cname,
                revenue=revenue,
                ebitda=ebitda,
                sector=sector,
                num_employees=num_employees,
                country=country,
                score=score,
                region=region,
                description=company_description,
                company_type=company_type,
            )
        )

    return reranker(ret)

def find_peer_in_result(results, peer):
    for i, company in enumerate(results):
        if peer == int(company.id):
            return i+1

    return -1


def add_ranks_in_tops(ranks, counts):
    for rank in ranks:
        if 0 < rank <= 10:
            counts[0] += 1
        elif 10 < rank <= 20:
            counts[1] += 1
        elif 20 < rank <= 50:
            counts[2] += 1
        elif 50 < rank <= 100:
            counts[3] += 1
        elif 100 < rank:
            counts[4] += 1
        else:
            counts[5] += 1
    return counts

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [100]:
for el in query_peers(106335, 500, revenue_reranker):
    print(el.name, el.id, el.revenue, el.ebitda)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

General Motors Company 61206100 147049.0 0.0
CarMax, Inc. 356805 17976.809 0.0
AutoNation, Inc. 112302 21412.8 0.0
America's Car-Mart, Inc. 303785 612.201 0.0
DriveTime Automotive Group, Inc. 96946 0.0 0.0
Renault SA 428034 63713.93697292499 0.0
Asbury Automotive Group, Inc. 25109 6874.4 0.0
Major Automotive Companies Inc. 377178 0.0 0.0
PACCAR Inc 294721 23495.7 0.0
PACCAR Financial Corp. 3062341 571.4 0.0
Exeter Finance Corporation 40788185 202.409 0.0
FCA US LLC 99130627 0.0 0.0
Hometown Auto Retailers, Inc. 388797 0.0 0.0
Bayerische Motoren Werke Aktiengesellschaft 704634 108166.88859298713 0.0
Honda Motor Co., Ltd. 278933 144261.84953184138 0.0
Credit Acceptance Corporation 318561 1072.3 0.0
Lithia Motors, Inc. 355432 11821.4 0.0
Cross-Continent Auto 355550 0.0 0.0
American Honda Finance Corporation 4234924 7581.0 0.0
Uag Mentor Acquisition LLC 5027805 0.0 0.0
Blue Bird Corporation 272634010 1024.976 0.0
CNH Industrial Capital LLC 140707156 521.375 0.0
Sonic Automotive, Inc. 37012

In [105]:
tops = [0] * 6 # top 10, 20, 50, 100, >100, -1

mrr = 0
count = 0

ranks_default = []
missed_peers = defaultdict(list)
for public_label,peers in public_labels.items():
    query_id = public_label
    results = query_peers(query_id, 500, revenue_reranker)
    ranks = []
    
    for peer in peers:
        peer_location = find_peer_in_result(results, peer)
        ranks.append(peer_location)
        ranks_default.append(peer_location)
        if peer_location == -1:
            missed_peers[public_label].append(peer)

    tops = add_ranks_in_tops(ranks, tops)

    mrr += sum([1/rank for rank in ranks if rank > 0])
    count += sum(1 for rank in ranks if rank > 0)

    ranks.insert(0, query_id)
    
    print(ranks)
    print()

print(tops)
print(np.mean([1/rank for rank in ranks_default if 0 < rank <= 20]))
print()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[2444485, -1, -1, -1, -1]

[645782, -1, -1, -1]

[875173, -1, -1, -1]

[813607, -1, -1]

[380011, -1, -1, -1]

[882889, -1, -1, -1, 33]

[46329052, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]

[106335, -1, 25, 1]

[877769, 71, 3, -1, 30, 17, -1, 86]

[34049884, 31, -1, -1, -1, -1]

[254287477, -1, -1]

[5600108, -1, -1, -1, -1, -1, -1]

[876031, -1, -1, -1, 63, -1]

[883752, -1, -1, -1, -1, -1, -1]

[5920885, -1, -1, -1, 11, -1, -1]

[628413, -1, -1, -1, -1, -1]

[364040, 22, 5, 90, -1, 23, 4, -1, 39]

[257501324, -1, -1, -1, -1, -1, -1, -1, 29, -1]

[695204, 1, -1, -1, 20, -1, -1, -1, -1, -1, -1]

[30614595, -1, -1, -1, -1, 21, -1, -1]

[28224119, -1, -1, -1, -1]

[32449506, -1, -1, -1, -1, -1]

[233324810, -1, -1, -1, -1, -1]

[879554, -1, -1, -1, -1, -1, -1, -1, -1, -1]

[5580060, -1, -1, -1, -1]

[23335317, -1, -1, -1, -1, -1, -1]

[381865672, -1, -1, -1, -1, -1]

[937352, -1, -1, -1, -1, -1]

[874119, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1]

[680934, 9, 16, 1, 7, -1]

[2248076, -1, 21,

In [20]:
tops = [0] * 6 # top 10, 20, 50, 100, >100, -1

mrr = 0
count = 0

ranks_default = []
for public_label in public_labels:
    query_id = public_label[0]
    results = [el[1][0] for el in query_peers(query_id, 500)]
    ranks = []
    
    for peer in public_label[1:]:
        ranks.append(find_peer_in_result(results, peer))
        ranks_default.append(find_peer_in_result(results, peer))

    tops = add_ranks_in_tops(ranks, tops)

    mrr += sum([1/rank for rank in ranks if rank > 0])
    count += sum(1 for rank in ranks if rank > 0)

    ranks.insert(0, query_id)
    #
    
    print(ranks)
    print()

print(tops)
print(np.mean([1/rank for rank in ranks_default if 0 < rank <= 20]))
print()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[2444485, 82, -1, 410, -1]

[645782, 403, -1]

[875173, 121, 406]

[813607, -1, 10]

[380011, 443, 58]

[882889, 4, -1, -1, 48]

[46329052, -1, -1, -1, -1]

[106335, 291, -1, 2]

[877769, 18, 103, 25, -1, 75]

[34049884, 6, -1, -1]

[254287477, -1, -1]

[5600108, -1, -1, -1, -1]

[876031, -1, 405, 117, 63, 322]

[4, 1, 2, 4, 10, 21]
0.21444444444444444