In [1]:
# define helper functions

def calculateTruePositives(calculatedRelevances, givenRelevances):
    TP = 0  # True Positives count
    for i in calculatedRelevances:
        if i in givenRelevances: TP = TP + 1
            
    return TP

def calculatePrecision(calculatedRelevances, givenRelevances):
    TP = calculateTruePositives(calculatedRelevances, givenRelevances)
    FP = len(calculatedRelevances) - TP   # False Positives count
    
    P = TP/(TP + FP)
            
    return P

def calculateFalseNegatives(calculatedRelevances, givenRelevances):
    FN = 0  # False Negatives count (non-retrieved relevant documents)
    for i in givenRelevances:
        if i not in calculatedRelevances: FN = FN + 1
            
    return FN

def calculateRecall(calculatedRelevances, givenRelevances):
    TP = calculateTruePositives(calculatedRelevances, givenRelevances)
    FN = calculateFalseNegatives(calculatedRelevances, givenRelevances)
    
    R = TP/(TP + FN)
            
    return R

def calculateFMeasure(P, R):
    if ((P+R) == 0): F = 0
    else: F = 2 * ((P*R) / (P+R))
    return F

In [2]:
import numpy as np

TOP_COUNT = 10

def calculateRelevanceScoreForFile(queryFileNumber, vectorizer, similarityFunction):
    # prepare corpus
    corpus = []
    for d in range(1400):
        f = open("./input/d/"+str(d+1)+".txt")
        corpus.append(f.read())
        
    # add query to corpus
    queryFile = open("./input/q/" + str(queryFileNumber) + ".txt")
    corpus.append(queryFile.read())

    # prepare matrix
    matrix = vectorizer.fit_transform(corpus)
 
    # compute similarity between query and all docs
    sim = np.array(similarityFunction(matrix[len(corpus)-1], matrix[0:(len(corpus)-1)])[0])
    topRelevant = sim.argsort()[-TOP_COUNT:][::-1]+1
    
    return topRelevant

In [3]:
def calculateRelevanceScores(vectorizer, similarityFunction):
    
    print("Top relevant using " 
          + str(type(vectorizer)) + ", " 
          + str(similarityFunction) + ":")
    
    print("query number: P, R, F;\t calc. relevances, given relevances")
    
    for q in range(225):
        fileNumber = str(q+1)
        f = open("./input/q/" + fileNumber + ".txt")
        r = open("./input/r/" + fileNumber + ".txt")
        
        givenRelevances = [int(x.strip()) for x in r.readlines()]
        calculatedRelevances = calculateRelevanceScoreForFile(fileNumber, vectorizer, similarityFunction)
        
        P = calculatePrecision(calculatedRelevances, givenRelevances)
        R = calculateRecall(calculatedRelevances, givenRelevances)
        
        F = calculateFMeasure(P, R)
        
        print("{}: {}, {}, {};\t{}".format(fileNumber, P, R, F, calculatedRelevances))

## Euclidean distance

In [4]:
# Binary representation, Euclidean distance
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics.pairwise import euclidean_distances

calculateRelevanceScores(HashingVectorizer(binary=True), euclidean_distances)

Top relevant using <class 'sklearn.feature_extraction.text.HashingVectorizer'>, <function euclidean_distances at 0x000002823410D048>:
query number: P, R, F;	 calc. relevances, given relevances
1: 0.0, 0.0, 0;	[ 433  620 1382 1191 1264  806  342 1117  677 1078]
2: 0.0, 0.0, 0;	[1017 1210 1353 1033  356  769 1223  775 1085  107]
3: 0.0, 0.0, 0;	[ 517  382 1266  752  300 1347  332   58 1304  694]
4: 0.0, 0.0, 0;	[ 126  405   93  833  908  424  915  579 1207  733]
5: 0.0, 0.0, 0;	[ 748  513  947 1098  291  647  672   41  737  578]
6: 0.0, 0.0, 0;	[ 877  492  848  107  551 1142 1276  408  853 1071]
7: 0.0, 0.0, 0;	[ 405  706  340   21  462  501   66 1394  417  393]
8: 0.0, 0.0, 0;	[1266  834  382  344 1294 1207  538 1156   37 1098]
9: 0.0, 0.0, 0;	[ 717  620  867  641 1399  724  727 1041   68 1000]
10: 0.0, 0.0, 0;	[ 769   46  153 1223  301  385  393  539   26  324]
11: 0.0, 0.0, 0;	[ 405 1395  672  925 1066  816  259  501  963  482]
12: 0.0, 0.0, 0;	[ 488 1395 1067 1138  101  827  963 1137

118: 0.0, 0.0, 0;	[ 834  109  729  280  526  585 1353  213   99 1071]
119: 0.0, 0.0, 0;	[1395  963  712  483 1290  879 1384  611   40 1354]
120: 0.0, 0.0, 0;	[ 834 1266  947 1098  474  515 1143 1389  475  568]
121: 0.0, 0.0, 0;	[475 405 998  21 590 258 775 597 879 107]
122: 0.0, 0.0, 0;	[1356  452  492   92  786  152  211  432 1252  579]
123: 0.0, 0.0, 0;	[1395 1348  517 1067 1138  963  483  962  926  422]
124: 0.0, 0.0, 0;	[ 492  405  913  849  509  694 1140 1055  223  271]
125: 0.0, 0.0, 0;	[ 506 1117 1135  861  654 1333   68  260  815 1330]
126: 0.0, 0.0, 0;	[ 486  189  818  214  964  603  565  245 1309 1191]
127: 0.0, 0.0, 0;	[1111  405  501  925  816  259  743  449 1253  335]
128: 0.0, 0.0, 0;	[1117 1191 1378  261  913  489 1103  513 1288  891]
129: 0.0, 0.0, 0;	[ 879  405   42  434  679  756  497  912 1257   66]
130: 0.0, 0.0, 0;	[ 834 1266  740  344  474  515 1143  405  773  811]
131: 0.0, 0.0, 0;	[ 301  400 1311  393  312  194  539  181 1176  286]
132: 0.0, 0.0, 0;	[ 517  834 1

As expected, Binary representation with Euclidean distance has not good precision and recall. I didn't expect all zeros though, maybe it is because of my ordering of the (top) results.

In [5]:
# pure Term Frequency, Euclidean distance
from sklearn.feature_extraction.text import CountVectorizer

calculateRelevanceScores(CountVectorizer(), euclidean_distances)

Top relevant using <class 'sklearn.feature_extraction.text.CountVectorizer'>, <function euclidean_distances at 0x000002823410D048>:
query number: P, R, F;	 calc. relevances, given relevances
1: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
2: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
3: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040  417   94   89]
4: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
5: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
6: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040  417   94   89]
7: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040  417   94   89]
8: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
9: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040  417   94   89]
10: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040  417   94   89]
11: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040  417   94   89]
12: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  

113: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
114: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040  417   94   89]
115: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
116: 0.1, 0.16666666666666666, 0.125;	[1201 1313  798  329  927 1244 1040   94  417   89]
117: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
118: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
119: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
120: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
121: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
122: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
123: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
124: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
125: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]
126: 0.0, 0.0, 0;	[1201 1313  798  329  927 1244 1040   94  417   89]


Pure Term Frequency with Euclidean distance is better, but still has pretty bad precision and recall. There must be some kind of miscalculation, as almost all results are the same. I have not been able to figure out the problem here though.

In [6]:
# TF-IDF, Euclidean distance
from sklearn.feature_extraction.text import TfidfVectorizer

calculateRelevanceScores(TfidfVectorizer(), euclidean_distances)

Top relevant using <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, <function euclidean_distances at 0x000002823410D048>:
query number: P, R, F;	 calc. relevances, given relevances
1: 0.0, 0.0, 0;	[1400  970  541 1354 1353  579  634  645 1258  754]
2: 0.0, 0.0, 0;	[ 910  281  286  879 1045  492  532 1048 1293  769]
3: 0.0, 0.0, 0;	[ 517  752 1266  382  359 1210 1249  386  871  487]
4: 0.0, 0.0, 0;	[ 312  387  405   71  871  958  578 1070  483  590]
5: 0.0, 0.0, 0;	[1098  259  743  405 1358 1111 1368 1369  273  430]
6: 0.0, 0.0, 0;	[ 107  551 1276  848  853  877  492 1071 1142  408]
7: 0.0, 0.0, 0;	[393 405 834 483 875 879 968 437  71 392]
8: 0.0, 0.0, 0;	[ 834  382 1266  821  258  931 1090  281   91  875]
9: 0.0, 0.0, 0;	[ 961 1219  368  929 1178  804 1206 1400 1130 1053]
10: 0.0, 0.0, 0;	[ 526 1017 1276 1283  301  320 1223  393   26   46]
11: 0.0, 0.0, 0;	[ 879 1395  405  312  875  488  968 1067  343  949]
12: 0.0, 0.0, 0;	[1395  879 1067  875 1070  483   71  958  488 1090]


117: 0.0, 0.0, 0;	[1395 1348  517    5  387  834  312  483  393 1070]
118: 0.0, 0.0, 0;	[ 281 1395  586  834  879 1293  393  578 1102  963]
119: 0.0, 0.0, 0;	[ 879 1395  483  405  968  963  387  609  378 1348]
120: 0.0, 0.0, 0;	[ 834 1266  258 1348  821 1395  359 1090  393 1143]
121: 0.0, 0.0, 0;	[ 998  271 1152  590  408  405   10  965 1276  482]
122: 0.0, 0.0, 0;	[ 879    5  405 1102  517  590  437  958  945 1395]
123: 0.0, 0.0, 0;	[ 517 1395 1348 1067 1070  483   71 1102  578    5]
124: 0.0, 0.0, 0;	[ 405  879 1102  492  875  509    5  384  438  517]
125: 0.0, 0.0, 0;	[1206  235 1162  506  808 1219  446 1361  883  840]
126: 0.0, 0.0, 0;	[1197 1219 1282 1281 1278  208  541  554 1162  428]
127: 0.0, 0.0, 0;	[1111  405  501  925  877   41  400  203  672  879]
128: 0.0, 0.0, 0;	[1170  462  268  151  743  891  723   61 1117 1331]
129: 0.0, 0.0, 0;	[ 405  879  517 1132  708  910 1111  286  392 1314]
130: 0.0, 0.0, 0;	[1395  834 1067  281 1090  875 1143 1348  312 1266]
131: 0.0, 0.0, 0;	[ 

TF-IDF has some valid results too, but Eucledian distance is still a bad idea - if a query term occurs more times in the document, the distance will be larger and also distance is large, altough the distribution of terms are similar.

## Cosine similarity

Cosine similarity uses angle instead of distance (computes cosine of the angle between the query and document vectors). This means that long and short vectors now have comparable weights. A document ranks according to the angle of the query.

In [7]:
# Binary representation, Cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

calculateRelevanceScores(HashingVectorizer(binary=True), cosine_similarity)

Top relevant using <class 'sklearn.feature_extraction.text.HashingVectorizer'>, <function cosine_similarity at 0x000002823410D7B8>:
query number: P, R, F;	 calc. relevances, given relevances
1: 0.3, 0.10344827586206896, 0.15384615384615385;	[ 184   12   13 1268 1361 1003 1362  573  588  878]
2: 0.1, 0.04, 0.05714285714285714;	[  12  880  607  203 1063  700 1158  875 1111  672]
3: 0.4, 0.4444444444444444, 0.4210526315789474;	[181 399   5 350 485 285 387 861 378 281]
4: 0.1, 0.3333333333333333, 0.15384615384615383;	[ 166 1011  378 1085 1029 1190  543  291  332 1189]
5: 0.2, 0.4, 0.26666666666666666;	[ 103  943  355  670  488 1272 1102   26 1285 1296]
6: 0.0, 0.0, 0;	[ 817  544  387  485  228 1146  355  950 1139  678]
7: 0.1, 0.16666666666666666, 0.125;	[ 492  250  248  498 1231   10 1124  122 1006  106]
8: 0.3, 0.25, 0.2727272727272727;	[ 492  122  988   69  973  443  801   48  232 1297]
9: 0.3, 0.75, 0.4285714285714285;	[ 21  45 303 398  22  98 539 270 378 550]
10: 0.3, 0.33333333333333

97: 0.0, 0.0, 0;	[ 906  728  882 1045 1171  745  251  507  935  920]
98: 0.1, 0.16666666666666666, 0.125;	[ 638  521 1084  935  228 1323  371  291  236 1080]
99: 0.1, 0.2, 0.13333333333333333;	[832 745 153 835 817  32 403 639  42 253]
100: 0.3, 0.3, 0.3;	[1126  932 1131 1171 1030  760  822 1146 1122 1069]
101: 0.1, 0.14285714285714285, 0.11764705882352941;	[ 832  823 1060  817  953  223 1132 1120  460 1030]
102: 0.1, 0.2, 0.13333333333333333;	[ 910 1006  650   31 1007   67  286  516  461  998]
103: 0.1, 0.3333333333333333, 0.15384615384615383;	[ 906  320 1048  761 1030  882  265 1014  932  951]
104: 0.2, 0.3333333333333333, 0.25;	[ 241  835 1178  762  862  120 1024 1018  130  744]
105: 0.3, 0.5, 0.37499999999999994;	[ 953  848  887   26  847 1152  764 1023  224 1171]
106: 0.1, 0.16666666666666666, 0.125;	[1045  878  879  887  847 1145  552 1161  670  558]
107: 0.1, 0.125, 0.11111111111111112;	[ 909 1146  670  835  884 1367  882  220 1152 1062]
108: 0.1, 0.125, 0.11111111111111112;	[884

192: 0.3, 0.6, 0.4;	[ 735  875  647  641  551  745 1069  734 1359  386]
193: 0.4, 0.4, 0.4000000000000001;	[ 735  422  736  551  181 1030  733  398 1060  350]
194: 0.2, 0.2857142857142857, 0.23529411764705882;	[1293  642 1069 1029 1023 1046 1176  741  835  855]
195: 0.1, 0.25, 0.14285714285714288;	[ 642 1293  831 1055  887  863 1014 1045  932  938]
196: 0.1, 0.07692307692307693, 0.08695652173913043;	[1083  180    3  507   98    1  326  875  389  102]
197: 0.2, 0.5, 0.28571428571428575;	[ 884  723  768    6  726  543 1397 1318  775  724]
198: 0.1, 0.2, 0.13333333333333333;	[ 906 1030  320  594 1029 1171 1174  887   26  890]
199: 0.0, 0.0, 0;	[ 451  931  337  354 1118 1332 1059  474  739  597]
200: 0.0, 0.0, 0;	[ 854  935 1362  885  851  817  890 1029  400 1174]
201: 0.2, 0.11764705882352941, 0.14814814814814817;	[ 298 1296 1073  537  510  625   98 1379  598  396]
202: 0.2, 0.13333333333333333, 0.16;	[ 687 1285  920 1306  708  879 1378  663  214  925]
203: 0.1, 0.06666666666666667, 0.08;

Binary representation with cos. similarity has finally some valid results.

In [8]:
# pure Term Frequency, Cosine similarity

calculateRelevanceScores(CountVectorizer(), cosine_similarity)

Top relevant using <class 'sklearn.feature_extraction.text.CountVectorizer'>, <function cosine_similarity at 0x000002823410D7B8>:
query number: P, R, F;	 calc. relevances, given relevances
1: 0.5, 0.1724137931034483, 0.25641025641025644;	[184  13 429 578  51 588  12 430  14 376]
2: 0.2, 0.08, 0.11428571428571428;	[ 12 672 880 792 141 746  47 552 599 731]
3: 0.4, 0.4444444444444444, 0.4210526315789474;	[ 181  485  399  378  374  144  350 1204  354 1169]
4: 0.1, 0.3333333333333333, 0.15384615384615383;	[ 665  166  640  930 1224 1242  140  131   73  786]
5: 0.0, 0.0, 0;	[1374  503  542 1102   36  327  752  573  103  490]
6: 0.0, 0.0, 0;	[ 949  544   97 1007  379  657  489  469   59  455]
7: 0.2, 0.3333333333333333, 0.25;	[ 492 1231  122   32   56  640  354  197  709   62]
8: 0.3, 0.25, 0.2727272727272727;	[ 492  122  947 1231 1115  354  709 1179  708  427]
9: 0.3, 0.75, 0.4285714285714285;	[ 21 398 550 303  22 269 120 524 983 387]
10: 0.2, 0.2222222222222222, 0.2105263157894737;	[ 691  30

98: 0.0, 0.0, 0;	[674 376 606 315 745 704 672 371 979 935]
99: 0.0, 0.0, 0;	[ 376  184 1035 1387   36  403  469  962  752  152]
100: 0.4, 0.4, 0.4000000000000001;	[1131 1126  741  823 1122  739  760  822  642 1013]
101: 0.2, 0.2857142857142857, 0.23529411764705882;	[ 826  232   27  921  680  760 1075  825  987  801]
102: 0.1, 0.2, 0.13333333333333333;	[ 910  516  747  286  650 1065  124 1132   67  947]
103: 0.0, 0.0, 0;	[1238  770 1127  903   73   96  151  217  417  665]
104: 0.1, 0.16666666666666666, 0.125;	[1024   29  325  462  762 1362  980 1056  962  241]
105: 0.1, 0.16666666666666666, 0.125;	[ 96  73 665 166 170 445 786 151 764   8]
106: 0.1, 0.16666666666666666, 0.125;	[ 764 1290  954  635  236  908  594  178 1399  491]
107: 0.0, 0.0, 0;	[ 220 1333  928 1303  954  798  202  368 1122   78]
108: 0.2, 0.25, 0.22222222222222224;	[  96  723  881  151  951 1342    8  726  786  932]
109: 0.0, 0.0, 0;	[   5   31  391  864  859 1066   36  137  709 1219]
110: 0.0, 0.0, 0;	[ 334 1370  424  

195: 0.2, 0.5, 0.28571428571428575;	[ 642   73  734 1127  739  341  157   28  927   89]
196: 0.0, 0.0, 0;	[ 927  297  665 1337  442   73  277  799  170  222]
197: 0.2, 0.5, 0.28571428571428575;	[ 884  292  410 1201  903  150  768  381 1183 1240]
198: 0.0, 0.0, 0;	[1040  798 1238  308  217  797   73  334 1251  192]
199: 0.1, 0.1111111111111111, 0.10526315789473685;	[1118  474  451  374  748 1070 1055  378  544  931]
200: 0.0, 0.0, 0;	[ 885 1052  195 1399  739  824  622 1175 1362 1181]
201: 0.1, 0.058823529411764705, 0.07407407407407408;	[1379   89 1209 1072  625  410  452   73  531  927]
202: 0.1, 0.06666666666666667, 0.08;	[  41 1306  139  708 1324  568 1256  976  755 1213]
203: 0.0, 0.0, 0;	[1336 1073 1333  814  482  225  179  395 1018  202]
204: 0.1, 0.06666666666666667, 0.08;	[ 971 1227  326  513  573  310 1378 1133 1080 1311]
205: 0.0, 0.0, 0;	[ 72 899 335  71   3 457 664 336 326 376]
206: 0.1, 0.25, 0.14285714285714288;	[1290  635 1204 1303  441 1153  178 1121 1157 1181]
207: 0.3,

Pure Term Frequency with Cosine similarity is obviously better than the binary representation.

In [9]:
# TF-IDF, Cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer

calculateRelevanceScores(TfidfVectorizer(), cosine_similarity)

Top relevant using <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, <function cosine_similarity at 0x000002823410D7B8>:
query number: P, R, F;	 calc. relevances, given relevances
1: 0.5, 0.1724137931034483, 0.25641025641025644;	[  13  184   51   12  486  359  792  327 1268  429]
2: 0.4, 0.16, 0.22857142857142856;	[ 12  51 792 746 184 884 875 100 726 578]
3: 0.7, 0.7777777777777778, 0.7368421052631577;	[485 181 144   5 399 542  90 707  91 350]
4: 0.2, 0.6666666666666666, 0.30769230769230765;	[ 166 1275  185 1189  317  575  236 1242  827 1061]
5: 0.2, 0.4, 0.26666666666666666;	[ 103 1374  943 1102  540   26  552  401  360  575]
6: 0.0, 0.0, 0;	[ 472  775 1110  148  544  607  883  228 1363 1380]
7: 0.2, 0.3333333333333333, 0.25;	[ 492 1040 1231   56  122  434  354  947 1347  197]
8: 0.2, 0.16666666666666666, 0.1818181818181818;	[ 492  122  907  569  237  711   21  443 1231  947]
9: 0.3, 0.75, 0.4285714285714285;	[ 21  22 550 326 571 306 528 564 398 102]
10: 0.2, 0.22222222222222

93: 0.2, 1.0, 0.33333333333333337;	[ 635  355  691  302   68 1241  628  413  548   37]
94: 0.5, 0.38461538461538464, 0.4347826086956522;	[1393  559  283  564 1161  366  689  662  983  668]
95: 0.3, 1.0, 0.4615384615384615;	[ 662  283  564 1395  635  628   36 1393  101  370]
96: 0.4, 0.2857142857142857, 0.3333333333333333;	[ 637  701  699  698 1259  903  360  683 1332 1109]
97: 0.0, 0.0, 0;	[  36 1214 1270  329  728  807  409  251  953 1165]
98: 0.1, 0.16666666666666666, 0.125;	[1343  638  311  288  182 1393  315 1080 1211  376]
99: 0.1, 0.2, 0.13333333333333333;	[ 376  914  962   36  414  228  403 1379   94   77]
100: 0.5, 0.5, 0.5;	[1126 1122 1171  822 1013 1131 1051  823  739  760]
101: 0.4, 0.5714285714285714, 0.47058823529411764;	[ 817  826  760  825  820 1119  823  680  801  232]
102: 0.2, 0.4, 0.26666666666666666;	[ 910  660  516  998  728 1001  497  286 1361 1006]
103: 0.1, 0.3333333333333333, 0.15384615384615383;	[ 761  770  720 1048 1127 1049  951 1126  669 1214]
104: 0.1, 0.1

187: 0.1, 0.14285714285714285, 0.11764705882352941;	[ 743  841  889 1052  739  887  826  885  741  740]
188: 0.4, 0.3333333333333333, 0.3636363636363636;	[ 725  640  909  722 1270  497  137 1360  811  220]
189: 0.3, 0.3, 0.3;	[768 767 726 865 881 883 870 640 581 884]
190: 0.3, 0.5, 0.37499999999999994;	[ 390 1339  391   15  856  858   52  895  627  723]
191: 0.8, 0.5714285714285714, 0.6666666666666666;	[ 391  858  856  658  948  859  857 1008  894 1244]
192: 0.3, 0.6, 0.4;	[ 735  641  875  647  648 1202  388  745  734  215]
193: 0.7, 0.7, 0.7;	[ 730  735  733  736  641 1392  422  424  425  111]
194: 0.4, 0.5714285714285714, 0.47058823529411764;	[ 642  888  823 1131  744  760  741  885  739 1017]
195: 0.2, 0.5, 0.28571428571428575;	[ 642 1055  739  831 1054 1294 1131 1171 1382   28]
196: 0.0, 0.0, 0;	[ 540  442  349  966 1281 1337  359 1274  158  124]
197: 0.4, 1.0, 0.5714285714285715;	[ 884  881  768  726  723  746  865  883 1168   75]
198: 0.1, 0.2, 0.13333333333333333;	[ 889  743  73

As expected, the combination of TF-IDF and Cosine similarity returns the most valid results.

## Comments

> issues during the design/implementation:

As I am still a python newbie, I was struggling with its syntax, functions and so on (e.g. how to define own function, debugging, array functions and so on).

I could not resolve some problems when calculating Euclidean distance (Binary representation returning only zero P/R/F, TF returning the same results regardless the query).



> ideas for extensions/improvements/future work

Now I have a constant number of results (TOP_COUNT = 10). I would work on returning the results based on calculation dynamically (e.g. for binary representation, true/false).
I would also add an ordering based on the best P/R/F.