In [8]:
# Necessary imports
import math
import operator
from csv import reader

import nltk
import pandas as pd

In [9]:
#Helper functions

""" 
For a given document, the function takes the raw counts of all the unique words 
found in the entire corpus and the respective tokens (words) in the document.

It then calculates the TF value for each token in the document and returns a 
dictionary object.

Datatype for raw counts (wordDict) is a dictionary -> the keys in 
the dictionary correspond to the unique words/ tokens in the entire corpus.

Datatype for document specific tokens (bagOfWords) is a list of tokens.
"""
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency (TF) of every vocabulary word for one document.

    Args:
        wordDict: dict mapping each vocabulary word to its raw count in the
            document.
        bagOfWords: list of the document's tokens; only its length is used,
            as the normalising denominator.

    Returns:
        dict mapping every key of ``wordDict`` to ``count / len(bagOfWords)``.
        An empty ``bagOfWords`` yields 0.0 for every word instead of raising
        ZeroDivisionError.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # A document without tokens has no meaningful frequencies; report
        # zeros rather than dividing by zero.
        return {word: 0.0 for word in wordDict}
    total = float(bagOfWordsCount)
    return {word: count / total for word, count in wordDict.items()}

""" 
For a given document in the corpus, the function creates a dictionary of all the
unique words in a corpus and their corresponding counter in that document.

uniqueWordsDataframe is a set of all the unique tokens within a corpus.

text is the raw text of an individual document.
"""
def get_num_words(text, uniqueWordsDataframe):
    """Count how often each vocabulary word occurs in one document.

    Args:
        text: raw text of a single document; it is lower-cased and split on
            whitespace before counting.
        uniqueWordsDataframe: iterable (a set in this notebook) of the
            vocabulary words; each of them becomes a key of the result.

    Returns:
        dict mapping every vocabulary word to its count in ``text`` (0 when
        the word is absent). Tokens that are not part of the vocabulary are
        ignored, so out-of-corpus text does not raise KeyError.
    """
    num_words_dict = dict.fromkeys(uniqueWordsDataframe, 0)
    for token in text.lower().split():
        # Only count known words: adding unknown ones would change the set of
        # keys that every document vector is expected to share.
        if token in num_words_dict:
            num_words_dict[token] += 1
    return num_words_dict

"""
For the whole corpus, this function calculates the IDF value of each word/
token: it takes the raw per-document counts, counts in how many documents each
word occurs, and then computes log(N / document frequency) for each word.

The function takes in a list of all the document-keyword-count dictionaries,
we get these by passing the documents through get_num_words function
"""
def computeIDF(documents):
    """Compute the inverse document frequency (IDF) of every vocabulary word.

    Args:
        documents: list of per-document count dicts (word -> raw count), as
            produced by ``get_num_words``. The keys of the first dict define
            the vocabulary; a word in a later dict that is missing from the
            first one raises KeyError.

    Returns:
        dict mapping each word to ``log(N / df)``, where ``N`` is the number
        of documents and ``df`` the number of documents in which the word has
        a count greater than zero. Words that occur in no document get 0.0
        instead of raising ZeroDivisionError. An empty ``documents`` list
        yields an empty dict.
    """
    if not documents:
        return {}

    N = len(documents)

    # Document frequency: in how many documents does each word occur?
    doc_freq = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                doc_freq[word] += 1

    return {
        word: math.log(N / float(df)) if df > 0 else 0.0
        for word, df in doc_freq.items()
    }

"""
This function takes the TF dictionary and the IDFs dictionary and calculates 
the corresponding TFIDF values for each document in the corpus

It returns a TFIDF dictionary for each document
"""
def computeTFIDF(tfBagOfWords, idfs):
    """Weight the term frequencies of one document by the IDF values.

    ``tfBagOfWords`` maps each word to its TF in the document and ``idfs``
    maps each word to its IDF. The result maps every word of
    ``tfBagOfWords`` to ``TF * IDF``; a word missing from ``idfs`` raises
    KeyError.
    """
    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}

"""
The function below computes the cosine similarity for two lists v1 and v2
using the formula: (v1 dot v2) / (||v1|| * ||v2||)
"""
def cosine_similarity(v1,v2):
    """Return the cosine similarity of two equally long numeric sequences.

    Computes ``(v1 . v2) / (||v1|| * ||v2||)``.

    Args:
        v1: sequence of numbers; its length drives the iteration.
        v2: sequence of numbers, indexed at the same positions as ``v1``
            (IndexError if it is shorter than ``v1``).

    Returns:
        The similarity as a float. When either vector has zero norm (all
        zeros, or empty input) the similarity is undefined and 0.0 is
        returned instead of raising ZeroDivisionError.
    """
    dot = norm1_sq = norm2_sq = 0.0
    for i, x in enumerate(v1):
        y = v2[i]
        norm1_sq += x * x
        norm2_sq += y * y
        dot += x * y

    denominator = math.sqrt(norm1_sq * norm2_sq)
    if denominator == 0:
        return 0.0
    return dot / denominator

"""
This is really similar to the compute IDF function except just returns zero for 
all the tokens that are not present in a document but are there in a corpus.

Its ideal use is for a calculating IDF values for out of corpus documents.
"""
def computeIDF_singlequery(documents):
    """Compute IDF values, assigning 0 to words that occur in no document.

    Args:
        documents: list of per-document count dicts (word -> raw count). The
            keys of the first dict define the vocabulary.

    Returns:
        dict mapping each word to ``log(N / df)`` (``N`` = number of
        documents, ``df`` = number of documents containing the word), or 0
        when the word occurs in none of the documents. An empty
        ``documents`` list yields an empty dict.
    """
    if not documents:
        return {}

    N = len(documents)

    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1

    for word, val in idfDict.items():
        # Explicit zero check instead of a bare ``except``: only the
        # "word never occurs" case is meant to map to 0; any other error
        # should surface.
        idfDict[word] = math.log(N / float(val)) if val > 0 else 0
    return idfDict

"""
This is really similar to the computeTFIDF function, except that it starts 
from raw text: it counts the tokens of the text, computes their TF values and 
weights them with the corpus-level IDF values (idfs_list).

Its ideal use is for a calculating TFIDF values for out of corpus documents.
"""
def get_tfidf_ooc(text, uniqueWordsDataframe, idfs=None):
    """Compute the TF-IDF dictionary of a text that is not part of the corpus.

    Args:
        text: raw text of the query document.
        uniqueWordsDataframe: the corpus vocabulary, passed on to
            ``get_num_words``.
        idfs: optional dict mapping each vocabulary word to its IDF. When
            omitted, the notebook-level ``idfs_list`` is used, which is what
            this function always used before the parameter existed.

    Returns:
        dict mapping every vocabulary word to its TF-IDF weight in ``text``.
    """
    if idfs is None:
        # Fall back to the corpus-level IDF table defined at notebook level.
        idfs = idfs_list
    num_words_dict_text = get_num_words(text, uniqueWordsDataframe)
    tf = computeTF(num_words_dict_text, text.lower().split())
    # The former call to computeIDF_singlequery([num_words_dict_text]) was
    # dropped: its result was never used, and with a single document every
    # word that occurs gets log(1 / 1) = 0, so it could not serve as a weight.
    return computeTFIDF(tf, idfs)

In [5]:
# List the files available in the processed-data directory.
!ls ../utils/data/processed_data

dev.csv  train.csv


In [6]:
# train.csv has no header row (its first line is already a data record), so
# read it with header=None; otherwise the first record is consumed as column
# names. Columns are then labelled 0..5, matching the positions used when
# indexing the rows of list_of_rows.
df = pd.read_csv("../utils/data/processed_data/train.csv", header=None)

In [11]:
# Read the CSV file as a list of rows (each row is a list of strings).
# newline='' hands line-ending handling to the csv module, as its
# documentation requires, so quoted fields containing line breaks are parsed
# correctly.
with open('../utils/data/processed_data/train.csv', 'r', newline='') as read_obj:
    list_of_rows = list(reader(read_obj))

In [12]:
# Preview only the first few rows; echoing the whole list would flood the
# notebook output.
list_of_rows[:5]

[['00050cbb-049e-444f-a17a-04c882da4693-1',
  'Chad went to get the wheel alignment measured on his car.',
  'The mechanic provided a working alignment with new body work.',
  'Chad was waiting for his car to be washed.',
  'Chad was waiting for his car to be finished.',
  '2'],
 ['00050cbb-049e-444f-a17a-04c882da4693-1',
  'Chad went to get the wheel alignment measured on his car.',
  'The mechanic provided a working alignment with new body work.',
  "Chad's car had all sorts of other problems besides alignment.",
  "Chad's car had all sorts of benefits other than being sexy.",
  '1'],
 ['00050cbb-049e-444f-a17a-04c882da4693-1',
  'Chad went to get the wheel alignment measured on his car.',
  'The mechanic provided a working alignment with new body work.',
  "Chad's mechanic said he had major alignment problems.",
  "Chad's mechanic said he found no problems with his car.",
  '1'],
 ['00050cbb-049e-444f-a17a-04c882da4693-1',
  'Chad went to get the wheel alignment measured on his car.

In [20]:
# Build the corpus vocabulary: every distinct lower-cased token that occurs in
# one of the four text columns (row[1] .. row[4]) of any row.
# row[1] is included because get_num_words is later called on it as well, and
# tokens are produced with str.split() (any whitespace) -- the same way
# get_num_words tokenises -- so every token it looks up is present here.
unique_words_set = set()
for row in list_of_rows:
    for column_index in (1, 2, 3, 4):
        unique_words_set.update(row[column_index].lower().split())

In [21]:
# Size of the vocabulary: the number of distinct tokens collected in
# unique_words_set.
len(unique_words_set)

44762

In [None]:
# Count, for every row of the corpus, how often each vocabulary token occurs
# in each of its four text columns. One list of count dictionaries is built
# per column (row[1] .. row[4]), and a progress message is printed after each.

num_words_dict_list_obs1 = [
    get_num_words(row[1], unique_words_set) for row in list_of_rows
]
print("obs1 done")

num_words_dict_list_obs2 = [
    get_num_words(row[2], unique_words_set) for row in list_of_rows
]
print("obs2 done")

num_words_dict_list_hyp1 = [
    get_num_words(row[3], unique_words_set) for row in list_of_rows
]
print("hyp1 done")

num_words_dict_list_hyp2 = [
    get_num_words(row[4], unique_words_set) for row in list_of_rows
]
print("hyp2 done")

In [None]:
# For each of the first 1000 documents, compute the TF dictionary and collect
# the results in a list. Documents without usable text are skipped and their
# index is printed.
# NOTE(review): `num_words_dict_list` and the "news_text" column are not
# defined in the visible part of this notebook -- confirm which count list and
# which text column are meant before running this cell.
# NOTE(review): skipping a document shifts the positions of all later entries
# in tf_list relative to the corpus indices -- confirm this is acceptable.

# Looked up outside the try block so that a missing column fails loudly
# instead of being reported as 1000 "skipped" documents.
news_texts = df["news_text"]

tf_list = []

for i in range(1000):
  try:
    tf_list.append(computeTF(num_words_dict_list[i], news_texts[i].lower().split()))
  except (AttributeError, ZeroDivisionError, KeyError, IndexError):
    # AttributeError: the text is missing (presumably a NaN float, which has
    #   no .lower()).
    # ZeroDivisionError: the text is empty, so computeTF has nothing to
    #   divide by.
    # KeyError / IndexError: fewer than 1000 documents are available.
    print(i)

In [None]:
# Compute the corpus-wide IDF value of every vocabulary token. Despite the
# variable name, the result is a single dictionary (token -> IDF), not a list.
# NOTE(review): `num_words_dict_list` is not defined in the visible cells of
# this notebook (only the *_obs1/_obs2/_hyp1/_hyp2 lists are) -- confirm.

idfs_list = computeIDF(num_words_dict_list)

In [None]:
# Turn every document's TF dictionary into a TF-IDF dictionary by weighting it
# with the corpus-wide IDF values; the results are collected in a list.

tfidf_list = [computeTFIDF(doc_tf, idfs_list) for doc_tf in tf_list]

In [None]:
# Convert the list of TF-IDF dicts into a DataFrame: one row per dict in
# tfidf_list, one column per dictionary key (token).

tfidf_df = pd.DataFrame(tfidf_list) 

In [None]:
# Display the first five rows of the TF-IDF DataFrame as a sanity check.

tfidf_df.head()

Unnamed: 0,vergitterung:,"mars,",gemalt.,fdp-wähler,zerstörte.,stützpunkt,washington/brüssel,lachen.,"nietzsche,",photos,iran-chef!,dutzend,das!,einfachheit,wüssten,nikab,geologen,99,kleinwagen,produktvorstellung,feige.,unangemessene,ablassen.,erläuterte,antichrist-wachtturm,einzulegen,"""mickrige""",zuversichtlich,errichtetes,studienplatz,streichen,gemeint,"ausdrücklich,",wenige,"augen-,",furcht,grimmiger,unnötigkeit,weiträumig,unpraktisch:,...,keynote,zusehends,durchfall-epidemie,verspäten,glätten:,"gedanke,","belegen,","wagte,",bezieht),wäre?,"untersagen,",lautet:,geschichte.,ausgedruckter,zurück:,"""suv-fahrer",unschlagbaren,enklavë,suizid,deniz,"ehrensache,",us-wahl,"geht,",guttenberg.,"einpacken,","versammelt,",ich-geh-lieber-grillon,entlarven,zeugnisse,placebo-effekt,ungerechtigkeiten,innere,krisenpolitik,"gründen.""","breiter""",verhütungszwecken,auflösung.,nützlich,em-titel?,wenigstens
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# A function to calculate the cosine similarity between two strings given by the 
# end user

def calculate_cosine_similarity(query1, query2, uniqueWordsDataframe):
  """Return the cosine similarity of the TF-IDF vectors of two texts.

  Args:
    query1: first raw text.
    query2: second raw text.
    uniqueWordsDataframe: the corpus vocabulary, passed on to get_tfidf_ooc.

  Returns:
    The value returned by cosine_similarity for the two TF-IDF vectors.
  """
  q1 = get_tfidf_ooc(query1, uniqueWordsDataframe)
  q2 = get_tfidf_ooc(query2, uniqueWordsDataframe)
  # Read both vectors in the same word order. Plain lists are all that
  # cosine_similarity needs, so no DataFrame round-trip is required.
  words = list(q1)
  return cosine_similarity([q1[w] for w in words], [q2[w] for w in words])

In [None]:
# Test run with two identical strings
# NOTE(review): `uniqueWordsDataframe` is not defined at notebook level in the
# visible cells (the vocabulary built earlier is `unique_words_set`) -- confirm
# which name is intended.

calculate_cosine_similarity("Es kommentiert Edgar Bebeling", "Es kommentiert Edgar Bebeling", uniqueWordsDataframe)

1.0

In [None]:
# Test run with slight dissimilarities in the strings (the second string
# lacks the token "Edgar")
# NOTE(review): `uniqueWordsDataframe` is not defined at notebook level in the
# visible cells (the vocabulary built earlier is `unique_words_set`) -- confirm
# which name is intended.

calculate_cosine_similarity("Es kommentiert Edgar Bebeling", "Es kommentiert Bebeling", uniqueWordsDataframe)

0.8033558574619695

In [None]:
# A function to return the cosine similarity scores for each document in the 
# corpus

def find_similar_articles(document, tfidf_df, uniqueWordsDataframe):
  """Score every row of a TF-IDF DataFrame against a query text.

  Args:
    document: raw text of the query document.
    tfidf_df: DataFrame of TF-IDF weights; as built in this notebook it has
      one row per corpus document and one column per vocabulary token.
    uniqueWordsDataframe: the corpus vocabulary, passed on to get_tfidf_ooc.

  Returns:
    dict mapping the positional row index of every row of ``tfidf_df`` to its
    cosine similarity with the query. The score is 0 when the similarity is
    undefined because one of the two vectors is all zeros.
  """
  query_tfidf = get_tfidf_ooc(document, uniqueWordsDataframe)
  # Align the query with the DataFrame's columns by name rather than by
  # position; a column the query has no weight for contributes 0.
  query_vector = [query_tfidf.get(column, 0.0) for column in tfidf_df.columns]

  cosine_similarity_dict = {}
  # Iterate over the rows that actually exist instead of a fixed 1000.
  for i in range(len(tfidf_df)):
    candidate_vector = tfidf_df.iloc[i].tolist()
    try:
      cos_sim_val = cosine_similarity(candidate_vector, query_vector)
    except ZeroDivisionError:
      # Raised by a cosine implementation that divides by a zero norm.
      cos_sim_val = 0
    cosine_similarity_dict[i] = cos_sim_val

  return cosine_similarity_dict

In [None]:
# Test Case: 1 - 1st Document from the corpus
text = '''Es kommentiert Edgar Bebeling (CDU), Mitglied der Enquete-Kommion "Zweiräder im Straßenverkehr": 
Die aktuellen Diskuonen über Radwege, Verkehrspolitik und Mobilität verfügen über alle  Elemente, um - endlich? - 
den lang erwarteten und von einigen vielleicht  ersehnten "Clash of Cyclists" zu provozieren. 
Es ist der Kampf  zwischen der schönen neuen Flachfahrradwelt und dem realen Leben. 
Während  die "Flachfahrradfahrer" den realen Hochradfahrer zum Dinosaurier erklären,  
vergessen sie dabei, dass es sich bei dieser Lebensform um die große  Mehrheit der Menschen handelt. 
Auf Mehrheitsverhältnisse haben Revolutionen indessen nie wirklich Rücksicht genommen.  
Die Schlachtordnung der letzten Tage erweckt den Eindruck,  wir seien im dritten Teil von "Der Herr der Zweiräder" 
angekommen,  und der Endkampf um die Radwege stehe bevor. Das ist die Gelegenheit,  schon jetzt einen vorgezogenen Nachruf 
auf die Radfahrer, die Kämpfer für gleich große Vorder- und Hinterräder zu formulieren. Denn, liebe "Fahrradfahrer": 
Ihr werdet den Kampf verlieren. Und das ist nicht die  Offenbarung eines einsamen Apokalyptikers, es ist die Perspektive eines 
geschichtsbewussten Hochradfahrers. Auch die Zweirad-Revolution wird ihre  Kinder entlassen. Und das flache Fahrrad wird bald Geschichte sein. 
Es stellt  sich nur die Frage, wie viel Fußgängerblut bis dahin vergossen wird.   Versteht die Welt nicht mehr: Bebeling auf seinem Hochrad 
Denn  es ist Aufmerksamkeit geboten. Auch wenn das flache Fahrrad als imaginäres  Lebensgefühl einer verlorenen Generation schon bald Geschichte 
sein mag,  so hat es allemal das Zeug zum Destruktiven. Wenn wir nicht wollen,  dass sich nach dem Abzug der Radfahrerhorden und des 
Schlachtennebels  nur noch die ruinenhaften Stümpfe unserer Gesellschaft in die Sonne  recken und wir auf der holprigen, 
verbrannten Erde unserer Kultur mit unseren Hochrädern fahren müssen,  dann heißt es, jetzt wachsam zu sein. 
Also, Großbürger, auf zur Wacht! Es  lohnt sich, unser Recht auf das Hochradfahren auch im Straßenverkehr zu verteidigen! 
Die Gesellschaft der Hochradfahrer mit ihren Zylindern, Monokeln und Fräcken hat sich in mühevoller Arbeit aus den Barrikaden der 
Französischen Revolution heraus geformt – so entstand der Velozipedist. Und  genau dort, in den Gassen von Paris im Jahr 1789, wurde die 
Idee des Hochradfahrens geboren. Welche Errungenschaft wider das Zufußgehen und Reiten des Ancien Régime! Endlich konnte man - abhängig von 
Herkunft und Status - mit seinem teuren Hochrad stolz und erhobenen Hauptes durch die Gegend radeln. Diese Idee des Hochrades sollte sich als 
pedalbetriebener Motor für  Innovation und Entwicklung auf dem europäischen Kontinent erweisen. Eine Fortbewegungsart, deren Bewahrung auch im 
Zeitalter des flachen Fahrrades lohnt. Sie ist im modernen Straßenverkehr in Gefahr. Nicht weil flache Fahrräder aus sich heraus 
wie kleine Drahtesel an den Ideen und Idealen unserer großbürgerlichen  Gesellschaft knabbern würden. Nein, es sind die Menschen, 
die auf diesen widerlich niedrigen Fahrrädern in gebückter Haltung sitzen und eine andere Mobilität wollen. Die die totale  Freiheit 
der Fahrzeugwahl apostrophieren und damit letztlich nur den "velozipedalen  Totalitarismus", wie es Jaron Lavier genannt hat, meinen. 
Es ist eine  unheilige Allianz aus diesen "radelnden Maoisten" und kapitalstarken Velozipedisten, die hier am Werk ist. 
Auch wenn sie sagen, sie seien die  Guten - nur weil man sagt, man sei gut, ist man es noch lange nicht. Nun 
haben die Fahrradhersteller in den letzten Tagen ihren starken Arm  gezeigt. Doch Herculesse und Gazellen dieser Welt, lasst euch zurufen: 
Auch wenn Hercules für einen Tag keine Fahrräder produziert und Gazelle-Fahrräder ohne Lenkstange gefahren werden, ist das nicht das Ende der 
Mobilität der Menschheit.  Welche Hybris! Lasst euch gesagt sein: Die Mobilität und vor allem die aufrechte Haltung der Welt liegen immer 
noch in den Waden der Menschen. Also,  Großbürger, geht auf die Barrikaden und radelt. Am besten auf einem Hochrad aus dem 19. Jahrhundert! 
Natürlich verändert die  fortschreitende Umrüstung auf flache Fahrräder unsere Gesellschaft. Vieles wird  einfacher. 
Auch dieser Text ist mit Hilfe der Errungenschaften der Velozipedisierung durch einen Fahrradkurier an die Postillon-Redaktion 
geschickt worden. Aber wir sollten uns zu wehren beginnen,  wenn einzelne Menschen auf den vielen Fahrrädern uns unsere 
Lebensentwürfe vorschreiben. Noch ist es dazu nicht zu spät. Wir  dürfen die Gestaltung der Zukunft nicht denen überlassen, 
die sich als radfahrende Avantgarde verstehen und meinen, sie wüssten, was das Beste für  die Masse Mensch auf den Zweirädern sei. 
Mountain-Biker und BMXler sind jedenfalls dabei  der schlechteste Ratgeber. Sie achten den Straßenverlauf des anderen nicht, 
setzen ihre hervorragenden aerodynamischen Eigenschaften nur für den eigenen Vorteil ein, sind darauf bedacht, im Gelände und in 
Halfpipes zu tricksen, was das Zeug hält. Und offensichtlich  sind Narzissmus und Flachfahrradzismus Zwillinge. Natürlich soll niemandem  
verboten werden, auf einem Bonanzarad seine zweite Pubertät zu durchleben. Nur  sollte man das nicht zum politischen Programm erheben. 
Jetzt haben wir  noch die Zeit, diesem Treiben Einhalt zu gebieten. Wir brauchen den Hochradfahrer, dem Werte wie fehlende Gangschaltung, 
schlechte Lenkbarkeit und tiefe, schmerzhafte Stürze auch im Straßenverkehr am  Herzen liegen.
'''
# Score the text above against every document in tfidf_df.
# NOTE(review): `uniqueWordsDataframe` is not defined at notebook level in the
# visible cells (the vocabulary built earlier is `unique_words_set`) -- confirm
# which name is intended.
test_case1 = find_similar_articles(text, tfidf_df, uniqueWordsDataframe)

In [None]:
# The 100 corpus documents most similar to the query, best match first.
# Each entry is a tuple: (index of the document in the corpus, cosine
# similarity score with the query).

sorted(test_case1.items(), key=operator.itemgetter(1), reverse=True)[:100]

[(0, 1.0),
 (930, 0.047951464424019326),
 (543, 0.04561047227186854),
 (412, 0.04340434931883307),
 (858, 0.04315365367703091),
 (170, 0.04088824815090807),
 (550, 0.03987891944007292),
 (577, 0.03912831513130655),
 (769, 0.038186242830282165),
 (196, 0.03816342878747472),
 (665, 0.03629304840728634),
 (387, 0.03595683918566671),
 (793, 0.035926209598206346),
 (386, 0.035874601830993974),
 (669, 0.03584709287490496),
 (60, 0.035836089712085906),
 (287, 0.03580776594307533),
 (320, 0.035786523135320165),
 (529, 0.035587708128434765),
 (905, 0.035405970762498004),
 (462, 0.03538073048982467),
 (334, 0.03531514201544353),
 (198, 0.03524086693619325),
 (277, 0.03474083695737546),
 (830, 0.03461512460181474),
 (337, 0.03449922074243761),
 (992, 0.03415814870335947),
 (533, 0.03376653727284227),
 (323, 0.03371823295763141),
 (825, 0.033609080334280034),
 (180, 0.033221657707415635),
 (961, 0.03295340669802094),
 (247, 0.03293957837758639),
 (362, 0.03280623474974012),
 (886, 0.03273483338106

In [None]:
# Test Case: 2 - 2nd Document from the corpus
# NOTE(review): `uniqueWordsDataframe` is not defined at notebook level in the
# visible cells (the vocabulary built earlier is `unique_words_set`) -- confirm
# which name is intended.

test_case2 = find_similar_articles(' +++ Diskrete Hilfe für Namenlose: Unbekannter gründet Anonyme Anonyme +++ +++ Historiker besorgt: Ausgrabungen zur Tortenschlacht bei Leipzig geraten zur Farce +++ +++ Nutzlos bei Glatteis: Streukäse +++ +++ Hart für Xavier & Co.: Söhne Mannheims zur Adoption freigegeben +++   Jetzt bestellen! Die besten Newsticker als Buch (nur 9,99€): Der Postillon: +++ Newsticker +++ ', tfidf_df, uniqueWordsDataframe)

In [None]:
# The 100 corpus documents most similar to the query, best match first.
# Each entry is a tuple: (index of the document in the corpus, cosine
# similarity score with the query).

sorted(test_case2.items(), key=operator.itemgetter(1), reverse=True)[:100]

[(1, 1.0),
 (526, 0.1005405761739281),
 (469, 0.08629510816064836),
 (458, 0.08250278245641793),
 (996, 0.07878818408605438),
 (202, 0.0750357146795267),
 (132, 0.07417466849247337),
 (7, 0.07406874251965759),
 (318, 0.07312790620076733),
 (260, 0.07240009882854205),
 (424, 0.07181018719071532),
 (851, 0.07121684166830046),
 (915, 0.07051695602537196),
 (290, 0.06986691473568833),
 (878, 0.06980847023530368),
 (209, 0.06980102335331688),
 (908, 0.06928418372678231),
 (360, 0.06920231291867567),
 (400, 0.06919029485384369),
 (266, 0.06867758537744086),
 (398, 0.06866004392762715),
 (794, 0.06856937508766361),
 (216, 0.06812622340193207),
 (862, 0.06746331695227518),
 (984, 0.06684926336873057),
 (829, 0.0668038400659891),
 (163, 0.06612769757632841),
 (635, 0.06599185280626026),
 (154, 0.06566904786413254),
 (46, 0.06564451605506993),
 (959, 0.06529334279223144),
 (64, 0.06415646014456021),
 (485, 0.06354015967527882),
 (759, 0.06349752208822318),
 (743, 0.06329207404566477),
 (637, 0.0

In [None]:
# Test Case: 3 - document at index 5 of the corpus (the recorded ranking for
# this query lists index 5 with a score of 1.0)
# NOTE(review): `uniqueWordsDataframe` is not defined at notebook level in the
# visible cells (the vocabulary built earlier is `unique_words_set`) -- confirm
# which name is intended.

test_case3 = find_similar_articles(' +++ Hat ihn fallen gelassen: Trapezkünstler beendet Freundschaft mit Kollegen +++ +++ Vor Gericht gezerrt: Gast verklagt Restaurant wegen Verletzung bei Vorspeise +++ +++ Ausgerechnet im e-π-Zentrum: Erdbeben verwüstet Institut für irrationale Zahlen +++ +++ Nehmen sich nicht viel: Models trotz Buffet alle gleich mager +++ +++ Beförderung: Busfahrer für erfolgreiche Arbeit endlich belohnt +++ +++ Quacksalber: Wunderheiler behandelt Ente mit Zaubercreme +++ +++ Die Passage muss gestrichen werden: Einkaufszentrum wehrt sich gegen Renovierungsforderungen aus der Zeitung +++   Alle Newsticker im Postillon-Archiv: Hier! ', tfidf_df, uniqueWordsDataframe)

In [None]:
# The 100 corpus documents most similar to the query, best match first.
# Each entry is a tuple: (index of the document in the corpus, cosine
# similarity score with the query).

sorted(test_case3.items(), key=operator.itemgetter(1), reverse=True)[:100]

[(5, 1.0),
 (333, 0.07180822500709202),
 (70, 0.0622235117797341),
 (371, 0.06112857831590281),
 (526, 0.06003985340570722),
 (655, 0.0577128267787493),
 (378, 0.053346938245086684),
 (88, 0.05312565354281311),
 (570, 0.052958902786126826),
 (435, 0.05256186767549235),
 (690, 0.0503226756142022),
 (907, 0.05018860314217637),
 (312, 0.048589016474379326),
 (370, 0.04784921591619722),
 (984, 0.047471816248939286),
 (833, 0.04605651906324786),
 (939, 0.04605141439267114),
 (677, 0.0456943917734406),
 (496, 0.044314461244057295),
 (78, 0.04415476123439908),
 (713, 0.044144563906680354),
 (660, 0.04391061541338713),
 (782, 0.04220483508771618),
 (352, 0.04159157656460002),
 (989, 0.04132195790553874),
 (642, 0.04094592628575958),
 (311, 0.04090691704387496),
 (715, 0.040884077006779596),
 (775, 0.04010383423192159),
 (822, 0.04002934609675735),
 (220, 0.03979504695026026),
 (743, 0.039758522916380275),
 (400, 0.03912735999386268),
 (646, 0.038900109643459975),
 (817, 0.03833437200648335),
 