In [1]:
from huggingface_hub import hf_hub_download

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Only the first N_ARTICLES rows are used in this notebook, so let pandas stop
# parsing there instead of loading the whole CSV and slicing it afterwards.
N_ARTICLES = 10000

df_articles = pd.read_csv(
    hf_hub_download("fabiochiu/medium-articles", repo_type="dataset", filename="medium_articles.csv"),
    nrows=N_ARTICLES,
)

df_articles.head()


Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


In [3]:
# Fit a TF-IDF model on the article bodies and encode each article as one sparse row.
vectorizer = TfidfVectorizer()
# Cast to str first -- presumably so that non-string cells (e.g. missing text)
# cannot break tokenisation; confirm against the raw data.
corpus = df_articles["text"].astype(str)
corpus_vectorized = vectorizer.fit_transform(corpus)
# Expected shape: (number of articles, vocabulary size)
print(corpus_vectorized.shape)

(10000, 110038)


In [4]:
# Encode the search query with the already-fitted vectorizer (transform only, no
# refit), so the query vector has the same columns as the corpus matrix.
query = "data science nlp"
query_vectorized = vectorizer.transform([query])
print(query_vectorized.shape)

(1, 110038)


In [5]:
# Score every article against the query: (1 x vocab) @ (vocab x n_articles).
# TfidfVectorizer L2-normalises rows by default, so this dot product is the
# cosine similarity between the query and each article.
scores = query_vectorized @ corpus_vectorized.T
# Densify the single sparse result row into a flat 1-D array of scores.
scores_array = scores.toarray()[0]
print(scores_array.shape)

(10000,)


In [6]:
#retrieve the top_n indices with the highest scores

def show_best_results(df_articles, scores_array, top_n=10):
    """Print the titles of the best-scoring articles, highest score first.

    Args:
        df_articles: DataFrame with a "title" column. Rows are looked up by
            position, so row ``i`` must be the article scored by
            ``scores_array[i]``.
        scores_array: 1-D array of scores, one per row of ``df_articles``.
        top_n: Maximum number of results to print.

    Returns:
        None. One line per result is written to stdout in the form
        ``<rank> [score = <score>]: <title>``.
    """
    # argsort is ascending; reversing it puts the highest scores first.
    ranking = scores_array.argsort()[::-1][:top_n]
    for rank, article_pos in enumerate(ranking, start=1):
        best_title = df_articles.iloc[article_pos]["title"]
        best_score = scores_array[article_pos]
        print(f"{rank} [score = {best_score}]: {best_title}")

# List the best matches for the current query (top_n keeps its default of 10).
show_best_results(df_articles=df_articles, scores_array=scores_array)



1 [score = 0.5913069114145734]: What in the “Hello World” is Natural Language Processing (NLP)?
2 [score = 0.47487715081627846]: The Story of how Natural Language Processing is changing Financial Services in 2020
3 [score = 0.3672260843689108]: The Application of Natural Language Processing in OpenSearch
4 [score = 0.3483482100035714]: 5 Steps to Become a Data Scientist
5 [score = 0.3413479210936063]: Data Science Scholarships-Full-list Compilations.
6 [score = 0.3139018781861753]: Data science… without any data?!
7 [score = 0.3106738813439215]: Transform your Data Science Projects with these 5 Steps of Design Thinking
8 [score = 0.29735216501354833]: The Top Online Data Science Courses for 2019
9 [score = 0.2820392959961161]: How bad data is weakening the study of big data
10 [score = 0.27787107649790765]: I ranked every Intro to Data Science course on the internet, based on thousands of data points


In [7]:
#using a query containing stopwords
# Same pipeline as for the first query: encode, score against the corpus, rank.
# The vectorizer was built without a stop-word list, so "how", "to" and "the"
# are scored like any other term.
query = "how to learn the data science"
query_vectorized = vectorizer.transform([query])
scores = query_vectorized @ corpus_vectorized.T
scores_array = scores.toarray()[0]
show_best_results(df_articles, scores_array)

1 [score = 0.5287112127807346]: 5 Steps to Become a Data Scientist
2 [score = 0.506478132245523]: Data science… without any data?!
3 [score = 0.5060665245289075]: The Top Online Data Science Courses for 2019
4 [score = 0.5012839107561229]: Data Science, the Good, the Bad, and the… Future
5 [score = 0.4868361572835144]: Data Science Scholarships-Full-list Compilations.
6 [score = 0.48000221277779337]: Roadmap to Becoming a Successful Data Scientist
7 [score = 0.4749279770810061]: A Layman’s Guide to Data Science: How to Become a (Good) Data Scientist
8 [score = 0.4737720457472213]: Transform your Data Science Projects with these 5 Steps of Design Thinking
9 [score = 0.4696109348440433]: Why is Data Science Losing Its Charm?
10 [score = 0.4627538814657971]: I ranked every Intro to Data Science course on the internet, based on thousands of data points
