In [24]:
import pandas as pd
import requests
from keys import API_KEY # my api key is stored in a separate file

In [25]:
# Fetch articles that mention "music" from NewsAPI's /v2/everything endpoint,
# most popular first.
# NOTE: the start date is hard-coded, so "the last 30 days" only holds relative
# to 2023-04-09 -- update it before re-running this cell.
url = 'https://newsapi.org/v2/everything'
params = {
    'q': 'music',
    'from': '2023-04-09',
    'sortBy': 'popularity',
}
# The key is sent as a header rather than in the query string so that it can
# never show up in an error message or traceback that quotes the request URL.
headers = {'X-Api-Key': API_KEY}
# `params` lets requests URL-encode each value; the timeout keeps the cell
# from hanging indefinitely on a stalled connection.
response = requests.get(url, params=params, headers=headers, timeout=30)
# Fail here with the HTTP status instead of a confusing KeyError on 'articles'.
response.raise_for_status()
json_response = response.json()
articles = json_response['articles']

In [26]:
# Turn the list of article dictionaries into a table: one row per article,
# one column per dictionary key. Preview the first five rows.
df = pd.DataFrame(data=articles)
df.head(5)

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': 'engadget', 'name': 'Engadget'}",Igor Bonifacic,YouTube Music contractors vote overwhelmingly ...,"On Wednesday, a group of contractors at YouTub...",https://www.engadget.com/youtube-music-contrac...,https://s.yimg.com/uu/api/res/1.2/sBsdBENx0Xc8...,2023-04-26T21:38:44Z,"On Wednesday, a group of contractors at YouTub..."
1,"{'id': 'engadget', 'name': 'Engadget'}",Igor Bonifacic,Copyright in spotlight after streaming platfor...,If you spent almost any time on the internet t...,https://www.engadget.com/copyright-in-spotligh...,https://s.yimg.com/uu/api/res/1.2/roneRM7jZ5Gc...,2023-04-19T18:35:13Z,If you spent almost any time on the internet t...
2,"{'id': 'engadget', 'name': 'Engadget'}",Igor Bonifacic,Lofi Girl is back and she made a synthwave friend,"After a day-long absence, Lofi Girl has return...",https://www.engadget.com/lofi-girl-is-back-and...,https://s.yimg.com/uu/api/res/1.2/SDw4A9suayQJ...,2023-04-11T18:29:28Z,"After a day-long absence, Lofi Girl has return..."
3,"{'id': 'engadget', 'name': 'Engadget'}",Kris Holt,The Mario theme joins your old tweets in the L...,The Library of Congress\r\n has announced the ...,https://www.engadget.com/the-mario-theme-joins...,https://s.yimg.com/uu/api/res/1.2/Y7LpvkEwBGQY...,2023-04-12T17:05:02Z,The Library of Congress\r\n has announced the ...
4,"{'id': 'engadget', 'name': 'Engadget'}",Kris Holt,Spotify will shut down 'Heardle' on May 5th,Spotify has some disappointing news for Heardl...,https://www.engadget.com/spotify-will-shut-dow...,https://s.yimg.com/uu/api/res/1.2/QeXnzzAU2GBm...,2023-04-14T15:14:54Z,Spotify has some disappointing news for Heardl...


In [27]:
# convert source column to just the name of the source
# Each raw `source` cell is a dict such as {'id': 'engadget', 'name': 'Engadget'}.
# Only dict cells are unpacked, so re-running this cell after the column has
# already been flattened to plain strings is a no-op instead of a TypeError.
if 'source' in df.columns:
    df['source'] = df['source'].apply(
        lambda x: x['name'] if isinstance(x, dict) else x
    )
df.head()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,Engadget,Igor Bonifacic,YouTube Music contractors vote overwhelmingly ...,"On Wednesday, a group of contractors at YouTub...",https://www.engadget.com/youtube-music-contrac...,https://s.yimg.com/uu/api/res/1.2/sBsdBENx0Xc8...,2023-04-26T21:38:44Z,"On Wednesday, a group of contractors at YouTub..."
1,Engadget,Igor Bonifacic,Copyright in spotlight after streaming platfor...,If you spent almost any time on the internet t...,https://www.engadget.com/copyright-in-spotligh...,https://s.yimg.com/uu/api/res/1.2/roneRM7jZ5Gc...,2023-04-19T18:35:13Z,If you spent almost any time on the internet t...
2,Engadget,Igor Bonifacic,Lofi Girl is back and she made a synthwave friend,"After a day-long absence, Lofi Girl has return...",https://www.engadget.com/lofi-girl-is-back-and...,https://s.yimg.com/uu/api/res/1.2/SDw4A9suayQJ...,2023-04-11T18:29:28Z,"After a day-long absence, Lofi Girl has return..."
3,Engadget,Kris Holt,The Mario theme joins your old tweets in the L...,The Library of Congress\r\n has announced the ...,https://www.engadget.com/the-mario-theme-joins...,https://s.yimg.com/uu/api/res/1.2/Y7LpvkEwBGQY...,2023-04-12T17:05:02Z,The Library of Congress\r\n has announced the ...
4,Engadget,Kris Holt,Spotify will shut down 'Heardle' on May 5th,Spotify has some disappointing news for Heardl...,https://www.engadget.com/spotify-will-shut-dow...,https://s.yimg.com/uu/api/res/1.2/QeXnzzAU2GBm...,2023-04-14T15:14:54Z,Spotify has some disappointing news for Heardl...


In [28]:
# clean the title column
import re

def clean(title):
    """Strip every character except ASCII letters, digits and spaces.

    Parameters
    ----------
    title : str
        Raw headline text. Any non-string value (e.g. ``None`` or ``NaN``
        standing in for a missing title) is treated as an empty headline.

    Returns
    -------
    str
        ``title`` with all other characters deleted. Characters are removed,
        not replaced by a space, so ``"day-long"`` becomes ``"daylong"``.
    """
    if not isinstance(title, str):
        # re.sub raises TypeError on None/NaN; a missing title has no text.
        return ''
    return re.sub(r'[^a-zA-Z0-9 ]', '', title)

# Run every headline through clean() and overwrite the title column with the
# result, then preview the first five rows.
df['title'] = df['title'].map(clean)
df.head(5)

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,Engadget,Igor Bonifacic,YouTube Music contractors vote overwhelmingly ...,"On Wednesday, a group of contractors at YouTub...",https://www.engadget.com/youtube-music-contrac...,https://s.yimg.com/uu/api/res/1.2/sBsdBENx0Xc8...,2023-04-26T21:38:44Z,"On Wednesday, a group of contractors at YouTub..."
1,Engadget,Igor Bonifacic,Copyright in spotlight after streaming platfor...,If you spent almost any time on the internet t...,https://www.engadget.com/copyright-in-spotligh...,https://s.yimg.com/uu/api/res/1.2/roneRM7jZ5Gc...,2023-04-19T18:35:13Z,If you spent almost any time on the internet t...
2,Engadget,Igor Bonifacic,Lofi Girl is back and she made a synthwave friend,"After a day-long absence, Lofi Girl has return...",https://www.engadget.com/lofi-girl-is-back-and...,https://s.yimg.com/uu/api/res/1.2/SDw4A9suayQJ...,2023-04-11T18:29:28Z,"After a day-long absence, Lofi Girl has return..."
3,Engadget,Kris Holt,The Mario theme joins your old tweets in the L...,The Library of Congress\r\n has announced the ...,https://www.engadget.com/the-mario-theme-joins...,https://s.yimg.com/uu/api/res/1.2/Y7LpvkEwBGQY...,2023-04-12T17:05:02Z,The Library of Congress\r\n has announced the ...
4,Engadget,Kris Holt,Spotify will shut down Heardle on May 5th,Spotify has some disappointing news for Heardl...,https://www.engadget.com/spotify-will-shut-dow...,https://s.yimg.com/uu/api/res/1.2/QeXnzzAU2GBm...,2023-04-14T15:14:54Z,Spotify has some disappointing news for Heardl...


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a TF-IDF vocabulary of single words and two-word phrases from the
# titles, then encode every title as one row of the resulting matrix.
vectMatrix = TfidfVectorizer(ngram_range=(1, 2))
vectMatrix.fit(df['title'])
tfidf = vectMatrix.transform(df['title'])

In [41]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the articles whose titles are most similar to ``title``.

    The query is cleaned with ``clean``, vectorised with the fitted
    ``vectMatrix`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text query.
    top_n : int, optional
        Maximum number of rows to return (default 5).

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df``, best match first. Rows with zero
        similarity are left out, so the frame can be shorter than ``top_n``
        or empty when nothing matches.
    """
    title = clean(title)
    title_vect = vectMatrix.transform([title])
    similarity = cosine_similarity(title_vect, tfidf).flatten()
    # argsort is ascending: reverse for best-first order, then keep the head.
    ranked = np.argsort(similarity)[::-1][:top_n]
    # A score of 0 means no shared terms; without this filter the tail of the
    # ranking would pad the result with unrelated articles.
    ranked = ranked[similarity[ranked] > 0]
    return df.iloc[ranked]

array([0.14075371, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.16090068, 0.14435768, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.11078928, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.12849462, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.14357152, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.12812

In [44]:
# test it out and display results
import ipywidgets as widgets
from IPython.display import display

# Area below the text box where the search results are rendered.
article_list = widgets.Output()

# Editable text box holding the query, pre-filled with an example.
article_input = widgets.Text(value='Youtube music', description='Article:', disabled=False)

def on_type(change):
    """Refresh the results area when the text box value changes.

    ``change`` is the change dictionary passed by the widget observer; its
    ``'new'`` entry is the current text. The previous results are always
    cleared, and a new search is shown only when the text is longer than
    five characters.
    """
    article_list.clear_output()
    with article_list:
        query = change['new']
        if len(query) <= 5:
            # Too short to search; leave the results area empty.
            return
        display(search(query))

# Call on_type whenever the text box's `value` trait changes.
article_input.observe(handler=on_type, names='value')

# Render the text box with the results area beneath it.
display(article_input, article_list)

Text(value='Youtube music', description='Article:')

Output()