In [1]:
#pip install flask

In [2]:
#pip install yake

In [3]:
from flask import Flask, jsonify,abort
import requests
from bs4 import BeautifulSoup
import yake

In [4]:
# Create the Flask application object; the @app.route decorators in the
# following cells register their view functions on this instance.
app = Flask(__name__)

In [5]:
# Function used to scrape publication data from Google DeepMind
def _stripped_text(node):
    """Return the whitespace-stripped text of a parsed tag, or '' if the tag is missing."""
    return node.text.strip() if node is not None else ''


def scrape_deepmind_publications(url='https://deepmind.com/research/publications/', timeout=10):
    """
    Fetch the publications page and extract one record per listed publication.

    Parameters:
    url (str): Page to scrape. Defaults to the DeepMind publications listing.
    timeout (float): Seconds to wait for the HTTP response before giving up.

    Returns:
    list: A list of dicts with the keys 'date', 'title', 'authors' and 'venue'
          (all strings). An empty list is returned when the page cannot be
          fetched, does not answer with HTTP 200, or does not contain the
          expected publication list.
    """
    # A timeout is required: without one, requests may wait forever on a
    # stalled connection and block every endpoint that calls this function.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f'Failed to retrieve the webpage: {exc}')
        return []

    if response.status_code != 200:
        print('Failed to retrieve the webpage')
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the unordered list with publications; find() yields None when the
    # markup does not contain it (e.g. the site layout changed).
    publication_list = soup.find('ul', {'class': 'list-compact'})
    if publication_list is None:
        print('Failed to locate the publication list on the webpage')
        return []

    publication_data = []

    # Each publication is a direct <li> child of the unordered list
    for publication in publication_list.find_all('li', recursive=False):
        title_node = publication.find('span', {'class': 'list-compact__inner'})
        if title_node is None:
            # Not a publication entry we know how to read; skip it rather than crash.
            continue

        # The first caption holds the authors and the second the venue;
        # either may be absent, in which case an empty string is stored.
        captions = publication.find_all('dd', {'class': 'glue-caption'})
        authors = _stripped_text(captions[0]) if len(captions) > 0 else ''
        venue = _stripped_text(captions[1]) if len(captions) > 1 else ''

        publication_data.append({
            # NOTE: the raw <time> text is kept as-is; it may contain more than
            # one rendering of the date separated by a newline.
            'date': _stripped_text(publication.find('time')),
            'title': _stripped_text(title_node),
            'authors': authors,
            'venue': venue
        })

    return publication_data

# Call the function and print results.
# NOTE: running this cell performs a live HTTP request and prints every
# scraped record (one dict per line), so the output can be long.
publications = scrape_deepmind_publications()
for pub in publications:
    print(pub)


{'date': '17 Jun 24\n17 June 2024', 'title': 'Neural Fields as Distributions: Signal Processing Beyond Euclidean Space', 'authors': 'Daniel Rebain, Soroosh Yazdani, Kwang Moo Yi, Andrea Tagliasacchi', 'venue': 'CVPR 2024'}
{'date': '17 Jun 24\n17 June 2024', 'title': 'Mirasol3B: A Multimodal Autoregressive Model for Time-Aligned and Contextual Modalities', 'authors': 'AJ Piergiovanni, Isaac Noble, Dahun Kim, Michael S. Ryoo, Victor Gomes, Anelia Angelova', 'venue': 'CVPR 2024'}
{'date': '17 Jun 24\n17 June 2024', 'title': "Bayes' Rays: uncertainty quantification for neural radiance fields", 'authors': 'Lily Goli, Cody Reading, Silvia Sellan, Alec Jacobson, Andrea Tagliasacchi', 'venue': 'CVPR 2024'}
{'date': '7 May 24\n7 May 2024', 'title': 'π2vec: Policy Representations with SuccessorFeatures', 'authors': 'Gianluca Scarpellini*, Ksenia Konyushkova, Claudio Fantacci, Tom Paine, Yutian Chen, Misha Denil', 'venue': 'ICLR 2024'}
{'date': '7 May 24\n7 May 2024', 'title': 'Kalman Filter for

In [6]:
# Root endpoint: plain-text greeting
@app.route('/')
def home():
    """Return the welcome message served at the API root."""
    greeting = "Welcome to the DeepMind News API!"
    return greeting


In [7]:
# Endpoint returning the titles of the first five scraped publications
@app.route('/get_data', methods=['GET'])
def get_data():
    """Scrape the publications and respond with a JSON list of up to five titles."""
    leading_entries = scrape_deepmind_publications()[:5]
    titles = []
    for entry in leading_entries:
        titles.append(entry['title'])
    return jsonify(titles)

In [8]:
# Endpoint listing every scraped publication with a 1-based number
@app.route('/articles', methods=['GET'])
def get_articles():
    """Scrape the publications and respond with a JSON list of numbered records."""
    numbered = []
    for position, entry in enumerate(scrape_deepmind_publications(), start=1):
        # Positions start at 1 so they match the /article/<number> URLs
        numbered.append({
            'number': position,
            'title': entry['title'],
            'date': entry['date'],
            'authors': entry['authors'],
            'venue': entry['venue'],
        })
    return jsonify(numbered)

In [9]:
# Endpoint returning a single publication selected by its 1-based number
@app.route('/article/<int:number>', methods=['GET'])
def get_article(number):
    """
    Respond with the JSON record of the publication at position `number`.

    Aborts with HTTP 404 when no scraped publication has that number.
    """
    for position, entry in enumerate(scrape_deepmind_publications(), start=1):
        if position != number:
            continue
        return jsonify({
            'number': position,
            'title': entry['title'],
            'date': entry['date'],
            'authors': entry['authors'],
            'venue': entry['venue'],
        })
    # Fell through the loop: nothing matched the requested number
    abort(404, description="Resource not found")


In [10]:
# Keyword extraction (YAKE) to identify key topics from article titles
def keyword_extraction(text, num_keywords=5):
    """
    Extract keywords from the given text using YAKE keyword extraction.

    Parameters:
    text (str): The input text to analyze.
    num_keywords (int): The number of top keywords to extract.

    Returns:
    list: A list of at most `num_keywords` keywords, best-scoring first.
          Empty when `text` is empty/blank or `num_keywords` is not positive.
    """
    # Nothing to extract: avoid building an extractor, and avoid the
    # surprising "all but the last N" slice a negative count would produce.
    if not text or not text.strip() or num_keywords <= 0:
        return []

    # Pass the requested count to YAKE as well; its own default limit would
    # otherwise cap the result regardless of num_keywords.
    kw_extractor = yake.KeywordExtractor(top=num_keywords)
    keywords = kw_extractor.extract_keywords(text)
    # Each item is indexed as (keyword, score); sort ascending by score and keep the top ones
    sorted_keywords = sorted(keywords, key=lambda x: x[1])[:num_keywords]
    return [kw[0] for kw in sorted_keywords]  # Return only the keywords, not the scores


In [11]:
# Endpoint running keyword extraction on every publication title, or on one chosen by number
@app.route('/ml', defaults={'number': None})
@app.route('/ml/<int:number>', methods=['GET'])
def ml_analysis(number):
    """
    Respond with a JSON object mapping publication numbers to extracted keywords.

    With no number in the URL, every scraped publication is analysed. With a
    number, only that publication is analysed; HTTP 404 is returned when no
    publication has that number.
    """
    # 1-based number -> title, in scrape order
    titles_by_number = {}
    for position, entry in enumerate(scrape_deepmind_publications(), start=1):
        titles_by_number[position] = entry['title']

    if number is not None:
        if number not in titles_by_number:
            abort(404, description="Resource not found")
        return jsonify({number: keyword_extraction(titles_by_number[number])})

    results = {}
    for position, title in titles_by_number.items():
        results[position] = keyword_extraction(title)
    return jsonify(results)

In [12]:
#from threading import Thread

#def run_app():
    #app.run(port=5000)

#flask_thread = Thread(target=run_app)
#flask_thread.start()


In [13]:
# Start Flask's built-in server on port 5001 with debug mode off.
# NOTE: app.run() does not return until the server is stopped, so when run
# in a notebook this cell stays busy while the API is being served.
if __name__ == '__main__':
    app.run(debug=False,port=5001)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5001/ (Press CTRL+C to quit)
127.0.0.1 - - [05/Apr/2024 18:08:43] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Apr/2024 18:08:56] "[37mGET /get_data HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Apr/2024 18:09:37] "[37mGET /articles HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Apr/2024 18:09:48] "[37mGET /article/1 HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Apr/2024 18:10:00] "[37mGET /ml HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Apr/2024 18:10:11] "[37mGET /ml/2 HTTP/1.1[0m" 200 -
