# AI Enhanced News Search Utility

In [1]:
import gradio                       as gr
import pandas                       as pd

import ArticleImportUtils           as AIU
import DisplayArticlesUtilities     as DAU
import ArticleTranslationUtility    as ATU

from datetime                       import datetime
from transformers                   import MBartForConditionalGeneration, MBart50TokenizerFast




### Initialize Variables

In [4]:
# Stylesheet that is prepended to the generated article table.
#   * rows get a light-grey background while hovered
#   * the row carrying the "selected" class is highlighted
# "dodgerblue" is a CSS named colour and must be written WITHOUT a leading "#";
# "#dodgerblue" is not a valid colour value, so the declaration was being dropped.
html_css = """
<style>
    table tr:hover {
        background-color: #f5f5f5; /* Light grey background on hover #f0f0f0*/
    }
    .selected {
        background-color: dodgerblue; /* Color for a selected row #a3d2ca*/
    }
</style>

"""

# Script fragment that is appended to the generated article table.
# It defines selectRow(row), which the table rows reference from their onclick
# attribute: the "selected" class is moved from the previously clicked row to
# the row that was just clicked.
# NOTE(review): confirm that the HTML component actually executes embedded
# <script> tags; if it does not, selectRow() is never defined in the page.
js = """
<script>
    //function updateDetails(index) {
    //    const details = {top_headlines['url'].to_json()};
    //    document.getElementById('details').innerText = details[index];
    //}

    var prevRow = null; // Track the previously clicked row
    function selectRow(row) {
        if (prevRow) {
            prevRow.classList.remove('selected');   // Remove the background from the previous row
        }
        row.classList.add('selected');              // Add background to the current row
        prevRow = row;                              // Update the previously clicked row
    }
</script>
"""

In [5]:
# (label, value) pairs offered in the "Source Country" checkbox group.
# The values are two-letter country codes (presumably ISO 3166-1 alpha-2, like
# the other entries in this list - confirm against the news API in use).
# Fixed: China was mapped to "zh" (a language code) and Israel to "is"
# (Iceland's country code); they now use "cn" and "il".
# Codes that are available but currently disabled are kept as comments.
country_choice  = [
    #  "ae"
      ("Argentina"      , "ar")
    #, "at"
    , ("Australia"      , "au")
    #, "be"
    #, "bg"
    , ("Brazil"         , "br")
    , ("Canada"         , "ca")
    , ("China"          , "cn")
    #, "ch"
    #, "co"
    #, "cu"
    #, "cz"
    #, "eg"
    , ("France"         , "fr")
    , ("Germany"        , "de")
    #, "gr"
    #, "hk"
    #, "hu"
    #, "id"
    , ("India"          , "in")
    , ("Ireland"        , "ie")
    , ("Israel"         , "il")
    , ("Italy"          , "it")
    #, ("Japan"          , "jp")
    #, "kr"
    #, "lt"
    #, "lv"
    #, "ma"
    #, ("Mexico"         , "mx")
    #, "my"
    #, "ng"
    , ("Netherlands"    , "nl")
    , ("Norway"         , "no")
    #, "nz"
    #, "ph"
    , ("Pakistan"       , "pk")
    #, "pl"
    #, "pt"
    #, "ro"
    #, "rs"
    , ("Russia"         , "ru")
    , ("Saudi Arabia"   , "sa")
    , ("South Africa"   , "za")
    , ("Sweden"         , "se")
    #, "sg"
    #, "si"
    #, "sk"
    #, "th"
    #, "tr"
    #, "tw"
    #, "ua"
    , ("United Kingdom" , "gb")
    , ("United States"  , "us")
    #, "ve"

]

# (label, value) pairs offered in the "Output Language" dropdown.
# Values are two-letter language codes (presumably ISO 639-1 - confirm against
# the translation utility before re-enabling translation).
# Fixed: the "Portuguese" label was misspelled and Urdu used the non-existent
# code "ud" instead of "ur".
language_choice = [
      ("Arabic"     , "ar")
    , ("Chinese"    , "zh")
    , ("Dutch"      , "nl")
    , ("English"    , "en")
    , ("French"     , "fr")
    , ("German"     , "de")
    , ("Hebrew"     , "he")
    , ("Italian"    , "it")
    , ("Norwegian"  , "no")
    , ("Portuguese" , "pt")
    , ("Russian"    , "ru")
    , ("Spanish"    , "es")
    , ("Swedish"    , "sv")
    , ("Urdu"       , "ur")
  ]

#pt = Portuguese
#sv = Swedish

### Global Functions

In [6]:
def get_list_of_articles(keyword = None, countries = None) :
    """
    Fetch articles for a search term and return them as a flat DataFrame.

    Parameters
    ----------
    keyword : str | None
        Search term forwarded to ``AIU.get_article_data`` as ``query``.
    countries : list[str] | None
        Source country codes forwarded as ``country``. ``None`` (the default)
        is treated as an empty list.

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` applied to the list built by
        ``DAU.CreateArticleList`` from the fetched data.
    """
    # Build a fresh list per call instead of using a mutable default argument:
    # a shared default list would leak state between calls if a callee ever
    # modified it.
    source_countries = [] if countries is None else countries

    news        = AIU.get_article_data(country = source_countries, language = None, query = keyword)
    articles    = DAU.CreateArticleList(news)
    return pd.json_normalize(articles)

# *Test Code*

In [22]:
# Ad-hoc check of the import/display helpers: fetch articles for one fixed
# query and two source countries, then show both views of the result.
news        = AIU.get_article_data(country = ["us", "fr"], language = None, query = "israel")
articles    = DAU.CreateArticleList(news)
# Processed article list, flattened into a DataFrame (all rows are displayed).
display(pd.json_normalize(articles))
# The payload stored under the "news" key, flattened the same way for comparison.
articles_df = pd.json_normalize(news["news"])

#display(news)
#articles = DAU.CreateArticleList(news)
#display(pd.json_normalize(articles).head())
display(articles_df.head())

Getting article data for query: israel
{'text': 'israel', 'source-countries': ['us', 'fr']}


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Token indices sequence length is longer than the specified maximum sequence length for this model (6619 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,title,author,content,url,published_date,keywords,image,sentiment.label,sentiment.score
0,"As Israel-Hamas war reaches 100-day mark, here...",[JULIA FRANKEL],Menu Menu World U.S. Election 2024 Politics Sp...,https://apnews.com/article/war-gaza-israel-ham...,2024-01-14 02:06:53,"[menu, menu, world, u, ., s, ., election, 202,...",https://dims.apnews.com/dims4/default/63abb09/...,NEGATIVE,0.992899
1,Israel pushes deeper into Gaza as UN estimates...,[Greg Wehner],46Posts Sort by NewestSort by Oldest Back to T...,https://www.foxnews.com/live-news/november-8-i...,2023-11-09 05:57:38,"[46, ##post, ##s, sort, by, newest, ##sor, ##t...",https://static.foxnews.com/static/orion/styles...,NEGATIVE,0.989961
2,"Israel, Hamas trade blame over deadly hospital...",[JOSEPH EID],30Posts Sort by NewestSort by Oldest Back to T...,https://www.foxnews.com/live-news/october-17-2...,2023-10-18 04:40:34,"[30, ##post, ##s, sort, by, newest, ##sor, ##t...",https://static.foxnews.com/static/orion/styles...,NEGATIVE,0.996981
3,"100 days of agony, captured in photos of the I...",[ODED BALILTY],Menu Menu World U.S. Election 2024 Politics Sp...,https://apnews.com/article/israel-palestinians...,2024-01-13 11:06:52,"[menu, menu, world, u, ., s, ., election, 202,...",https://dims.apnews.com/dims4/default/c7e9b30/...,NEGATIVE,0.931587
4,A timeline of Israel-Palestine peace negotiations,[Nicole Narea],Even though Israel has approved a temporary ce...,https://www.vox.com/world-politics/2023/11/22/...,2023-11-22 13:00:00,"[even, though, israel, has, approved, a, tempo...",https://cdn.vox-cdn.com/thumbor/ntPGosDUvq9KOD...,POSITIVE,0.999163
5,Israel-Hamas war live updates: IDF is preparin...,"[Adam Jeffery, Amanda Macias, CNBC, CNBC and C...",This is CNBC's live blog tracking developments...,https://www.nbcchicago.com/news/business/money...,2023-10-14 23:21:33,"[this, is, cn, ##bc, ', s, live, blog, trackin...",https://media.nbcchicago.com/2023/10/107317425...,NEGATIVE,0.78085
6,Israel-Hamas war: Israel to reopen key Gaza bo...,[The Associated Press],Amid widespread outrage over a strike that kil...,https://apnews.com/article/israel-hamas-war-la...,2024-04-05 20:40:00,"[amid, widespread, outrage, over, a, strike, t...",https://dims.apnews.com/dims4/default/063923d/...,NEGATIVE,0.990209
7,Israel’s Strategic Challenge,[Blaise Misztal],"For nearly two decades, Israel has eschewed ma...",https://warontherocks.com/2023/10/israels-stra...,2023-10-30 08:45:27,"[for, nearly, two, decades, ,, israel, has, es...",https://warontherocks.com/wp-content/uploads/2...,POSITIVE,0.985119
8,Israel-Hamas war live updates: Israeli forces ...,"[CNBC, CNBC and Joanna Tan, Natasha Turak, Reb...",This is CNBC's live blog tracking developments...,https://www.nbcchicago.com/news/business/money...,2023-10-28 22:28:36,"[this, is, cn, ##bc, ', s, live, blog, trackin...",https://media.nbcchicago.com/2023/10/107324944...,NEGATIVE,0.58879
9,Live updates: Gaza Health Ministry asks anyone...,[],Menu Menu U.S. World Politics Video Spotlight ...,https://apnews.com/live/israel-hamas-war-news-...,2023-10-20 02:13:34,"[menu, menu, u, ., s, ., world, politics, vide...",https://dims.apnews.com/dims4/default/87c813f/...,NEGATIVE,0.990899


Unnamed: 0,title,author,content,url,published_date,keywords,image,sentiment.label,sentiment.score
0,"As Israel-Hamas war reaches 100-day mark, here...",[JULIA FRANKEL],Menu Menu World U.S. Election 2024 Politics Sp...,https://apnews.com/article/war-gaza-israel-ham...,2024-01-14 02:06:53,"[menu, menu, world, u, ., s, ., election, 202,...",https://dims.apnews.com/dims4/default/63abb09/...,NEGATIVE,0.992899
1,Israel pushes deeper into Gaza as UN estimates...,[Greg Wehner],46Posts Sort by NewestSort by Oldest Back to T...,https://www.foxnews.com/live-news/november-8-i...,2023-11-09 05:57:38,"[46, ##post, ##s, sort, by, newest, ##sor, ##t...",https://static.foxnews.com/static/orion/styles...,NEGATIVE,0.989961
2,"Israel, Hamas trade blame over deadly hospital...",[JOSEPH EID],30Posts Sort by NewestSort by Oldest Back to T...,https://www.foxnews.com/live-news/october-17-2...,2023-10-18 04:40:34,"[30, ##post, ##s, sort, by, newest, ##sor, ##t...",https://static.foxnews.com/static/orion/styles...,NEGATIVE,0.996981
3,"100 days of agony, captured in photos of the I...",[ODED BALILTY],Menu Menu World U.S. Election 2024 Politics Sp...,https://apnews.com/article/israel-palestinians...,2024-01-13 11:06:52,"[menu, menu, world, u, ., s, ., election, 202,...",https://dims.apnews.com/dims4/default/c7e9b30/...,NEGATIVE,0.931587
4,A timeline of Israel-Palestine peace negotiations,[Nicole Narea],Even though Israel has approved a temporary ce...,https://www.vox.com/world-politics/2023/11/22/...,2023-11-22 13:00:00,"[even, though, israel, has, approved, a, tempo...",https://cdn.vox-cdn.com/thumbor/ntPGosDUvq9KOD...,POSITIVE,0.999163


### Gradio : Layout / Subroutines / Application

In [7]:
# Create Layout of Gradio form
#
# Every component keeps its own Python name. The "Additional Reading" section
# used to re-assign html_results / html_code / article_list / article_id /
# text_language, so the event handlers defined further down ended up reading
# from and writing to the (hidden) recommendation widgets instead of the
# search-result widgets. The recommendation widgets now carry a "rec_" prefix.
with gr.Blocks(fill_height=True, theme="") as app: #Interstellar #Base #dracula_revamped #storj_theme #minimalist #calm_seafoam
    # --- Search parameters ---------------------------------------------------
    with gr.Accordion("Parameters", open=True) as srch_parameters:
        with gr.Row() :
            text_keyword        = gr.Textbox(
                label           = "Search Term"
              , placeholder     = "Enter your search term here"
            )

        with gr.Row() :
            text_source         = gr.CheckboxGroup(
                choices         = country_choice
              , label           = "Source Country (Leave unchecked for ALL)"
            )

        submit_btn = gr.Button("Search")

    # --- Search results (hidden until a search has been run) -----------------
    with gr.Accordion("Search Results", open=False, visible=False) as srch_results:
        with gr.Row() :
            with gr.Column(scale=2) :
                with gr.Tab("Article List") :
                    html_results    = gr.HTML()

                with gr.Tab("HTML Source") :
                    html_code       = gr.TextArea()

                with gr.Tab("Data") :
                    article_list    = gr.Dataframe()

            with gr.Column(scale=1) :
                # Articles are numbered from 1 in the rendered list, so 0 is
                # not a valid choice.
                article_id          = gr.Number(
                      label         = "Article Number to Retrieve:"
                    , minimum       = 1
                    , precision     = 0
                )
                text_language       = gr.Dropdown(
                      choices       = language_choice
                    , value         = "en"
                    , interactive   = True
                    , label         = "Output Language"
                )
                retrieve_btn        = gr.Button("Retrieve Article")#, size="sm")
                recommendation_btn  = gr.Button("Get Additional Recommendations")#, size="sm")

    # --- Additional reading / recommendations (hidden) -----------------------
    with gr.Accordion("Additional Reading", open=False, visible=False) as recommendations:
        with gr.Row() :
            with gr.Column(scale=2) :
                with gr.Tab("Article List") :
                    rec_html_results    = gr.HTML()

                with gr.Tab("HTML Source") :
                    rec_html_code       = gr.TextArea()

                with gr.Tab("Data") :
                    rec_article_list    = gr.Dataframe()

            with gr.Column(scale=1) :
                rec_article_id          = gr.Number(
                      label             = "Article Number to Retrieve:"
                    , minimum           = 1
                    , precision         = 0
                )
                rec_text_language       = gr.Dropdown(
                      choices           = language_choice
                    , value             = "en"
                    , interactive       = True
                    , label             = "Output Language"
                )
                retrieve_more_btn       = gr.Button("Retrieve Article")#, size="sm")

    # --- Selected article (hidden until an article has been retrieved) -------
    with gr.Accordion("Article", open=False, visible=False) as srch_article:
        with gr.Row() :
            html_article    = gr.TextArea(
                  elem_id   = "Chosen_Article"
                , label     = ""
            )

    def _format_published_date(raw) :
        """Return `raw` as 'MM/DD/YYYY  hh:mm:ss AM/PM'; blank stays blank, unparseable text is returned unchanged."""
        text = str(raw).strip()
        if not text :
            return ""
        try :
            return datetime.fromisoformat(text.replace('Z', '')).strftime('%m/%d/%Y  %I:%M:%S %p')
        except ValueError :
            return text

    def _format_author(author) :
        """Return the author value as display text; a list of names is joined with commas."""
        if isinstance(author, (list, tuple)) :
            return ", ".join(str(name) for name in author)
        return str(author).strip()

    # Search button function
    ## This function needs to be embedded in the Gradio Block
    ## in order to control the Gradio Controls in the form
    def submit_search(keyword, countries=None) :
        """
        Run a search and render the resulting articles as an HTML table.

        Parameters
        ----------
        keyword : str
            Search term typed into the "Search Term" textbox.
        countries : list[str] | None
            Country codes ticked in the source checkbox group. Empty or None
            means no country filter is passed on.

        Returns
        -------
        dict
            Mapping of Gradio components to their new values: the parameter
            accordion is collapsed and relabelled, the results accordion is
            shown, and the HTML view, the HTML source view and the data table
            are filled.
        """
        # Function-scope import: only this function needs it. Titles, authors
        # and URLs come from external news feeds, so they are escaped before
        # being placed into markup.
        from html import escape

        countries = list(countries or [])

        # Get the headlines for the search term (and source countries, if any)
        if countries :
            headlines = get_list_of_articles(keyword=keyword, countries=countries)
        else :
            headlines = get_list_of_articles(keyword=keyword)

        # Blank out missing values, then drop every row in which some field
        # holds the "[Removed]" marker.
        headlines = (
            headlines
            .fillna("")
            .replace("[Removed]", pd.NA)
            .dropna()
        )

        # Newest first. An empty result set has no columns at all, so only
        # sort when the column is actually there.
        if "published_date" in headlines.columns :
            headlines = headlines.sort_values(by="published_date", ascending=False)
        headlines = headlines.reset_index(drop=True)

        rows = []
        for index, article in headlines.iterrows() :
            title           = escape(str(article.get("title", "")))
            author          = escape(_format_author(article.get("author", "")))
            publish_date    = escape(_format_published_date(article.get("published_date", "")))
            source          = escape(str(article.get("url", "")))
            sentiment       = escape(str(article.get("sentiment.label", "")))
            image           = str(article.get("image", "")).strip()

            row  = f'\t<tr id="article_{index}" onclick="selectRow(this)"><td>\n'
            row += '\t<table><tr><td>'
            if image :
                row += f'<img src="{escape(image, quote=True)}" style="height:100px;object-fit:cover;" />'
            row += '</td><td>'
            row += f"\t\t<h3>{index+1}. {title}</h3>\n"
            row += f"<div>\t\t{publish_date}\n</div>"
            row += f'<div style="float: right;">Sentiment: {sentiment}</div><br />'
            if author :
                row += f"\t\t{author}<br />\n"
            row += f"\t\t{source}\n"
            row += "\t\t</td></tr>\n"
            row += "\t</table>"
            row += "\t</td></tr>\n"
            rows.append(row)

        html_content    = html_css + "<table>\n" + "".join(rows) + "</table>"

        country_label   = ", ".join(str(code).upper() for code in countries) or "ALL"
        params          = f"Parameters  (Search Term: {keyword}, Source Countries: {country_label})"

        return {
              srch_parameters   : gr.Accordion(open=False   , label=params)
            , srch_results      : gr.Accordion(open=True    , visible=True)
            , html_results      : html_content + js
            , html_code         : html_content + js
            , article_list      : headlines
        }
    def get_recommendations(keyword) :
        """
        Placeholder for the "Get Additional Recommendations" feature.

        The original definition had neither a colon nor a body, which is a
        SyntaxError that prevents the whole cell from running. Until the
        feature is written, calling this fails loudly rather than silently
        doing nothing.
        """
        raise NotImplementedError("Additional recommendations are not implemented yet.")
        
    # Retrieve function
    ## This function needs to be embedded in the Gradio Block
    ## in order to control the Gradio Controls in the form
    def retrieve_article(id, articles_df, language=None) :
        """
        Show the text of the article chosen by its number in the result list.

        Parameters
        ----------
        id : int
            1-based article number, as displayed in front of each headline.
        articles_df : pandas.DataFrame
            Contents of the "Data" table; must provide "title" and "content".
        language : str | None
            Selected output language. Currently unused because translation is
            disabled (see TODO below).

        Returns
        -------
        dict
            Mapping of Gradio components to their new values: parameter and
            result accordions are collapsed, the article accordion is opened
            and the article text area is filled and labelled.

        Raises
        ------
        gr.Error
            If no search results are loaded, or the article number is missing
            or outside 1..len(articles_df).
        """
        # TODO: translation is switched off. The disabled implementation loaded
        # "facebook/mbart-large-50-many-to-many-mmt" through
        # MBartForConditionalGeneration / MBart50TokenizerFast (max_length=512)
        # and passed the title and content through
        # ATU.TranslateArticleText(text, language, model).

        has_articles = (
            articles_df is not None
            and len(articles_df) > 0
            and "title"   in articles_df.columns
            and "content" in articles_df.columns
        )
        if not has_articles :
            raise gr.Error("Run a search before retrieving an article.")

        if id is None :
            raise gr.Error("Enter the number of the article to retrieve.")

        # Without this range check 0 would silently select the LAST article
        # (iloc[-1]) and too-large numbers would raise a bare IndexError.
        article_number = int(id)
        if not 1 <= article_number <= len(articles_df) :
            raise gr.Error(f"Article number must be between 1 and {len(articles_df)}.")

        article             = articles_df.iloc[article_number - 1]
        article_headline    = article["title"]
        article_text        = article["content"]

        # Create label for article display (same numbering as the result list)
        params = f"Article: {article_number}. {article_headline}"

        return {
              srch_parameters   : gr.Accordion(open=False)
            , srch_results      : gr.Accordion(open=False)
            , srch_article      : gr.Accordion(open=True, visible=True)
            , html_article      : gr.TextArea(value=article_text, label=params)
        }
    
    # "Search" button: run the search and refresh both accordions, the HTML
    # views and the data table.
    submit_btn.click(
        submit_search,
        inputs=[text_keyword, text_source],
        outputs=[srch_parameters, srch_results, html_results, html_code, article_list],
    )

    # "Retrieve Article" button: show the chosen article and collapse the
    # other sections. No handlers are attached in this cell to
    # recommendation_btn or retrieve_more_btn.
    retrieve_btn.click(
        retrieve_article,
        inputs=[article_id, article_list, text_language],
        outputs=[srch_parameters, srch_results, srch_article, html_article],
    )

# Start the Gradio app defined above; show_error=True is passed so that
# errors raised inside handlers are reported to the user.
app.launch(show_error=True)




Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




Getting article data for query: Artificial Intelligence
{'text': 'Artificial Intelligence', 'source-countries': ['us'], 'earliest-publish-date': datetime.datetime(2024, 4, 15, 14, 20, 22, 595905)}


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
