# AI Enhanced News Search Utility

In [15]:
import gradio                       as gr
import pandas                       as pd
import matplotlib.pyplot            as plt

import ArticleImportUtils           as AIU
import DisplayArticlesUtilities     as DAU
import ArticleRecommendation        as ARU
import ArticleTranslationUtility    as ATU

from datetime                       import datetime
from transformers                   import MBartForConditionalGeneration, MBart50TokenizerFast

### Initialize Variables

In [7]:
# Stylesheet fragment that generate_html_list() places at the start of the
# HTML it builds; the single rule changes the background of a hovered table row.
html_css = """
<style>
    table tr:hover {
        background-color: #f5f5f5; /* Light grey background on hover #f0f0f0*/
    }
</style>

"""

In [3]:
# (label, value) pairs offered by the "Source Country" checkbox group.
# Each value is a two-letter country code (ISO 3166-1 alpha-2) that is passed
# on as a source country of the article search.  Bare commented-out codes are
# countries that are currently not offered.
country_choice  = [
    #  "ae"
    #,
      ("Argentina"      , "ar")
    #, "at"
    , ("Australia"      , "au")
    #, "be"
    #, "bg"
    , ("Brazil"         , "br")
    , ("Canada"         , "ca")
    , ("China"          , "cn")     # was "zh", which is the Chinese *language* code
    #, "ch"
    #, "co"
    #, "cu"
    #, "cz"
    #, "eg"
    , ("France"         , "fr")
    , ("Germany"        , "de")
    #, "gr"
    #, "hk"
    #, "hu"
    #, "id"
    , ("India"          , "in")
    , ("Ireland"        , "ie")
    , ("Israel"         , "il")     # was "is", which is the code for Iceland
    , ("Italy"          , "it")
    #, ("Japan"          , "jp")
    #, "kr"
    #, "lt"
    #, "lv"
    #, "ma"
    #, ("Mexico"         , "mx")
    #, "my"
    #, "ng"
    , ("Netherlands"    , "nl")
    , ("Norway"         , "no")
    #, "nz"
    #, "ph"
    , ("Pakistan"       , "pk")
    #, "pl"
    #, "pt"
    #, "ro"
    #, "rs"
    , ("Russia"         , "ru")
    , ("Saudi Arabia"   , "sa")
    , ("South Africa"   , "za")
    , ("Sweden"         , "se")
    #, "sg"
    #, "si"
    #, "sk"
    #, "th"
    #, "tr"
    #, "tw"
    #, "ua"
    , ("United Kingdom" , "gb")
    , ("United States"  , "us")
    #, "ve"

]

# (label, value) pairs offered by the "Output Language" dropdowns.
# Each value is a two-letter language code (ISO 639-1).
language_choice = [
      ("Arabic"     , "ar")
    , ("Chinese"    , "zh")
    , ("Dutch"      , "nl")
    , ("English"    , "en")
    , ("French"     , "fr")
    , ("German"     , "de")
    , ("Hebrew"     , "he")
    , ("Italian"    , "it")
    , ("Norwegian"  , "no")
    , ("Portuguese" , "pt")     # label was misspelled "Portugese"
    , ("Russian"    , "ru")
    , ("Spanish"    , "es")
    , ("Swedish"    , "sv")
    , ("Urdu"       , "ur")     # was "ud", which is not an ISO 639-1 code
]

#pt = Portuguese
#sv = Swedish

### Global Functions

In [4]:
def _prepare_article_frame(article_records) :
    """Turn a list of article records into a cleaned, newest-first DataFrame.

    Steps (same order as the code this helper replaces):
      1. flatten the records with ``pd.json_normalize``;
      2. replace missing values with empty strings;
      3. drop every row that has a cell equal to the placeholder "[Removed]";
      4. sort by ``published_date`` descending and renumber the index from 0.

    An empty input, or one without a ``published_date`` column, is returned
    without sorting instead of raising ``KeyError``.
    """
    articles = pd.json_normalize(article_records)
    if articles.empty :
        return articles

    articles = articles.fillna("")

    # Compare cells directly rather than going through replace(..., None) + dropna().
    # NOTE(review): this also avoids depending on how the installed pandas
    # version treats an explicit None replacement value - confirm if relevant.
    has_removed_cell = articles.eq("[Removed]").any(axis=1)
    articles = articles.loc[~has_removed_cell]

    if "published_date" in articles.columns :
        articles = articles.sort_values(by="published_date", axis=0, ascending=False)

    return articles.reset_index(drop=True)


def get_list_of_articles(keyword = None, countries = None) :
    """Search for articles and return them as a cleaned, newest-first DataFrame.

    Parameters
    ----------
    keyword : search term forwarded as ``query`` to ``AIU.get_article_data``.
    countries : list of source-country codes; ``None`` (the default) or an
        empty list is forwarded as an empty list, as before.

    Returns
    -------
    pandas.DataFrame built from ``DAU.CreateArticleList(news)``; may be empty.
    """
    # A fresh list per call instead of a shared mutable default argument.
    countries   = [] if countries is None else countries
    news        = AIU.get_article_data(country = countries, language = None, query = keyword)

    return _prepare_article_frame(DAU.CreateArticleList(news))


def get_list_of_recommendations(keyword) :
    """Fetch recommended articles for ``keyword`` as a cleaned, newest-first DataFrame.

    The recommendations come from ``ARU.get_article_recommendations`` and go
    through the same clean-up as the regular search results.
    """
    news = ARU.get_article_recommendations(keyword)

    return _prepare_article_frame(DAU.CreateArticleList(news))

# Generates a list formatted in HTML
def generate_html_list(article_list_df) :
    """Render the articles in ``article_list_df`` as an HTML table.

    Parameters
    ----------
    article_list_df : DataFrame with the columns ``title``, ``author``,
        ``published_date``, ``url``, ``sentiment.label`` and ``image``.

    Returns
    -------
    str : ``html_css`` followed by one outer ``<table>`` holding one row per
    article.  Rows are numbered ``index + 1``.

    Notes
    -----
    * Article text comes from an external news source, so every value is
      HTML-escaped before it is inserted and the image URL is quoted.
    * A missing date is rendered as an empty line; a date that is not in ISO
      format is shown as received instead of raising.
    * ``author`` may be a list of names or a single string.
    """
    from html import escape     # local import keeps the block self-contained

    date_format = '%m/%d/%Y  %I:%M:%S %p'

    def as_text(value) :
        # Escape a value for use as HTML text content.
        return escape("" if value is None else str(value), quote=False)

    parts = [html_css, "<table>\n"]
    for index, article in article_list_df.iterrows() :
        title           = article["title"]
        author          = article["author"]
        publish_date    = article["published_date"]
        source          = article["url"]
        sentiment       = article["sentiment.label"]
        image           = article["image"]

        parts.append(f'\t<tr id="article_{index}" onclick="selectRow(this)"><td>\n')
        parts.append('\t<table><tr><td>')
        if isinstance(image, str) and image.strip() :
            image_url = escape(image.strip(), quote=True)
            parts.append(f'<img src="{image_url}" style="height:100px;object-fit:cover;" />')

        parts.append('</td><td>')
        parts.append(f"\t\t<h3>{index+1}. {as_text(title)}</h3>\n")
        parts.append("<div>")

        # Only format the date when there is one; fall back to the raw text
        # when it cannot be parsed.
        date_text = ""
        if isinstance(publish_date, str) :
            raw_date = publish_date.strip()
            if raw_date :
                try :
                    date_text = datetime.fromisoformat(raw_date.replace('Z', '')).strftime(date_format)
                except ValueError :
                    date_text = as_text(raw_date)
        elif hasattr(publish_date, "strftime") :
            date_text = publish_date.strftime(date_format)

        parts.append(f"\t\t{date_text}\n")
        parts.append("</div>")

        if isinstance(sentiment, str) and len(sentiment) > 0 :
            parts.append(f'<div style="float: right;">Sentiment: {as_text(sentiment)}</div>')

        parts.append('<br />')

        # Normalise the author field to a list so a plain string is not
        # joined character by character.
        if isinstance(author, str) :
            author_names = [author] if author.strip() else []
        elif isinstance(author, (list, tuple)) :
            author_names = [str(name) for name in author]
        else :
            author_names = []

        if len(author_names) > 0 :
            parts.append(f"\t\t{as_text(', '.join(author_names))}<br />\n")

        parts.append(f"\t\t{as_text(source)}\n")
        parts.append("\t\t</td></tr>\n")
        parts.append("\t</table>")
        parts.append("\t</td></tr>\n")

    parts.append("</table>")
    return "".join(parts)

# *Test Code*

In [21]:
# Manual check: fetch the recommendations for a sample keyword and display
# whatever ARU.get_article_recommendations returns.
countries   = ["us", "fr"]      # not referenced in this cell
keyword     = "israel"

recommend   = ARU.get_article_recommendations(keyword)
display(recommend)

"Live: Death toll tops 800 in Israel, nears 700 in Gaza\nLive: Gaza witnessing 'unprecedented human catastrophe', UN says\nLive: Gaza witnessing 'unprecedented human catastrophe', UN says\nGaza witnessing 'unprecedented human catastrophe', UN says\nGaza witnessing 'unprecedented human catastrophe', UN says\nNetanyahu says Israeli bombardment of Gaza 'only the beginning'\nNetanyahu says Israeli bombardment of Gaza 'only the beginning'\nIsrael pushes deeper into Gaza as UN estimates 15,000 flee South\nAs Israel-Hamas war reaches 100-day mark, here's the conflict by numbers"

In [22]:
# Manual check of the search pipeline: first the raw payload as a frame,
# then the cleaned, sorted frame produced by get_list_of_articles.
countries   = ["us", "fr"]
keyword     = "israel"

news        = AIU.get_article_data(country = countries, language = None, query = keyword)

articles_df = pd.json_normalize(news["news"])
display(articles_df.head())

# An empty country list equals the helper's default, so one call covers
# both the "no countries" and the "some countries" case.
headlines   = get_list_of_articles(keyword=keyword, countries=countries)

display(headlines.head())


Getting article data for query: israel
{'text': 'israel', 'source-countries': ['us', 'fr'], 'earliest-publish-date': datetime.datetime(2024, 4, 15, 17, 51, 20, 449500)}


Unnamed: 0,id,title,text,url,image,publish_date,author,authors,language,source_country,sentiment
0,208699020,Pritzker discusses Democratic National Convent...,In the wake of Monday's protests that shut dow...,https://www.nbcchicago.com/news/local/pritzker...,https://media.nbcchicago.com/2023/02/GettyImag...,2024-04-15 00:00:00,NBC Chicago Staff,[NBC Chicago Staff],en,us,-0.231
1,208699040,Rescued dogs from West Bank available for adop...,"Five dogs, rescued from the West Bank, are ava...",https://www.nbcchicago.com/news/local/rescued-...,https://media.nbcchicago.com/2024/04/image-63....,2024-04-15 00:00:00,Kate Chappell,[Kate Chappell],en,us,-0.186
2,208720192,Iran and Israel have a history of enmity. What...,Iran's dramatic aerial attack on Israel follow...,https://www.nbcwashington.com/news/national-in...,https://media.nbcwashington.com/2024/04/AP2410...,2024-04-15 00:00:00,Associated Press,[Associated Press],en,us,0.161
3,208709570,Iran attack puts pressure on US House speaker ...,"The US House speaker, Mike Johnson, has said h...",https://www.theguardian.com/us-news/2024/apr/1...,https://i.guim.co.uk/img/media/80b30137edf7388...,2024-04-15 01:07:23,Guardian staff and agencies,[Guardian staff and agencies],en,us,-0.113
4,208715848,Lessons From Gaza’s Most Vulnerable: Understan...,There are as many ways to survive war as those...,https://warontherocks.com/2024/04/lessons-from...,https://warontherocks.com/wp-content/uploads/2...,2024-04-15 07:45:37,Austin Knuppe,[Austin Knuppe],en,us,0.224


Getting article data for query: israel
{'text': 'israel', 'source-countries': ['us', 'fr'], 'earliest-publish-date': datetime.datetime(2024, 4, 15, 17, 51, 25, 101404)}


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Unnamed: 0,title,author,content,url,published_date,image,sentiment.label,sentiment.score
0,Racist Church's Leader Helps Vet GOP Candidate...,[Ryan Krull],"Depending on how a court case shakes out, anyo...",https://www.riverfronttimes.com/news/racist-ch...,2024-04-15 11:00:00,https://media2.riverfronttimes.com/riverfrontt...,NEGATIVE,0.970905
1,Reflecting on Sudan’s Civil War One Year Later,"[Amel Marhoum, Sara Bedri]",More Must-Reads From TIME Dua Lipa Manifested ...,https://time.com/6966065/sudan-civil-war-anniv...,2024-04-15 11:00:00,https://api.time.com/wp-content/uploads/2024/0...,POSITIVE,0.991577
2,Japanese Court Dismisses Death Row Inmates’ La...,[Chad de Guzman],More Must-Reads From TIME Dua Lipa Manifested ...,https://time.com/6966857/japan-death-penalty-i...,2024-04-15 10:55:00,https://api.time.com/wp-content/uploads/2024/0...,NEGATIVE,0.986764
3,Kyrgyzstan Adopts Law Targeting Foreign-Funded...,[Colleen Wood],"On April 2, Kyrgyzstan’s President Sadyr Japar...",https://thediplomat.com/2024/04/kyrgyzstan-ado...,2024-04-15 10:46:00,https://thediplomat.com/wp-content/uploads/202...,NEGATIVE,0.587906
4,Climate protesters arrested in street near Kam...,"[Jenny Jarvie, Rebecca Ellis, Terry Castleman]","Demanding more action on the climate crisis, y...",https://www.latimes.com/california/story/2024-...,2024-04-15 10:00:47,https://ca-times.brightspotcdn.com/dims4/defau...,NEGATIVE,0.986235


### Gradio : Layout / Subroutines / Application

In [23]:
# Create Layout of Gradio form
#
# Three collapsible sections are stacked below the search parameters:
# "Search Results", "Additional Reading" and "Article".  They start hidden and
# are revealed by the event handlers.
#
# No theme is passed, so Gradio's default theme is used (an empty string is
# not a theme name).  Theme names noted earlier for reference:
# Interstellar, Base, dracula_revamped, storj_theme, minimalist, calm_seafoam
with gr.Blocks(fill_height=True) as app:
    with gr.Accordion("Parameters", open=True) as srch_parameters:
        with gr.Row() :
            text_keyword        = gr.Textbox(
                label           = "Search Term"
              , placeholder     = "Enter your search term here"
            )

        with gr.Row() :
            text_source         = gr.CheckboxGroup(
                choices         = country_choice
              , label           = "Source Country (Leave unchecked for ALL)"
            )

        submit_btn = gr.Button("Search")

    with gr.Accordion("Search Results", open=False, visible=False) as srch_results:
        with gr.Row() :
            with gr.Column(scale=2) :
                with gr.Tab("Article List") :
                    html_results    = gr.HTML()

                with gr.Tab("HTML Source") :
                    html_code       = gr.TextArea()

                with gr.Tab("Data") :
                    article_list    = gr.Dataframe()

            with gr.Column(scale=1) :
                # Articles are listed starting at 1, so 1 is the smallest valid number.
                article_id          = gr.Number(
                      label         = "Article Number to Retrieve:"
                    , minimum       = 1
                    , precision     = 0
                )
                text_language       = gr.Dropdown(
                      choices       = language_choice
                    , value         = "en"
                    , interactive   = True
                    , label         = "Output Language"
                )

                retrieve_btn        = gr.Button("Retrieve Article")#, size="sm")
                recommendation_btn  = gr.Button("Get Additional Recommendations")#, size="sm")

                plot_setiment_over_time = gr.Plot(
                      label     = "Sentiment Over Time"
                )

                plot_setiment_dist = gr.Plot(
                      label     = "Sentiment Distribution"
                )

    with gr.Accordion("Additional Reading", open=False, visible=False) as recommendations:
        with gr.Row() :
            with gr.Column(scale=2) :
                with gr.Tab("Article List") :
                    html_addl_results   = gr.HTML()

                with gr.Tab("HTML Source") :
                    html_addl_code      = gr.TextArea()

                with gr.Tab("Data") :
                    article_addl_list   = gr.Dataframe()

            with gr.Column(scale=1) :
                # Same numbering rule as for the main result list.
                article_addl_id     = gr.Number(
                      label         = "Article Number to Retrieve:"
                    , minimum       = 1
                    , precision     = 0
                )
                text_addl_language  = gr.Dropdown(
                      choices       = language_choice
                    , value         = "en"
                    , interactive   = True
                    , label         = "Output Language"
                )
                retrieve_more_btn = gr.Button("Retrieve Article")#, size="sm")

    with gr.Accordion("Article", open=False, visible=False) as srch_article:
        with gr.Row() :
            html_article    = gr.TextArea(
                  elem_id   = "Chosen_Article"
                , label     = ""
                #, visible   = False
            )

    ## These functions need to be defined inside the Gradio Blocks context
    ## so that they can control the Gradio components in the form
    
    # Search button function
    def submit_search(keyword, countries=None) :
        """Run a search and fill the "Search Results" section.

        Parameters
        ----------
        keyword : search term from the "Search Term" textbox.
        countries : list of selected source-country codes; ``None`` or an
            empty list means no country restriction is passed on.

        Returns
        -------
        dict mapping output components to their new values: the collapsed
        parameter accordion (relabelled with the search parameters), the
        opened results accordion, the HTML list (rendered and as source), the
        article frame and the two sentiment figures.

        Raises
        ------
        gr.Error : when the search returns no articles, so that the user gets
            a message instead of a ``KeyError`` from the plotting code.
        """
        countries = list(countries or [])

        # Get the top headlines for the search term and the selected countries.
        # An empty list is what the helper uses by default, so no branch is needed.
        headlines = get_list_of_articles(keyword=keyword, countries=countries)
        if headlines.empty :
            raise gr.Error("No articles were found for this search.")

        html_content        = generate_html_list(headlines)
        params              = f"Parameters  (Search Term: {keyword}, Source Countries: {', '.join(countries).upper()})" #Language: {language}, 

        # Figures are created directly rather than through plt.figure(), so
        # pyplot does not keep a reference to a new pair of figures after
        # every search.
        fig1 = plt.Figure()
        ax1  = fig1.subplots()
        ax1.plot(headlines['published_date'], headlines['sentiment.score'], marker='o', linestyle='-')
        ax1.set_title('Sentiment Score Over Time')
        ax1.set_xlabel('Date')
        ax1.set_ylabel('Sentiment Score')

        fig2 = plt.Figure()
        ax2  = fig2.subplots()
        sentiment_counts = headlines['sentiment.label'].value_counts()
        # Colour by label, not by position: value_counts() orders by frequency,
        # so a fixed colour list would swap colours between searches.
        label_colors = {"NEGATIVE": "red", "POSITIVE": "green"}
        pie_colors   = [label_colors.get(str(label).upper(), "blue") for label in sentiment_counts.index]
        ax2.pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%', colors=pie_colors, startangle=140)
        ax2.set_title('Sentiment Distribution')
        ax2.axis('equal')

        return {
              srch_parameters           : gr.Accordion(open=False   , label=params)
            , srch_results              : gr.Accordion(open=True    , visible=True)
            , html_results              : html_content
            , html_code                 : html_content
            , article_list              : headlines
            , plot_setiment_over_time   : fig1
            , plot_setiment_dist        : fig2
        }
    
    # Handler for the "Get Additional Recommendations" button
    def get_recommendations(keyword) :
        """Load recommendations for ``keyword`` into the "Additional Reading" section.

        Returns a dict of component updates: the parameter and result
        accordions are collapsed, the recommendation accordion is opened, and
        its HTML list, HTML source and data table are filled.
        """
        recommended_df  = get_list_of_recommendations(keyword)
        rendered_list   = generate_html_list(recommended_df)

        updates = {}
        updates[srch_parameters]    = gr.Accordion(open=False)
        updates[srch_results]       = gr.Accordion(open=False, visible=True)
        updates[recommendations]    = gr.Accordion(open=True, visible=True)
        updates[html_addl_results]  = rendered_list
        updates[html_addl_code]     = rendered_list
        updates[article_addl_list]  = recommended_df
        return updates

    # Retrieve Article Button function 
    def retrieve_article(id, articles_df, language=None) :
        """Show the full text of the article with the given list number.

        Parameters
        ----------
        id : 1-based article number, i.e. the number shown in front of the
            title in the article list.
        articles_df : DataFrame behind that list; needs the columns ``title``
            and ``content``.
        language : selected output language.  Accepted but not used while the
            translation code below is disabled.

        Returns
        -------
        dict of component updates: the parameter and result accordions are
        collapsed, the article accordion is opened, and the text area shows
        the article text under a label with its number and headline.

        Raises
        ------
        gr.Error : when there is no article list yet or the number is outside
            ``1 .. len(articles_df)``.  Without this check, 0 would silently
            select the last article through ``iloc[-1]``.
        """
        
        # Prepare translation model and tokenizer
        #model_name          = "facebook/mbart-large-50-many-to-many-mmt"
        #model               = MBartForConditionalGeneration.from_pretrained(model_name)
        #tokenizer           = MBart50TokenizerFast.from_pretrained(model_name, max_length=512)
        #
        #tokenizer.src_lang  = language + "_" + language.upper
  
        # Retrieve article headline and translate
        #article_headline = ATU.TranslateArticleText(
        #      articles_df.iloc[id-1]["title"]
        #    , language
        #    , model
        #)

        ## Retrieve article text and translate
        #article_text = ATU.TranslateArticleText(
        #      articles_df.iloc[id-1]["content"]
        #    , language
        #    , model
        #)

        has_articles = (
            articles_df is not None
            and len(articles_df) > 0
            and "title" in articles_df
            and "content" in articles_df
        )
        if not has_articles :
            raise gr.Error("Run a search before retrieving an article.")

        article_total = len(articles_df)
        if id is None or not 1 <= int(id) <= article_total :
            raise gr.Error(f"Article number must be between 1 and {article_total}.")

        article_number      = int(id)
        selected            = articles_df.iloc[article_number - 1]
        article_headline    = selected["title"]
        article_text        = selected["content"]

        # Create label for article display; uses the same number as the list.
        params = f"Article: {article_number}. {article_headline}"

        return {
              srch_parameters   : gr.Accordion(open=False)
            , srch_results      : gr.Accordion(open=False)
            , srch_article      : gr.Accordion(open=True, visible=True)
            , html_article      : gr.TextArea(value=article_text, label=params)
        }    
    
    # Connect the buttons to their handlers.  Every component a handler
    # returns in its dict must be listed in `outputs`.
    submit_btn.click(
        fn        = submit_search
      , inputs    = [text_keyword, text_source]
      , outputs   = [srch_parameters, srch_results, html_results, html_code, article_list, plot_setiment_over_time, plot_setiment_dist]
    )

    recommendation_btn.click(
        fn        = get_recommendations
      , inputs    = [text_keyword]
      , outputs   = [srch_parameters, srch_results, recommendations, html_addl_results, html_addl_code, article_addl_list]
    )

    retrieve_btn.click(
        fn        = retrieve_article
      , inputs    = [article_id, article_list, text_language]
      , outputs   = [srch_parameters, srch_results, srch_article, html_article]
    )

    # The "Retrieve Article" button of the "Additional Reading" section had no
    # handler; it reuses retrieve_article with that section's own inputs.
    retrieve_more_btn.click(
        fn        = retrieve_article
      , inputs    = [article_addl_id, article_addl_list, text_addl_language]
      , outputs   = [srch_parameters, srch_results, srch_article, html_article]
    )

# Start the Gradio app built above.
# NOTE(review): show_error=True is expected to display handler errors in the
# browser UI - confirm against the Gradio `launch` documentation.
app.launch(show_error=True)




Running on local URL:  http://127.0.0.1:7872

To create a public link, set `share=True` in `launch()`.




Getting article data for query: Trump
{'text': 'Trump', 'source-countries': ['us', 'gb', 'au'], 'earliest-publish-date': datetime.datetime(2024, 4, 15, 17, 54, 30, 72592)}


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Traceback (most recent call last):
  File "c:\Users\Joe\anaconda3\envs\Project3\Lib\site-packages\gradio\queueing.py", line 527, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\Project3\Lib\site-packages\gradio\route_utils.py", line 261, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\Project3\Lib\site-packages\gradio\blocks.py", line 1788, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\Project3\Lib\site