In [1]:
# pip install GoogleNews

In [2]:
# Import GoogleNews package

from GoogleNews import GoogleNews
gnews_ex = GoogleNews()

In [3]:
# Check and set initial configuration

print(gnews_ex.getVersion()) # version will appear if the package was installed/imported successfully
gnews_ex.enableException(True) # Setting exceptions to show
gnews_ex = GoogleNews(lang='en') # Setting English language
gnews_ex.set_encode('utf-8') # Setting encoding

1.6.8


In [9]:
# Defining a method for easy usability throughout and in future
# A good practice to abstract the underlying functionality and only keep direct usable interfaces based on the objective

import pandas as pd

def getNewsAsDf(searchQuery, periodInDays):
    gnews_ex.set_period(periodInDays)    
    gnews_ex.search(searchQuery)    
    newsRes = gnews_ex.result()
    newsDf = pd.DataFrame(newsRes)
    gnews_ex.clear() # according the documentation, it is recommended to clear the list before another search
    return newsDf

def getRawNews(searchQuery, periodInDays):
    gnews_ex.set_period(periodInDays)    
    gnews_ex.search(searchQuery)    
    newsRes = gnews_ex.result()
    return newsRes

In [25]:
# Using the defined method, providing with searchQuery and perid in number of days

dfNews = getNewsAsDf('AI as anchor', '7d')
dfNews.head()

Unnamed: 0,title,media,date,datetime,desc,link,img
0,"Tim Cook covers Vision Pro, ChatGPT AI, and mo...",BGR,7 hours ago,2023-06-06 22:04:41.429497,"Apple's CEO Tim Cook announced the Vision Pro,...",https://bgr.com/tech/tim-cook-covers-vision-pr...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////..."
1,Watch: Evening News Headlines From Aaj Tak AI ...,YouTube,7 hours ago,2023-06-06 22:04:41.430496,The Evening News Headlines From Aaj Tak AI Anc...,https://www.youtube.com/watch?v=f4Gmk1kzy6w,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////..."
2,Watch: International Headlines From Aaj Tak AI...,YouTube,1 hours ago,2023-06-07 04:04:41.431496,International Headlines From Aaj Tak AI Anchor...,https://www.youtube.com/watch?v=C7c--FLbgGg,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////..."
3,This AI tool may let news reporters get back t...,Straight Arrow News,1 day ago,2023-06-06 05:04:41.442495,HeyWire AI is developing a self-prompting AI n...,https://straightarrownews.com/cc/ai-tool-may-l...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////..."
4,This AI tool may let news reporters get back t...,Brentwood Press,1 day ago,2023-06-06 05:04:41.454495,HeyWire AI is developing a self-prompting AI n...,https://www.thepress.net/news/national/this-ai...,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////..."


## NLTK

In [6]:
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shanover\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shanover\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Shanover\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Shanover\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [11]:
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

In [23]:
newsResults = getRawNews('AI as anchor','7d') # had to modify the query string a bit to get more news

titles = []
for item in newsResults:
    title = item.get('title')
    if title and title not in titles: # check if the title has already been added or not (because I found repetitive news)
        titles.append(title)
        
print(titles)

['Tim Cook covers Vision Pro, ChatGPT AI, and more in new interview', 'Watch: Evening News Headlines From Aaj Tak AI Anchor Sana', 'Watch: International Headlines From Aaj Tak AI Anchor Sana', "This AI tool may let news reporters get back to journalism 'out there in the world'", 'AI journalism in India: Did India enter a new era of broadcasting?', 'Trump shared a disturbing AI video of gay CNN anchor Anderson Cooper', 'Watch: Afternoon News Headlines From Aaj Tak AI Anchor Sana', 'Beijing launches campaign against AI-generated misinformation', 'Trump shares AI-altered fake clip of Anderson Cooper', 'China cracks down on AI-generated news anchors', 'Who is Sana AI Anchor Robot?', 'How AI is affecting the future of journalism | DW News', 'Afternoon News Headlines From Aaj Tak AI Anchor Sana | WATCH']


In [26]:
text = ' '.join(titles) # convert list to a string, so as to feed to the tokenizer in the next step

nltk_results = ne_chunk(pos_tag(word_tokenize(text)))
for nltk_result in nltk_results:
    if type(nltk_result) == Tree:
        name = ''
        for nltk_result_leaf in nltk_result.leaves():
            name += nltk_result_leaf[0] + ' '
        print ('Type: ', nltk_result.label(), 'Name: ', name)

Type:  PERSON Name:  Tim 
Type:  ORGANIZATION Name:  Cook 
Type:  PERSON Name:  Vision Pro 
Type:  ORGANIZATION Name:  ChatGPT 
Type:  PERSON Name:  Watch 
Type:  ORGANIZATION Name:  News 
Type:  PERSON Name:  Sana Watch 
Type:  ORGANIZATION Name:  International Headlines 
Type:  GPE Name:  India 
Type:  PERSON Name:  Trump 
Type:  ORGANIZATION Name:  CNN 
Type:  PERSON Name:  Anderson Cooper Watch 
Type:  PERSON Name:  Afternoon News Headlines 
Type:  PERSON Name:  Sana Beijing 
Type:  ORGANIZATION Name:  Trump 
Type:  PERSON Name:  Anderson Cooper China 
Type:  PERSON Name:  Sana AI Anchor Robot 
Type:  PERSON Name:  Sana 
