# Find AI NEWS 

In [37]:
AI_SEARCH_QUERIES = [
    ["New AI model", "AI model release", "AI model launch", "AI model update", "AI model upgrade", "AI model release date"],
    ["AI open source", "HuggingFace", "AI Github", "AI Llama"],
    ["AI breakthrough", "AI innovation"],
    ["AI Tool", "AI feature", "AI software", "AI product", "AI service", "AI platform", "AI API", "AI application", "AI project"],
    ["AI Assistant", "AI Personal Assistant"],
    ["AI report", "AI book", "AI journal", "AI conference", "AI event", "AI competition", "AI challenge", "AI hackathon", "AI workshop"],
    ["AI interview", "AI debate", "AI forum"],
    ["AI business", "AI profession", "AI career", "AI job", "AI work", "AI employment", "AI salary", "AI income", "AI wage", "AI money"],
    ["Generative AI"],
    ["OpenAI", "GPT"],
    ["Anthropic AI", "Claude AI"],
    ["Gemini AI", "Google AI", "Deepmind"],
    ["Mistral AI"],
    ["Cohere AI"],
    ["Meta AI"],
    ["Perplexity AI"],
    ["Microsoft AI", "Microsoft Copilot"],
    ["AI Powered device"],
    ["AI Robotics", "AI humanoid", "AI robot", "AI robots"],
    ["AI research", "AI research paper", "AI research lab", "AI researcher"],
    ["AI Agents", "AI Agent", "Autonomous AI", "AI Collaboration"],
    ["LLM", "Large Language Model", "LLM Tokens", "LLM Context window"],
    ["LLM Prompt", "LLM Prompting", "Prompt Engineering", "Prompt Tuning", "Prompt Evaluation", "Zero-shot LLM", "Few-shot LLM"],
    ["LLM Training", "LLM Training data", "LLM Training cost"],
    ["AI Dataset", "AI Datasets"],
    ["AI Transformer", "AI Mamba"],
    ["LLM Mobile", "LLM Phone", "Embedded LLM", "LLM on device"],
    ["Small Language Model", "Small LLM", "AI SLM"],
    ["AI Langchain", "AI Framework", "AI Library", "CrewAI", "LlamaIndex"],
    ["AI Chatbot"],
    ["low code AI", "no code AI"],
    ["AI blockchain"],
    ["AI Regulations"],
    ["AI Intel", "AI groq", "AI AMD", "AI ARM", "AI Tesla Chips", "AI Tesla Dojo"],
    ["AI Hardware", "AI Chip demand", "AI chip competition"],
    ["AI Nvidia"],
    ["AI Apple"],
    ["AI Tesla", "AI Elon Musk"],
    ["Twitter AI", "AI Grok", "xAI"],
    ["AI Car", "AI self driving car", "AI autonomous car", "AI Waymo", "AI Cruise", "AI Uber", "AI Lyft"],
    ["AI drone"],
    ["AI Military"],
    ["AI Competition"],
    ["AI Startup", "AI Startups"],
    ["AI funding", "AI investment", "AI venture capital", "AI acquisition"],
    ["AI Avatar", "AI content generation", "AI content creation", "AI social media", "AI influencer"],
    ["AI Adobe"],
    ["AI tiktok", "AI Instagram", "AI Facebook", "AI Twitter", "AI Snapchat", "AI Youtube", "AI Reddit", "AI Pinterest", "AI Linkedin"],
    ["AI Problem solving", "AI Reasoning", "AI planning"],
    ["AI Cloud", "AI Cloud computing", "AI Cloud service", "AI Cloud platform"],
    ["AI AWS", "AI Azure", "AI Google Cloud"],
    ["AI Virtual Reality", "AI VR", "AI AR", "AI Augmented Reality"],
    ["AI Video game", "AI Gaming", "AI Game"],
    ["AI Education", "AI Learning", "AI School", "AI University"],
    ["Explainable AI", "AI interpretability", "AI explainability"],
    ["AI Image recognition", "Multimodal LLM"],
    ["AI Image generation", "Midjourney", "OpenAI Dalle", "OpenAI Dall-e", "Stable diffusion"],
    ["AI Video generation", "AI Video", "OpenAI Sora", "AI Video Analysis", "Google AI Lumiere"],
    ["AI Music generation", "AI Music"],
    ["AI Voice generation", "AI Voice synthesis", "AI voice recognition"],
    ["AI Scandal", "AI Controversy", "AI trial", "AI lawsuit"],
    ["AI Hack", "AI Cybersecurity", "AI Attack", "AI Jailbreak"],
    ["AI Deepfake", "AI Deepfakes", "AI Deepfake detection", "AI Deepfake creation"],
    ["AI detector", "AI detection", "AI detection tool", "AI detection software"],
    ["AI Coding", "AI Programming", "Github Copilot", "AI Github", "AI Autonomous coding"],
    ["AI AGI", "AI ASI", "AI Singularity", "AI Superintelligence", "AI SSI"],
    ["AI Consciousness", "AI Sentience", "AI Emotion", "AI Creativity", "AI Art"],
    ["AI Turing test", "AI Benchmark", "AI Evaluation"],
    ["AI Brain", "AI Brain chip"],
    ["Sam Altman", "Elon Musk AI", "Yann LeCun", "Ilya Sutskever", "Stuart Russell", "Demis Hassabis", "Jensen Huang"],
    ["AI Economics", "AI Strategy", "AI Geopolitics"],
    ["AI Power Usage", "AI Energy consumption"],
    ["AI healthcare"]
]

# TODO : localised searches "AI China | IA France | AI Germany | AI UK | AI US | AI Russia | AI India | AI Japan | AI Korea | AI Canada | AI Australia",

# Perform the search

In [38]:
%pip install -q duckduckgo-search[lxml] tqdm pytz

from duckduckgo_search import DDGS
from pprint import pprint

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
from tqdm import tqdm
import time
from datetime import datetime
import pytz


SLEEP_TIME = 0.2
MAX_RESULTS = 30
TIME_LIMIT = "d"

duplicates = []

def perform_search(queries:list[str]):
    results = {}
    ddgs = DDGS()

    with tqdm(total=len(queries), desc="Searching") as pbar:
        for query in queries:
            pbar.set_postfix(query=query)
            search_results = ddgs.news(query, max_results=MAX_RESULTS, timelimit=TIME_LIMIT)
            for result in search_results:
                if result['url'] not in results:
                    result['found_at'] = datetime.now(pytz.utc).isoformat()
                    results[result['url']] = result
                else:
                    duplicates.append(result)
            time.sleep(SLEEP_TIME)
            pbar.update(1)
    
    return results


results = perform_search([query for queries in AI_SEARCH_QUERIES for query in queries])

print(f"Found {len(results)} unique results and {len(duplicates)} duplicates")

Searching: 100%|██████████| 244/244 [02:30<00:00,  1.62it/s, query=AI healthcare]        

Found 2160 unique results





### Convert the dates to datetime objects

In [40]:
for result in results.values():
    if isinstance(result['date'], str):
        result['date'] = datetime.fromisoformat(result['date'])
    if isinstance(result['found_at'], str):
        result['found_at'] = datetime.fromisoformat(result['found_at'])

assert all(isinstance(result['date'], datetime) for result in results.values())
assert all(isinstance(result['found_at'], datetime) for result in results.values())

# MongoDB Export

In [None]:
%pip install -q pymongo

In [41]:
from dotenv import load_dotenv
from pymongo import MongoClient
import os

load_dotenv()

client = MongoClient(os.getenv("MONGODB_URI"))

db = client["blogdb"]

collection = db["ai_news"]

In [42]:
from pymongo.errors import BulkWriteError

assert all(isinstance(result['date'], datetime) for result in results.values())
assert all(isinstance(result['found_at'], datetime) for result in results.values())


try:
    result = collection.insert_many(results.values(), ordered=False)
    print(f"Inserted {len(result.inserted_ids)} documents")
except BulkWriteError as e:
    print(f"Inserted {e.details['nInserted']} documents")
    print(f"Encountered {len(e.details['writeErrors'])} errors") # number of duplicates


Inserted 2108 documents
Encountered 52 errors


# File Export 

In [6]:
from datetime import date

### CSV

In [7]:
import pandas as pd

file_name = f"ai_news_{date.today().isoformat()}.csv"

df = pd.DataFrame(results.values())

df.to_csv(file_name, index=False)


### JSON

In [49]:
from json import dumps


file_name = f"ai_news_{date.today().isoformat()}.json"

with open(file_name, "w") as f:
    f.write(dumps(results, indent=4))
