### Setup
Imports core libs (async, I/O, dates) plus `cognee`, `praw` (Reddit), and `feedparser`.  
Loads secrets from `.env` to configure APIs and runtime.


In [None]:
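# 🔐 Environment Variables Setup
# Put these in a `.env` file; it is loaded by `load_dotenv()` in the next cell.

# LLM_API_KEY=your_llm_api_key_here
# TAVILY_API_KEY=your_tavily_api_key_here
# REDDIT_CLIENT_ID=your_reddit_client_id_here
# REDDIT_CLIENT_SECRET=your_reddit_client_secret_here
# REDDIT_USER_AGENT=your_app_name_or_username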

In [2]:
# Import all necessary libraries
import asyncio
import os
import re
import tempfile
from datetime import datetime, timedelta, timezone
from pathlib import Path

import cognee
import feedparser
import praw
from dotenv import load_dotenv
from openai import AsyncOpenAI
from openai import OpenAI

# Load environment variables from .env file
# (later cells read the API credentials through os.getenv)
load_dotenv()


[2m2025-11-13T16:22:32.343750[0m [[32m[1minfo     [0m] [1mDeleted old log file: /opt/homebrew/lib/python3.11/site-packages/logs/2025-10-29_12-28-05.log[0m [[0m[1m[34mcognee.shared.logging_utils[0m][0m

[2m2025-11-13T16:22:32.686294[0m [[32m[1minfo     [0m] [1mLogging initialized           [0m [[0m[1m[34mcognee.shared.logging_utils[0m][0m [36mcognee_version[0m=[35m0.3.6[0m [36mdatabase_path[0m=[35m/opt/homebrew/lib/python3.11/site-packages/cognee/.cognee_system/databases[0m [36mgraph_database_name[0m=[35m[0m [36mos_info[0m=[35m'Darwin 24.6.0 (Darwin Kernel Version 24.6.0: Mon Jul 14 11:30:40 PDT 2025; root:xnu-11417.140.69~1/RELEASE_ARM64_T8132)'[0m [36mpython_version[0m=[35m3.11.13[0m [36mrelational_config[0m=[35mcognee_db[0m [36mstructlog_version[0m=[35m25.4.0[0m [36mvector_config[0m=[35mlancedb[0m

[2m2025-11-13T16:22:32.687450[0m [[32m[1minfo     [0m] [1mDatabase storage: /opt/homebrew/lib/python3.11/site-packages/cognee

True

### Clean Slate
Wipes previously stored documents, nodes, and relationships. Only use this when you want a fresh, clean memory.  
Also clears system-level caches/metadata (indexes, config history) to ensure a fresh run.


In [3]:
if True:
    # delete stored data (documents, nodes, relationships)
    await cognee.prune.prune_data()
    print("Data pruned.")

    # clear system-level caches and metadata (indexes, config history, etc.)
    await cognee.prune.prune_system(metadata=True)
    print("Metadata pruned.")


[2m2025-11-13T16:22:33.682749[0m [[32m[1minfo     [0m] [1mLoaded JSON extension         [0m [[0m[1m[34mcognee.shared.logging_utils[0m][0m

[2m2025-11-13T16:22:33.701635[0m [[32m[1minfo     [0m] [1mDeleted Kuzu database files at /opt/homebrew/lib/python3.11/site-packages/cognee/.cognee_system/databases/cognee_graph_kuzu[0m [[0m[1m[34mcognee.shared.logging_utils[0m][0m


Data pruned.



[2m2025-11-13T16:22:36.363323[0m [[32m[1minfo     [0m] [1mDatabase deleted successfully.[0m [[0m[1m[34mcognee.shared.logging_utils[0m][0m

[1mStorage manager absolute path: /opt/homebrew/lib/python3.11/site-packages/cognee/.cognee_cache[0m

[1mDeleting cache...             [0m

[1m‚úì Cache deleted successfully! [0m


Metadata pruned.


### Ingest Reddit
Initializes Reddit API via PRAW using env credentials.  
Fetches latest posts from selected subreddits, rates each post's catchiness (1–10) with an LLM, and writes them to temp files.  
Adds those files to Cognee for ingestion, then removes the temp files.


In [4]:
# Set to False to skip Reddit ingestion entirely (the whole cell is guarded by it).
INGEST_REDDIT = True

if INGEST_REDDIT:
    #########################################################
    # Add reddit data to cognee pipeline
    #########################################################

    # --- Initialize Reddit API ---
    # Credentials come from the environment; only the user agent has a default.
    reddit = praw.Reddit(
        client_id=os.getenv("REDDIT_CLIENT_ID"),
        client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
        user_agent=os.getenv("REDDIT_USER_AGENT", "stay_on_top_of_news/1.0")
    )

    print("Reddit API initialized.")


    async def get_reddit_posts(subreddit_name, limit=25):
        """Fetch the newest posts of a subreddit and format them as one text blob.

        Every post is sent to the LLM (``gpt-4o-mini``) for a 1-10 "catchiness"
        rating and then rendered as a labelled record (title, subreddit, author,
        score, creation time, URL, catchiness, content).

        Args:
            subreddit_name: Subreddit name without the ``r/`` prefix.
            limit: Maximum number of posts requested via ``subreddit.new``.

        Returns:
            str: The formatted records joined with newlines; ``""`` if the
            listing yields no posts.

        Raises:
            Exception: Errors raised by PRAW or by the OpenAI request are not
            caught here and propagate to the caller.
        """
        # Neutral rating used when the LLM reply contains no integer at all.
        fallback_score = 5

        # One client for the whole batch instead of a new one per post.
        client = AsyncOpenAI(api_key=os.getenv("LLM_API_KEY"), timeout=5.0)

        # NOTE(review): `reddit` is the synchronous praw client from the enclosing
        # cell, so iterating the listing presumably blocks the event loop - confirm
        # whether that matters for this notebook.
        subreddit = reddit.subreddit(subreddit_name)
        posts_text = []

        for post in subreddit.new(limit=limit):
            # Evaluate catchiness using LLM
            content = post.selftext if post.selftext else '[Link post - no text content]'
            prompt = f"Rate this news catchiness (1-10). Catchiness = controversial + attention-capturing + timeless + major breakthrough.\n\nTitle: {post.title}\nSummary: {content[:500]}\n\nReturn ONLY integer 1-10."

            resp = await client.chat.completions.create(model="gpt-4o-mini", messages=[{"role": "user", "content": prompt}], max_tokens=5)

            # The model does not always return a bare integer ("7/10", "Score: 7",
            # or no content); take the first number and clamp it to 1..10 instead
            # of letting int() abort the whole subreddit.
            reply = resp.choices[0].message.content or ""
            rating = re.search(r"\d+", reply)
            if rating:
                score = min(10, max(1, int(rating.group())))
            else:
                print(f"  No numeric rating in LLM reply {reply!r}; using {fallback_score}")
                score = fallback_score

            # created_utc is a UTC epoch; render it as timezone-aware UTC so the
            # text does not depend on the machine's local timezone.
            post_content = f"""
                            Title: {post.title}
                            Subreddit: r/{subreddit_name}
                            Author: u/{post.author.name if post.author else '[deleted]'}
                            Score: {post.score}
                            Created: {datetime.fromtimestamp(post.created_utc, tz=timezone.utc)}
                            URL: {post.url}
                            Catchiness Score: {score}/10

                            Content:
                            {content}

                            ---
                            """
            posts_text.append(post_content)

        return "\n".join(posts_text)


    # Create temporary files for each subreddit's posts and define subreddits
    # Only the uncommented names are fetched; uncomment an entry to include it.
    subreddits = [
        # "MachineLearning",
        "OpenAI",
        # "artificial",
        # "LocalLLaMA",
        # "AIMemory",
        # "GraphRAG",
        # "ollama",
        # "LLMDevs",
        # "Rag",
        # "datascience",
        # "ai",
        # "ArtificialInteligence",
        # "deepLearning",
        # "AI_Agents",
        # "ChatGPT",
        # "Singularity",
        # "StableDiffusion",
        # "Midjourney",
        # "generative",
        # "aipromptprogramming",
        # "aiart"
    ]
    # Paths of the temp files written per subreddit (filled by the fetch loop)
    temp_files = []

    for sub in subreddits:
        try:
            print(f"Fetching posts from r/{sub}...")
            posts_content = await get_reddit_posts(sub, limit=25)
            
            # Create a temporary file with the posts
            temp_file = tempfile.NamedTemporaryFile(
                mode='w', 
                suffix=f'_{sub}_posts.txt',
                delete=False,
                encoding='utf-8'
            )
            temp_file.write(posts_content)
            temp_file.close()
            temp_files.append(temp_file.name)
            print(f"  ‚úì Fetched and saved posts from r/{sub}")
        except Exception as e:
            print(f"  ‚ö†Ô∏è  Error fetching r/{sub}: {type(e).__name__} - {str(e)}")
            print(f"  ‚Üí Skipping subreddit and continuing...")


    # --- Add Reddit posts to cognee ---
    await cognee.add(temp_files)
    print("‚úì Reddit posts added to cognee")

    # Clean up temporary files
    for temp_file in temp_files:
        try:
            Path(temp_file).unlink()
        except:
            pass


Reddit API initialized.
Fetching posts from r/OpenAI...
  ‚úì Fetched and saved posts from r/OpenAI
User 88e14804-92ba-4b99-81ab-6af593dadffe has registered.



[1mEmbeddingRateLimiter initialized: enabled=False, requests_limit=60, interval_seconds=60[0m

[2m2025-11-13T16:23:02.639914[0m [[32m[1minfo     [0m] [1mPipeline run started: `0e01cb9b-43ca-5d79-8c4b-1e967f899ad5`[0m [[0m[1m[34mrun_tasks_with_telemetry()[0m][0m

[2m2025-11-13T16:23:02.871485[0m [[32m[1minfo     [0m] [1mCoroutine task started: `resolve_data_directories`[0m [[0m[1m[34mrun_tasks_base[0m][0m

[2m2025-11-13T16:23:03.052900[0m [[32m[1minfo     [0m] [1mCoroutine task started: `ingest_data`[0m [[0m[1m[34mrun_tasks_base[0m][0m

[2m2025-11-13T16:23:03.255469[0m [[32m[1minfo     [0m] [1mRegistered loader: pypdf_loader[0m [[0m[1m[34mcognee.infrastructure.loaders.LoaderEngine[0m][0m

[2m2025-11-13T16:23:03.256150[0m [[32m[1minfo     [0m] [1mRegistered loader: text_loader[0m [[0m[1m[34mcognee.infrastructure.loaders.LoaderEngine[0m][0m

[2m2025-11-13T16:23:03.256621[0m [[32m[1minfo     [0m] [1mRegistered loader: ima

‚úì Reddit posts added to cognee


### Ingest RSS News
Defines a small set of AI/tech RSS feeds.  
Parses each feed, rates each of the latest entries for catchiness (1–10) with an LLM, formats them, and saves them to temp text files.  
Adds all parsed feeds to Cognee and cleans up the temp files.


In [5]:

#########################################################
# News Sources
#########################################################

# minimal rss list you can extend
# Only the uncommented URLs are parsed; uncomment an entry to include that feed.
rss_feeds = [
	# --- Research ---
	"https://export.arxiv.org/rss/cs.LG",           # machine learning
	# "https://export.arxiv.org/rss/cs.AI",           # artificial intelligence
	# "https://export.arxiv.org/rss/cs.CL",           # computation & language
	# "https://export.arxiv.org/rss/cs.CV",           # computer vision
	# "https://export.arxiv.org/rss/stat.ML",         # statistical ML
	# "https://export.arxiv.org/rss/eess.IV",         # image/video systems (vision)

	# --- Major AI labs & research orgs ---
	"https://openai.com/news/rss.xml",
	# "https://ai.googleblog.com/feeds/posts/default?alt=rss",
	# "https://deepmind.google/discover/blog/feed.xml",     # DeepMind
	# "https://research.ibm.com/blog/rss.xml",              # IBM Research
	# "https://blogs.microsoft.com/ai/feed/",               # Microsoft AI blog
	# "https://ai.meta.com/blog/rss.xml",                   # Meta AI
	# "https://huggingface.co/blog/feed.xml",               # Hugging Face

	# --- Industry / analysis / newsletters ---
	# "https://www.technologyreview.com/feed/",             # MIT Tech Review
	# "https://www.therundown.ai/feed",                     # The Rundown AI
	# "https://www.theneurondaily.com/feed",                # The Neuron
	# "https://www.semiaanalysis.com/feed",                 # SemiAnalysis
	# "https://www.oneusefulthing.org/feed",                # Ethan Mollick
	# "https://importai.substack.com/feed",                 # Jack Clark's Import AI
	# "https://www.lesswrong.com/feed.xml",                 # AI safety / rationalist posts
	# "https://alignmentforum.org/feed.xml",                # alignment research forum

	# --- Tech / product news outlets ---
	# "https://techcrunch.com/tag/ai/feed/",
	# "https://www.theverge.com/rss/index.xml",
	# "https://venturebeat.com/category/ai/feed/",
	# "https://www.wired.com/feed/category/ai/latest/rss",  # Wired AI
	# "https://thenextweb.com/feed/",                       # The Next Web
	# "https://spectrum.ieee.org/feed",                     # IEEE Spectrum (engineering AI)

	# --- Policy, ethics, society ---
	# "https://hai.stanford.edu/rss.xml",                   # Stanford HAI
	# "https://www.brookings.edu/topic/artificial-intelligence/feed/",  # Brookings AI policy
	# "https://datainnovation.org/feed/",                   # Center for Data Innovation

	# --- Optional fun / creative ---
	# "https://towardsdatascience.com/feed",                # Medium ML articles
	# "https://www.kdnuggets.com/feed",                     # data science & AI
]


# Parse RSS feeds and save to temp files
print(f"\nParsing {len(rss_feeds)} RSS feeds...")
rss_temp_files = []

for feed_url in rss_feeds:
    try:
        print(f"  Parsing {feed_url}...")
        feed = feedparser.parse(feed_url)
        
        if not feed.entries:
            print(f"  ‚ö†Ô∏è  No entries found")
            continue
        
        # Format feed content
        feed_text = [f"RSS Feed: {feed.feed.get('title', feed_url)}\n"]
        feed_text.append("=" * 80 + "\n")
        
        for entry in feed.entries[:10]:  # Get latest 10 entries
            # Evaluate catchiness using LLM
            title, summary = entry.get('title', 'No title'), entry.get('summary', entry.get('description', 'No summary available'))
            prompt = f"Rate this news catchiness (1-10). Catchiness = controversial + attention-capturing + timeless + major breakthrough.\n\nTitle: {title}\nSummary: {summary[:500]}\n\nReturn ONLY integer 1-10."
            
            client = AsyncOpenAI(api_key=os.getenv("LLM_API_KEY"), timeout=5.0)
            resp = await client.chat.completions.create(model="gpt-4o-mini", messages=[{"role": "user", "content": prompt}], max_tokens=5)
            score = int(resp.choices[0].message.content.strip())
            
            
            feed_text.append(f"""
                Title: {entry.get('title', 'No title')}
                Published: {entry.get('published', entry.get('updated', 'N/A'))}
                Link: {entry.get('link', 'N/A')}
                Catchiness Score: {score}/10

                Summary:
                {entry.get('summary', entry.get('description', 'No summary available'))}

                ---
                """)
        
        # Save to temp file
        temp_file = tempfile.NamedTemporaryFile(
            mode='w', suffix='_rss.txt', delete=False, encoding='utf-8'
        )
        temp_file.write("\n".join(feed_text))
        temp_file.close()
        rss_temp_files.append(temp_file.name)
        print(f"  ‚úì Saved")
        
    except Exception as e:
        print(f"  ‚úó Failed: {e}")

# Add RSS feeds to cognee
if rss_temp_files:
    print("\nAdding RSS feeds to cognee...")
    await cognee.add(rss_temp_files)
    print("‚úì RSS feeds added")
    
    # Cleanup
    for temp_file in rss_temp_files:
        try:
            Path(temp_file).unlink()
        except:
            pass
else:
    print("‚ö†Ô∏è  No RSS feeds were parsed")




Parsing 2 RSS feeds...
  Parsing https://export.arxiv.org/rss/cs.LG...
  ‚úì Saved
  Parsing https://openai.com/news/rss.xml...



[2m2025-11-13T16:23:17.527875[0m [[32m[1minfo     [0m] [1mPipeline run started: `0e01cb9b-43ca-5d79-8c4b-1e967f899ad5`[0m [[0m[1m[34mrun_tasks_with_telemetry()[0m][0m


  ‚úì Saved

Adding RSS feeds to cognee...



[2m2025-11-13T16:23:17.699601[0m [[32m[1minfo     [0m] [1mCoroutine task started: `resolve_data_directories`[0m [[0m[1m[34mrun_tasks_base[0m][0m

[2m2025-11-13T16:23:17.874630[0m [[32m[1minfo     [0m] [1mCoroutine task started: `ingest_data`[0m [[0m[1m[34mrun_tasks_base[0m][0m

[2m2025-11-13T16:23:18.058720[0m [[32m[1minfo     [0m] [1mPipeline run started: `0e01cb9b-43ca-5d79-8c4b-1e967f899ad5`[0m [[0m[1m[34mrun_tasks_with_telemetry()[0m][0m

[2m2025-11-13T16:23:18.232942[0m [[32m[1minfo     [0m] [1mCoroutine task started: `resolve_data_directories`[0m [[0m[1m[34mrun_tasks_base[0m][0m

[2m2025-11-13T16:23:18.412615[0m [[32m[1minfo     [0m] [1mCoroutine task started: `ingest_data`[0m [[0m[1m[34mrun_tasks_base[0m][0m

[2m2025-11-13T16:23:18.639918[0m [[32m[1minfo     [0m] [1mCoroutine task completed: `ingest_data`[0m [[0m[1m[34mrun_tasks_base[0m][0m

[2m2025-11-13T16:23:18.819808[0m [[32m[1minfo     [0m] [1mCo

‚úì RSS feeds added


### Build & Enhance the Graph
`cognify()` extracts entities/relations to build the knowledge graph.  
Saves an initial HTML visualization.  
`memify()` consolidates memory to strengthen connections.  
Saves an enhanced HTML visualization for comparison.  
Note: the cell directly below runs only the `cognify()` and initial-visualization steps.


In [6]:
# extract knowledge from the scraped data
await cognee.cognify()
print("Knowledge graph created.")

# visualize the knowledge graph
simple_graph_visualization_path = str(Path.cwd() / "simple_graph_visualization.html")
await cognee.visualize_graph(simple_graph_visualization_path)
print(f"Data visualized")



[2m2025-11-13T16:23:19.733292[0m [[32m[1minfo     [0m] [1mOntology file 'None' not found. No owl ontology will be attached to the graph.[0m [[0m[1m[34mOntologyAdapter[0m][0m

[2m2025-11-13T16:23:19.779455[0m [[32m[1minfo     [0m] [1mPipeline run started: `520fb50a-9435-5383-be87-3c6dedd16fdf`[0m [[0m[1m[34mrun_tasks_with_telemetry()[0m][0m

[2m2025-11-13T16:23:19.970566[0m [[32m[1minfo     [0m] [1mCoroutine task started: `classify_documents`[0m [[0m[1m[34mrun_tasks_base[0m][0m

[2m2025-11-13T16:23:20.144092[0m [[32m[1minfo     [0m] [1mCoroutine task started: `check_permissions_on_dataset`[0m [[0m[1m[34mrun_tasks_base[0m][0m

[2m2025-11-13T16:23:20.333297[0m [[32m[1minfo     [0m] [1mPipeline run started: `520fb50a-9435-5383-be87-3c6dedd16fdf`[0m [[0m[1m[34mrun_tasks_with_telemetry()[0m][0m

[2m2025-11-13T16:23:20.512001[0m [[32m[1minfo     [0m] [1mCoroutine task started: `classify_documents`[0m [[0m[1m[34mrun_tasks_b

Knowledge graph created.



[2m2025-11-13T16:26:02.673804[0m [[32m[1minfo     [0m] [1mGraph visualization saved as /Users/lstromann/projects/community/experimental/ai_news_agent/simple_graph_visualization.html[0m [[0m[1m[34mcognee.shared.logging_utils[0m][0m

[2m2025-11-13T16:26:02.674589[0m [[32m[1minfo     [0m] [1mThe HTML file has been stored at path: /Users/lstromann/projects/community/experimental/ai_news_agent/simple_graph_visualization.html[0m [[0m[1m[34mcognee.shared.logging_utils[0m][0m


Data visualized


### Cross-Document Query
Runs a GRAPH_COMPLETION search: *“What were the main events in the AI world this week?”*  
Returns a synthesized answer aggregated across Reddit + RSS sources.


In [7]:
# demonstrate cross-document knowledge retrieval from multiple data sources
results = await cognee.search(
    query_text="What were the main events in the AI world this week?",
    query_type=cognee.SearchType.GRAPH_COMPLETION,
)
print(f"Search results: {results[0]}")


[2m2025-11-13T16:26:02.917917[0m [[32m[1minfo     [0m] [1mGraph projection completed: 201 nodes, 566 edges in 0.01s[0m [[0m[1m[34mCogneeGraph[0m][0m

[2m2025-11-13T16:26:03.352478[0m [[32m[1minfo     [0m] [1mVector collection retrieval completed: Retrieved distances from 6 collections in 0.14s[0m [[0m[1m[34mcognee.shared.logging_utils[0m][0m


Search results: This week‚Äôs AI highlights: Reddit discussions on the dark side of AI and concerns about declining AI quality; broader discussion on AI progress and safety recommendations; Wired report that OpenAI‚Äôs open-weight models are being used by the US military; TEDx talk on how artists can protect work from AI; user complaints about missing OpenAI developer/product features; Deezer/Ipsos survey finding 97% of people can‚Äôt tell AI-generated music from human-made; Notion rebuilt its stack with GPT‚Äë5 to enable agentic/autonomous workflows; BBVA scaled ChatGPT Enterprise across the organization (20,000+ Custom GPTs) with reported efficiency gains (article published by OpenAI).


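### Custom Catchiness Visualization
Renders the graph again with the local `visual.py` helper, which colors nodes by source and catchiness score, and saves it as `fancy_graph_visualization.html`.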

In [9]:
# generate the second graph visualization after memory enhancement using custom visual.py
import importlib
import visual
importlib.reload(visual)  # Reload to pick up latest changes
from visual import cognee_network_visualization
from cognee.infrastructure.databases.graph import get_graph_engine

# Get graph data from cognee
graph_engine = await get_graph_engine()
graph_data = await graph_engine.get_graph_data()

# Use custom visualization with catchiness-based brightness
fancy_graph_visualization_path = str(Path.cwd() / "fancy_graph_visualization.html")
await cognee_network_visualization(graph_data, fancy_graph_visualization_path)
print(f"Fancy graph visualized with custom catchiness coloring: {fancy_graph_visualization_path}")


[2m2025-11-13T16:29:55.796230[0m [[32m[1minfo     [0m] [1mGraph visualization saved as /Users/lstromann/projects/community/experimental/ai_news_agent/fancy_graph_visualization.html[0m [[0m[1m[34mcognee.shared.logging_utils[0m][0m



=== DocumentChunk #1 Debug ===
Text preview (first 200 chars): RSS Feed: cs.LG updates on arXiv.org



                Title: A Lightweight CNN-Attention-BiLSTM Architecture for Mult
Catchiness match: <re.Match object; span=(394, 416), match='Catchiness Score: 4/10'>
Catchiness extracted: 4
Checking for 'Subreddit: r/' in text: False
Checking for 'arxiv.org' in text: True
Checking for 'RSS Feed:' in text: True
üü¢ IDENTIFIED AS RESEARCH/ARXIV
Research Feed: cs.LG updates on arXiv.org
Flags - Reddit: False, Research: True, Other: False
üü¢ Base color set to DARK GREEN (Research)
Final color after brightness adjustment (catchiness=4.0): #478f47

=== DocumentChunk #2 Debug ===
Text preview (first 200 chars): RSS Feed: OpenAI News



                Title: Neuro drives national retail wins with ChatGPT Business
               
Catchiness match: <re.Match object; span=(314, 336), match='Catchiness Score: 6/10'>
Catchiness extracted: 6
Checking for 'Subreddit: r/' in text: False
Checking 