## Data scrupping

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import os
from urllib.parse import urljoin, urlparse
import time
from datetime import datetime
import logging
import re

class BatchScraper:
    def __init__(self, base_url="https://www.deeplearning.ai/the-batch/"):
        self.base_url = base_url.rstrip('/')
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.google.com/",
    "Connection": "keep-alive",
        })

    def scrape_articles(self, max_articles=400):
        """Scrape articles from The Batch"""
        all_articles = []
        issue_links = self.get_issue_links()

        for issue_url in issue_links:
            articles = self.scrape_single_article(issue_url)
            for article in articles:
                if len(all_articles) >= max_articles:
                    return all_articles
                all_articles.append(article)
            time.sleep(1)

        return all_articles
        """Get all issue URLs"""
        try:
            response = self.session.get(self.base_url)
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.select("a[href*='/the-batch/']")
            issue_links = set()

            for link in links:
                href = link.get('href')
                if href.startswith('/the-batch/') and href.count('/') > 2:                        
                    full_url = urljoin("https://www.deeplearning.ai", href.rstrip('/'))
                    issue_links.add(full_url)

            return list(issue_links)
        except Exception as e:
            logging.error(f"Error retrieving issue links: {e}")
            return []

    def get_all_issue_urls(self):
        issue_links = set()
        links_on_links=set()
        page_num = 1
        
        while True:
            if page_num == 1:
                url = "https://www.deeplearning.ai/the-batch/"  # перша сторінка без /page/1/
            else:
                url = f"{self.base_url}/page/{page_num}/"

            try:
                response = self.session.get(url)
                if response.status_code == 404:
                    display("Page not found, stopping scraping."+url)
                    break  # закінчити цикл, якщо сторінка не знайдена

                soup = BeautifulSoup(response.text, 'html.parser')
                links = soup.select("a[href*='/the-batch/']")

                for link in links:
                    href = link.get('href')
                    if href.startswith('/the-batch/') and href.count('/') > 2 and "the-batch/page/" not in href :
                        if "/tag/" in href:
                            links_on_links.add(urljoin("https://www.deeplearning.ai", href.rstrip('/')))
                        else:
                            full_url = urljoin("https://www.deeplearning.ai", href.rstrip('/'))
                            issue_links.add(full_url)

                page_num += 1
                time.sleep(1)  # пауза в 1 секунду між запитами

            except Exception as e:
                logging.error(f"Error retrieving issue links from {url}: {e}")
                break
        for url in links_on_links:
            try:
                response = self.session.get(url)
                if response.status_code != 200:
                    display(f"Page not found, skipping {url}")
                    continue

                soup = BeautifulSoup(response.text, 'html.parser')
                links = soup.select("a[href*='/the-batch/']")

                for link in links:
                    href = link.get('href')
                    if href.startswith('/the-batch/') and href.count('/') > 2 and "the-batch/page/" not in href and "/tag/" not in href:
                        full_url = urljoin("https://www.deeplearning.ai", href.rstrip('/'))
                        issue_links.add(full_url)

                time.sleep(1)

            except Exception as e:
                logging.error(f"Error processing links_on_links url {url}: {e}")

        json.dump(list(issue_links), open("issue_links.json", "w", encoding="utf-8"), ensure_ascii=False, indent=4)
        json.dump(list(links_on_links), open("links_on_links.json", "w", encoding="utf-8"), ensure_ascii=False, indent=4)
        return list(issue_links)
    
    def scrape_issue_article(self, url):
        """Scrape all articles from a single The Batch issue page"""
        
        articles = []
        try:
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')

            content_blocks = soup.find_all(['h1',"h2",'p', 'img'])
            current_article = {}
            article_started = False
            found_news_heading = False

            last_img = None  # Зберігаємо останнє зображення перед заголовком

            for tag in content_blocks:
                if (tag.name == 'h2' or tag.name =="h1" ) and "news" in tag.get_text(strip=True).lower():
                    found_news_heading = True  # 👈 дозволяємо почати обробку після цього
                    continue  # пропускаємо сам заголовок "news"
                elif not found_news_heading:
                    continue  # ⛔ пропускаємо все до заголовка "news"
                if tag.name == 'img':
                    img_src = tag.get('src')
                    if img_src == "/_next/image/?url=%2F_next%2Fstatic%2Fmedia%2Fdlai-batch-logo.a60dbb9f.png&w=640&q=75":
                        # Skip this article by breaking out of the loop
                        articles = []
                        continue
                    if img_src and not img_src.startswith("data:"):
                        last_img = {
                            'url': urljoin(url, img_src),
                            'alt': tag.get('alt', ''),
                            'caption': self.extract_image_caption(tag)
                        }

                elif tag.name in ['h2', 'h1'] :
                    if article_started and current_article:
                        current_article['url'] = url
                        pub_date_elem = soup.find('div', class_='mt-1 text-slate-600 text-base text-sm')
                        print(f"pub_date_elem: {pub_date_elem}")
                        if pub_date_elem and pub_date_elem.get_text(strip=True):
                            current_article['publication_date'] = pub_date_elem.get_text(strip=True)
                        else:
                            current_article['publication_date'] = str(datetime.now())
                        current_article['scraped_at'] = str(datetime.now())
                        articles.append(current_article)
                        current_article = {}

                    current_article['title'] = tag.get_text(strip=True)
                    current_article['content'] = ""
                    current_article['images'] = []

                    if last_img:
                        current_article['images'].append(last_img)
                        last_img = None  # Очистити, щоб не дублювати в наступній статті

                    article_started = True

                elif tag.name == 'p' and article_started:
                    current_article['content'] += tag.get_text(strip=True) + " "

                elif tag.name == 'img' and article_started:
                    img_src = tag.get('src')
                    if img_src and not img_src.startswith("data:"):
                        img_url = urljoin(url, img_src)
                        current_article['images'].append({
                            'url': img_url,
                            'alt': tag.get('alt', ''),
                            'caption': self.extract_image_caption(tag)
                        })

            if current_article and article_started:
                current_article['url'] = url
                pub_date_elem = soup.find('div', class_='mt-1 text-slate-600 text-base text-sm')
                current_article['publication_date'] = pub_date_elem.get_text(strip=True) or str(datetime.now())
                current_article['scraped_at'] = str(datetime.now())
                articles.append(current_article)

            return articles

        except Exception as e:
            logging.error(f"Error scraping {url}: {e}")
            return []

    def scrape_simple_article(self, url):
        """Scrape a single The Batch article page (title, date, paragraphs, main image)"""
        article = {}

        try:
            response = self.session.get(url)
            if response.status_code != 200:
                logging.error(f"Failed to fetch {url}: {response.status_code}")
                return []

            soup = BeautifulSoup(response.text, 'html.parser')

            # Заголовок
            title_tag = soup.find('h1')
            article['title'] = title_tag.get_text(strip=True) if title_tag else 'No Title'

            # Публікація
            date_tags = soup.find_all('div', class_="inline-flex px-3 py-1 text-sm font-normal transition-colors rounded-md bg-slate-200 hover:bg-slate-300 text-slate-500")
            for tag in date_tags:
                text = tag.get_text(strip=True)
                if re.search(r'\d', text):
                    article['publication_date'] = text
                    break  # зупинимося на першому валідному
                else:
                    article['publication_date'] = None

            # Основне зображення
            main_img_tags = soup.find_all('img', attrs={'alt': True, 'srcset': True})
            for main_img_tag in main_img_tags:
                if "batch-logo."  not in main_img_tag.get('srcset', '') :
                    srcset = main_img_tag.get('srcset', '')
                    
                    # Вибираємо останнє зображення з найвищою роздільністю
                    last_img = srcset.split(',')[-1].strip().split(' ')[0]
                    article['image'] = {
                        'url': urljoin(url, last_img),
                        'alt': main_img_tag.get('alt', '')
                    }
                    break
                else:
                    article['image'] = None

            # Контент
            paragraphs = soup.find_all('p')
            content = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
            article['content'] = content

            # Метадані
            article['url'] = url
            article['scraped_at'] = str(datetime.now())

            # Зберегти у файл
            with open("articles2.json", "w", encoding="utf-8") as f:
                json.dump([article], f, ensure_ascii=False, indent=4)

            return [article]

        except Exception as e:
            logging.error(f"Error scraping {url}: {e}")
            return []

        """Extract publication date from soup"""
        date_elem = soup.find('time')
        return date_elem.get('datetime') if date_elem else None

    def download_images(self, articles, img_dir='data/images'):
        """Download images from articles"""
        os.makedirs(img_dir, exist_ok=True)
        
        for article in articles:
            images = article.get('images', [])
            if not images:
                continue
            for i, img_data in enumerate(images):
                try:
                    response = self.session.get(img_data['url'])
                    if response.status_code == 200:
                        parsed_url = urlparse(img_data['url'])
                        filename = f"{article['title'][:50]}_{i}_{os.path.basename(parsed_url.path)}"
                        filename = "".join(c for c in filename if c.isalnum() or c in '.-_')
                        filepath = os.path.join(img_dir, filename)
                        with open(filepath, 'wb') as f:
                            f.write(response.content)
                        img_data['local_path'] = filepath
                except Exception as e:
                    logging.error(f"Error downloading image {img_data['url']}: {e}")


get links

In [31]:
scraper= BatchScraper()
a=scraper.get_all_issue_urls()

'Page not found, stopping scraping.https://www.deeplearning.ai/the-batch/page/23/'

In [13]:

scraper= BatchScraper()
b=scraper.scrape_issue_article("https://www.deeplearning.ai/the-batch/issue-4")
b=scraper.scrape_simple_article("https://www.deeplearning.ai/the-batch/build-career-part-6")
c=scraper.scrape_simple_article("https://www.deeplearning.ai/the-batch/stability-ai-launches-stable-audio-a-text-to-music-generator-2")

pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Sep 11, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Sep 11, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Sep 11, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Sep 11, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Sep 11, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Sep 11, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Sep 11, 2019</div>


In [14]:

import json
import time
import logging

# Optional: set up logging to file
logging.basicConfig(filename="scrape_errors.log", level=logging.ERROR, format='%(asctime)s %(message)s')

results = []
failed_urls = []

# Load the URLs from JSON file
with open("issue_links.json", "r", encoding="utf-8") as f:
    issue_urls = json.load(f)

for idx, url in enumerate(issue_urls, start=1):
    if idx % 50 == 0:
        print(f"[{idx}/{len(issue_urls)}] Scraping: {url}")
    
    try:
        if "the-batch/issue-" in url:
            articles = scraper.scrape_issue_article(url)
        else:
            articles = scraper.scrape_simple_article(url)
        results.extend(articles)

    except Exception as e:
        logging.error(f"Error scraping {url}: {e}")
        print(f"❌ Failed: {url}")
        failed_urls.append(url)

    # Respectful crawling delay
    time.sleep(1.5)  # adjust to 2–3 seconds if you're hitting rate limits

# Save all successfully scraped articles
with open("scraped_articles.json", "w", encoding="utf-8") as out_f:
    json.dump(results, out_f, ensure_ascii=False, indent=2)

# Save failed URLs for retry/debug
if failed_urls:
    with open("failed_urls.json", "w", encoding="utf-8") as failed_f:
        json.dump(failed_urls, failed_f, ensure_ascii=False, indent=2)

print("✅ Done scraping. Total articles:", len(results))
if failed_urls:
    print("⚠️ Some URLs failed. Check failed_urls.json and scrape_errors.log.")



pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Mar 12, 2025</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Mar 12, 2025</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Mar 12, 2025</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Mar 12, 2025</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Oct 23, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Oct 23, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Oct 23, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Oct 23, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Oct 23, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Oct 23, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm">Oct 23, 2019</div>
pub_date_elem: <div class="mt-1 text-slate-600 text-base text-sm"

In [4]:
scraper = BatchScraper()
with open("scraped_articles.json", "r", encoding="utf-8") as f:
    scraped_articles = json.load(f)
scraper.download_images(scraped_articles, img_dir='imgages')

ERROR:root:Error downloading image https://cdn2.hubspot.net/hub/5871640/hubfs/ISOMORPHIC.gif?upscale=true&width=1200&upscale=true&name=ISOMORPHIC.gif: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
ERROR:root:Error downloading image https://info.deeplearning.ai/hs-fs/hubfs/ezgif.com-gif-maker%20-%202021-06-15T133323.218.gif?width=1200&upscale=true&name=ezgif.com-gif-maker%20-%202021-06-15T133323.218.gif: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
ERROR:root:Error downloading image https://cdn2.hubspot.net/hub/5871640/hubfs/ezgif.com-gif-maker%20-%202021-05-19T123452.995.gif?upscale=true&width=1200&upscale=true&name=ezgif.com-gif-maker%20-%202021-05-19T123452.995.gif: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
ERROR:root:Error downloading image https://info.deeplearning.ai/hs-fs/hubfs/CHIPS.gif?width=1200&upscale=true&name=CHIPS.gif: ('Connect

## data preprocessing

open ai clip vit 32 (little bit week and can only proceed 77 tocken)

In [7]:
import re
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import torch
from io import BytesIO
import requests



class TextPreprocessor:
    def __init__(self):
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def clean_text(self, text):
        """Clean text from HTML, emojis, extra spaces, and known noise"""
        # Remove HTML tags
        text = re.sub(r'<[^<]+?>', '', text)
        # Remove known repetitive marketing phrases
        text = re.sub(r'✨\s*New course!', '', text,flags=re.IGNORECASE)

        # Remove emojis
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # Emoticons
            u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # Transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # Flags
            "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text)
        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def get_clip_text_embedding(self, text):
        """Get CLIP text embedding for a given text (max 77 tokens)"""
        inputs = self.clip_processor(text=[text], return_tensors="pt", padding=True, truncation=True, max_length=77)
        with torch.no_grad():
            text_features = self.clip_model.get_text_features(**inputs)
        return text_features.numpy()

    @staticmethod
    def chunk_text_with_title(article_json, chunk_size=77, overlap=8, tokenizer=None):
        title = article_json.get("title", "")
        content = article_json.get("content", "").strip()

        content_tokens = tokenizer.encode(content, add_special_tokens=False)
        chunks = []
        start = 0

        while start < len(content_tokens):
            end = min(start + chunk_size, len(content_tokens))
            chunk_tokens = content_tokens[start:end]
            chunk_text = tokenizer.decode(chunk_tokens)

            # Додаємо title
            full_chunk = f"Title: {title}\nContent: {chunk_text}"

            # Перевіряємо загальну кількість токенів у цьому об'єднаному тексті
            tokenized_full = tokenizer(full_chunk, return_tensors="pt", truncation=True, max_length=77)
            input_ids = tokenized_full['input_ids'][0]

            # Якщо результат перевищує 77 навіть після обрізки — скоротимо chunk_text
            while len(input_ids) > 77 and len(chunk_tokens) > 5:
                chunk_tokens = chunk_tokens[:-1]
                chunk_text = tokenizer.decode(chunk_tokens)
                full_chunk = f"Title: {title}\nContent: {chunk_text}"
                input_ids = tokenizer(full_chunk, return_tensors="pt")['input_ids'][0]

            chunks.append(full_chunk)
            start += chunk_size - overlap

        return chunks




class ImagePreprocessor:
    def __init__(self):
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    
    def process_image(self, image):
        """Process image and extract features"""
        try:
            
            
            # Resize if too large
            if image.size[0] > 1024 or image.size[1] > 1024:
                image.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
            
            # Extract CLIP features
            inputs = self.clip_processor(images=image, return_tensors="pt")
            
            with torch.no_grad():
                image_features = self.clip_model.get_image_features(**inputs)
                
            return image_features.numpy()
          
            
        except Exception as e:
            print(f"Error processing image {image}: {e}")
            return None
    def download_image_from_url(self,url):
        try:
            response = requests.get(url)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))
            return image
        except Exception as e:
            print(f"Error downloading image: {e}")
            return None



more powerful  laion2B-s32B-b79K

In [1]:
import re
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import torch
from io import BytesIO
import requests


class TextPreprocessor:
    def __init__(self):
        self.clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
        self.clip_processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")

    def clean_text(self, text):
        """Clean text from HTML, emojis, extra spaces, and known noise"""
        # Remove HTML tags
        text = re.sub(r'<[^<]+?>', '', text)
        # Remove known repetitive marketing phrases
        text = re.sub(r'✨\s*New course!', '', text,flags=re.IGNORECASE)

        # Remove emojis
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # Emoticons
            u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # Transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # Flags
            "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text)
        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def get_clip_text_embedding(self, text):
        """Get CLIP text embedding for a given text (max 77 tokens)"""
        inputs = self.clip_processor(text=[text], return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            text_features = self.clip_model.get_text_features(**inputs)
        return text_features.numpy()

    @staticmethod
    def chunk_text_with_title(article_json, chunk_size=256, overlap=16, tokenizer=None):
        title = article_json.get("title", "")
        content = article_json.get("content", "").strip()

        content_tokens = tokenizer.encode(content, add_special_tokens=False)
        chunks = []
        start = 0

        while start < len(content_tokens):
            end = min(start + chunk_size, len(content_tokens))
            chunk_tokens = content_tokens[start:end]
            chunk_text = tokenizer.decode(chunk_tokens)

            # Додаємо title
            full_chunk = f"Title: {title}\nContent: {chunk_text}"

            # Перевіряємо загальну кількість токенів у цьому об'єднаному тексті
            tokenized_full = tokenizer(full_chunk, return_tensors="pt", truncation=True, max_length=77)
            input_ids = tokenized_full['input_ids'][0]

            # Якщо результат перевищує 77 навіть після обрізки — скоротимо chunk_text
            while len(input_ids) > 77 and len(chunk_tokens) > 5:
                chunk_tokens = chunk_tokens[:-1]
                chunk_text = tokenizer.decode(chunk_tokens)
                full_chunk = f"Title: {title}\nContent: {chunk_text}"
                input_ids = tokenizer(full_chunk, return_tensors="pt")['input_ids'][0]

            chunks.append(full_chunk)
            start += chunk_size - overlap

        return chunks




class ImagePreprocessor:
    def __init__(self):
        self.clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
        self.clip_processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
    
    def process_image(self, image):
        """Process image and extract features"""
        try:
            
            
            # Resize if too large
            if image.size[0] > 1024 or image.size[1] > 1024:
                image.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
            
            # Extract CLIP features
            inputs = self.clip_processor(images=image, return_tensors="pt")
            
            with torch.no_grad():
                image_features = self.clip_model.get_image_features(**inputs)
                
            return image_features.numpy()
          
            
        except Exception as e:
            print(f"Error processing image {image}: {e}")
            return None
    def download_image_from_url(self,url):
        try:
            response = requests.get(url)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))
            return image
        except Exception as e:
            print(f"Error downloading image: {e}")
            return None



middle by power 

In [None]:
import re
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import torch
from io import BytesIO
import requests


class TextPreprocessor:
    def __init__(self):
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14-336" )
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14-336" )

    def clean_text(self, text):
        """Clean text from HTML, emojis, extra spaces, and known noise"""
        # Remove HTML tags
        text = re.sub(r'<[^<]+?>', '', text)
        # Remove known repetitive marketing phrases
        text = re.sub(r'✨\s*New course!', '', text,flags=re.IGNORECASE)

        # Remove emojis
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # Emoticons
            u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # Transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # Flags
            "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text)
        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def get_clip_text_embedding(self, text):
        """Get CLIP text embedding for a given text (max 77 tokens)"""
        inputs = self.clip_processor(text=[text], return_tensors="pt", padding=True, truncation=True, max_length=77)
        with torch.no_grad():
            text_features = self.clip_model.get_text_features(**inputs)
        return text_features.numpy()

    @staticmethod
    def chunk_text_with_title(article_json, chunk_size=256, overlap=32, tokenizer=None):
        title = article_json.get("title", "")
        content = article_json.get("content", "").strip()

        content_tokens = tokenizer.encode(content, add_special_tokens=False)
        chunks = []
        start = 0

        while start < len(content_tokens):
            end = min(start + chunk_size, len(content_tokens))
            chunk_tokens = content_tokens[start:end]
            chunk_text = tokenizer.decode(chunk_tokens)

            # Додаємо title
            full_chunk = f"Title: {title}\nContent: {chunk_text}"

            # Перевіряємо загальну кількість токенів у цьому об'єднаному тексті
            tokenized_full = tokenizer(full_chunk, return_tensors="pt", truncation=True)
            input_ids = tokenized_full['input_ids'][0]

            # Якщо результат перевищує 77 навіть після обрізки — скоротимо chunk_text
            while len(input_ids) > 77 and len(chunk_tokens) > 5:
                chunk_tokens = chunk_tokens[:-1]
                chunk_text = tokenizer.decode(chunk_tokens)
                full_chunk = f"Title: {title}\nContent: {chunk_text}"
                input_ids = tokenizer(full_chunk, return_tensors="pt")['input_ids'][0]

            chunks.append(full_chunk)
            start += chunk_size - overlap

        return chunks




class ImagePreprocessor:
    def __init__(self):
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14-336" )
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14-336" )
    
    def process_image(self, image):
        """Process image and extract features"""
        try:
            
            
            # Resize if too large
            if image.size[0] > 1024 or image.size[1] > 1024:
                image.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
            
            # Extract CLIP features
            inputs = self.clip_processor(images=image, return_tensors="pt")
            
            with torch.no_grad():
                image_features = self.clip_model.get_image_features(**inputs)
                
            return image_features.numpy()
          
            
        except Exception as e:
            print(f"Error processing image {image}: {e}")
            return None
    def download_image_from_url(self,url):
        try:
            response = requests.get(url)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))
            return image
        except Exception as e:
            print(f"Error downloading image: {e}")
            return None



In [2]:
import json

with open("scraped_articles.json", "r", encoding="utf-8") as f:
    scraped_articles = json.load(f)

textP = TextPreprocessor()



# Chunk із правильним токенайзером
chunks = TextPreprocessor.chunk_text_with_title(scraped_articles[0],tokenizer=textP.clip_processor.tokenizer)

embeddings = []
for chunk in chunks:
    chunk = textP.clean_text(chunk)
    embedding = textP.get_clip_text_embedding(chunk)
    print(f"Chunk: {chunk} \n Embedding shape: {embedding.shape}")
    embeddings.append(embedding)


Token indices sequence length is longer than the specified maximum sequence length for this model (627 > 77). Running this sequence through the model will result in indexing errors


Chunk: Title: Project Idea — A Car for DinosaursAI projects don’t need to have a meaningful deliverable. Lower the bar and do something creative. Content: enroll inorchestrating workflows for genai applications dear friends, a good way to get started in ai is to start with coursework, which gives a systematic way to gain knowledge, and then to work on projects. for many who hear this advice, “ projects ” may evoke a significant undertaking that delivers value to users. but i encourage you to set a lower bar and relish small, weekend tinkering projects that let you learn, even if they don ’ t result in a meaningful deliverable. recently, my son and daughter ( ages 3 and 5 ) were building lego vehicles. they built a beautiful ice - cream truck as well as a... umm... colorful and asymmetric dinosaur car, shown in the picture above. while most observers would judge the ice - cream truck as the superior creation, my kids built it by following lego ’ sinstructions, and it is likely identical

In [3]:
import requests
from io import BytesIO
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
from IPython.display import display
preprocessor = ImagePreprocessor()

# Приклад використання
image_url = "https://www.deeplearning.ai/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F05%2Funnamed---2024-05-22T145628.272-2.png&w=3840&q=75"  # замініть на ваш URL
image = preprocessor.download_image_from_url(image_url)
img_embedding = None
if image:
    preprocessor = ImagePreprocessor()
    result = preprocessor.process_image(image)
    if result.any():
        print("Embedding shape:", result.shape)
        img_embedding = result


Embedding shape: (1, 1024)


In [4]:
import torch

query_embedding = textP.get_clip_text_embedding("tell something about dinosaurs and lego cars and ai ")
query_embedding_tensor = torch.tensor(query_embedding)  # якщо numpy — перетворюємо в тензор
query_embedding_norm = torch.nn.functional.normalize(query_embedding_tensor, p=2, dim=1)
# Припустимо, img_embedding — numpy масив або тензор розміром [1, 512]
img_embedding_tensor = torch.tensor(img_embedding)  # якщо numpy — перетворюємо в тензор
img_embedding_norm = torch.nn.functional.normalize(img_embedding_tensor, p=2, dim=1)
print(f"similarity between query and image embedding: {torch.nn.functional.cosine_similarity(query_embedding_norm, img_embedding_norm).item()}")
for i, embedding in enumerate(embeddings):
    embedding_tensor = torch.tensor(embedding)
    embedding_norm = torch.nn.functional.normalize(embedding_tensor, p=2, dim=1)
    cos_sim = torch.nn.functional.cosine_similarity(embedding_norm, img_embedding_norm)
    print(f"Chunk {i} cosine similarity with image embedding: {cos_sim.item()}")



similarity between query and image embedding: 0.3655830919742584
Chunk 0 cosine similarity with image embedding: 0.2615649104118347
Chunk 1 cosine similarity with image embedding: 0.38722503185272217
Chunk 2 cosine similarity with image embedding: 0.3290191888809204


In [5]:
import torch

# Припустимо, img_embedding — numpy масив або тензор розміром [1, 512]
img_embedding_tensor = torch.tensor(img_embedding)  # якщо numpy — перетворюємо в тензор
img_embedding_norm = torch.nn.functional.normalize(img_embedding_tensor, p=2, dim=1)

for i, embedding in enumerate(embeddings):

    embedding_tensor_i = torch.tensor(embeddings[i])
    embedding_norm_i = torch.nn.functional.normalize(embedding_tensor_i, p=2, dim=1)
    embedding_tensor = torch.tensor(embeddings[0])
    embedding_norm = torch.nn.functional.normalize(embedding_tensor, p=2, dim=1)
    cos_sim = torch.nn.functional.cosine_similarity(embedding_norm, embedding_norm_i)
    print(f"Chunk {i} cosine similarity with image embedding: {cos_sim.item()}")    

Chunk 0 cosine similarity with image embedding: 1.0000001192092896
Chunk 1 cosine similarity with image embedding: 0.7452438473701477
Chunk 2 cosine similarity with image embedding: 0.8404936790466309


In [6]:
import json

# Завантаження файлу
with open("scraped_articles.json", "r", encoding="utf-8") as f:
    scraped_articles = json.load(f)

# Фільтрація — залишаємо лише ті, де title не дорівнює "Subscribe to The Batch"
scraped_articles = [article for article in scraped_articles if article.get("title") != "Subscribe to The Batch"]

# (Опційно) Збереження очищеного списку назад у файл
with open("scraped_articles_cleaned.json", "w", encoding="utf-8") as f:
    json.dump(scraped_articles, f, ensure_ascii=False, indent=2)


## vectorized store


In [3]:
import chromadb
from chromadb.config import Settings
import numpy as np
import json
import uuid
import torch
from typing import List, Dict, Any, Optional, Tuple

class MultimodalDB:
    def __init__(self, persist_directory: str = "./chroma_db"):
        """Initialize the MultimodalDB with text and image processors and vector store"""
        self.text_processor = TextPreprocessor()
        self.image_processor = ImagePreprocessor()
        self.client = chromadb.PersistentClient(path=persist_directory)
        
        # Create collections for text and image embeddings
        self.text_collection = self.client.get_or_create_collection(
            name="text_embeddings",
            metadata={"hnsw:space": "cosine"}
        )
        
        self.image_collection = self.client.get_or_create_collection(
            name="image_embeddings",
            metadata={"hnsw:space": "cosine"}
        )

    def process_article(self, article: Dict[str, Any]) -> Tuple[List[np.ndarray], List[np.ndarray], Dict[str, Any]]:
        """Process a single article to extract text and image embeddings"""
        # Process text
        text_embeddings = []
        chunks = self.text_processor.chunk_text_with_title(
            article, 
            tokenizer=self.text_processor.clip_processor.tokenizer
        )
        
        text_embeddings = []
        valid_chunks = []
        for chunk in chunks:
            chunk = self.text_processor.clean_text(chunk)
            embedding = self.text_processor.get_clip_text_embedding(chunk)
            if embedding is not None:
                text_embeddings.append(embedding)
                valid_chunks.append(chunk)

        # Process images
        image_embeddings = []
        for img_data in article.get('images', []):
            image = self.image_processor.download_image_from_url(img_data['url'])
            if image:
                embedding = self.image_processor.process_image(image)
                if embedding is not None:
                    image_embeddings.append(embedding)
                    img_data['processed'] = True
                else:
                    img_data['processed'] = False
            else:
                img_data['processed'] = False

        # Create metadata
        metadata = {
            'article_url': article.get('url', ''),
            'title': article.get('title', ''),
            'publication_date': article.get('publication_date', ''),
            'processed_at': article.get('scraped_at', '')
        }

        return text_embeddings, image_embeddings, metadata,valid_chunks

    def add_article(self, article: Dict[str, Any]) -> Dict[str, List[str]]:
        """Add an article to the database, processing both text and images"""
        text_embeddings, image_embeddings, metadata,chunks = self.process_article(article)
        
        # Add text embeddings
        text_ids = []
        for i, embedding in enumerate(text_embeddings):
            chunk_id = str(uuid.uuid4())
            chunk_metadata = {
                **metadata,
                'chunk_index': i,
                'total_chunks': len(text_embeddings),
                'chunk_text': chunks[i] if i < len(chunks) else ''
            }
            
            self.text_collection.add(
                embeddings=[embedding.flatten().tolist()],
                metadatas=[chunk_metadata],
                ids=[chunk_id],
                documents=[chunks[i]] 
            )
            text_ids.append(chunk_id)

        # Add image embeddings
        image_ids = []
        for i, embedding in enumerate(image_embeddings):
            image_id = str(uuid.uuid4())
            image_metadata = {
                **metadata,
                'image_index': i,
                'image_url': article['images'][i]['url'] if i < len(article['images']) else '',
                'total_images': len(image_embeddings)
            }
            
            self.image_collection.add(
                embeddings=[embedding.flatten().tolist()],
                metadatas=[image_metadata],
                ids=[image_id]
            )
            image_ids.append(image_id)

        return {
            'text_ids': text_ids,
            'image_ids': image_ids
        }

    def search(self, query: str, n_results: int = 5, modality: str = 'both') -> Dict[str, Any]:
        """Search the database using a text query"""
        # Get query embedding
        query_text_embedding = self.text_processor.get_clip_text_embedding(query)
        
        results = {}
        
        if modality in ['text', 'both']:
            # Search text collection
            text_results = self.text_collection.query(
                query_embeddings=[query_text_embedding.flatten().tolist()],
                n_results=n_results
            )
            results['text_results'] = text_results
        
        if modality in ['image', 'both']:
            # Search image collection using the same text embedding
            image_results = self.image_collection.query(
                query_embeddings=[query_text_embedding.flatten().tolist()],
                n_results=n_results
            )
            results['image_results'] = image_results
        
        return results

    def add_articles_batch(self, articles: List[Dict[str, Any]]) -> List[Dict[str, List[str]]]:
        """Process and add multiple articles in batch"""
        results = []
        for article in articles:
            try:
                article_ids = self.add_article(article)
                results.append(article_ids)
            except Exception as e:
                print(f"Error processing article {article.get('title', 'Unknown')}: {e}")
                results.append({'text_ids': [], 'image_ids': []})
                import traceback
                traceback.print_exc()
        return results

In [4]:
# Initialize the MultimodalDB
db = MultimodalDB(persist_directory="./multimodal_db")

In [5]:


# Load the cleaned articles
with open("scraped_articles_cleaned.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

# Process articles in batch
print("Processing articles...")
results = db.add_articles_batch(articles[:50])  # Start with first 5 articles as a test
print(f"Processed {len(results)} articles")

# Try a search query
query = "AI and machine learning applications"
search_results = db.search(query, n_results=3)

print("\nText Results:")
for i, (doc_id, metadata) in enumerate(zip(search_results['text_results']['ids'], search_results['text_results']['metadatas'])):
   display(metadata)
print("\nImage Results:")
for i, (doc_id, metadata) in enumerate(zip(search_results['image_results']['ids'], search_results['image_results']['metadatas'])):
    display(metadata)


Token indices sequence length is longer than the specified maximum sequence length for this model (627 > 77). Running this sequence through the model will result in indexing errors


Processing articles...
Processed 50 articles

Text Results:


[{'article_url': 'https://www.deeplearning.ai/the-batch/issue-292',
  'chunk_index': 0,
  'processed_at': '2025-06-13 02:07:08.680546',
  'publication_date': 'Mar 12, 2025',
  'title': 'Judge Upholds Copyright in AI Training Case',
  'total_chunks': 2},
 {'article_url': 'https://www.deeplearning.ai/the-batch/issue-292',
  'chunk_index': 0,
  'chunk_text': "Title: Judge Upholds Copyright in AI Training Case Content: a united states court delivered a major ruling that begins to answer the question whether, and under what conditions, training an ai system on copyrighted material is considered fair use that doesn ’ t require permission. what ’ s new : a u. s. circuit judgeruledon a claim by the legal publisher thomson reuters that ross intelligence, an ai - powered legal research service, could not claim that training its ai system on materials owned by thomson reuters was a so - called “ fair use.” training the system did not qualify as fair use, he decided, because its output competed wi


Image Results:


[{'article_url': 'https://www.deeplearning.ai/the-batch/issue-xi',
  'image_index': 0,
  'processed_at': '2025-06-13 02:07:35.338519',
  'publication_date': 'Jun 26, 2019',
  'title': 'Between Consenting Electrons',
  'total_images': 1},
 {'article_url': 'https://www.deeplearning.ai/the-batch/issue-xi',
  'image_index': 0,
  'processed_at': '2025-06-13 02:07:35.338519',
  'publication_date': 'Jun 26, 2019',
  'title': 'Between Consenting Electrons',
  'total_images': 1},
 {'article_url': 'https://www.deeplearning.ai/the-batch/issue-xi',
  'image_index': 0,
  'image_url': 'https://dl-staging-website.ghost.io/content/images/2022/09/c55611c3-6f6f-4ef2-844d-ae193d57cfc4.png',
  'processed_at': '2025-06-13 02:07:35.338519',
  'publication_date': 'Jun 26, 2019',
  'title': 'Between Consenting Electrons',
  'total_images': 1}]

In [3]:
# Try a search query
db = MultimodalDB(persist_directory="./multimodal_db")

In [6]:

query = "tiny toy dinosaur on lego car" 
search_results = db.search(query, n_results=3)

print("\nText Results:")
for i, (doc_id, metadata) in enumerate(zip(search_results['text_results']['ids'], search_results['text_results']['metadatas'])):
   display(metadata)
print("\nImage Results:")
for i, (doc_id, metadata) in enumerate(zip(search_results['image_results']['ids'], search_results['image_results']['metadatas'])):
    display(metadata)


Text Results:


[{'article_url': 'https://www.deeplearning.ai/the-batch/project-idea-a-car-for-dinosaurs',
  'chunk_index': 1,
  'processed_at': '2025-06-13 02:06:44.103029',
  'publication_date': 'May 22, 2024',
  'title': 'Project Idea — A Car for DinosaursAI projects don’t need to have a meaningful deliverable. Lower the bar and do something creative.',
  'total_chunks': 3},
 {'article_url': 'https://www.deeplearning.ai/the-batch/project-idea-a-car-for-dinosaurs',
  'chunk_index': 1,
  'chunk_text': 'Title: Project Idea — A Car for DinosaursAI projects don’t need to have a meaningful deliverable. Lower the bar and do something creative. Content: ’ designs ( with permission ) and coming up with your own. as a parent, i try to celebrate both. ( to be honest, i celebrated the dinosaur car more.) when learning to build lego, it ’ s helpful to start by following a template. but eventually, building your own unique projects enriches your skills. as a developer, too, i try to celebrate unique creations. y


Image Results:


[{'article_url': 'https://www.deeplearning.ai/the-batch/issue-230',
  'image_index': 0,
  'image_url': 'https://dl-staging-website.ghost.io/content/images/2024/01/EVILGPT_1200px-1.gif',
  'processed_at': '2025-06-13 02:07:49.940877',
  'publication_date': 'Jan 3, 2024',
  'title': 'GPT-4 Wouldn’t Lie to Me . . . Would It?',
  'total_images': 1},
 {'article_url': 'https://www.deeplearning.ai/the-batch/issue-230',
  'image_index': 0,
  'processed_at': '2025-06-13 02:07:49.940877',
  'publication_date': 'Jan 3, 2024',
  'title': 'GPT-4 Wouldn’t Lie to Me . . . Would It?',
  'total_images': 1},
 {'article_url': 'https://www.deeplearning.ai/the-batch/issue-230',
  'image_index': 0,
  'processed_at': '2025-06-13 02:07:49.940877',
  'publication_date': 'Jan 3, 2024',
  'title': 'GPT-4 Wouldn’t Lie to Me . . . Would It?',
  'total_images': 1}]

## retrive

In [7]:

import json
from typing import List, Dict, Any

class MultimodalRetriever:
    def __init__(self, vector_store: MultimodalDB):
        self.vector_store = vector_store
    
    def retrieve(self, query: str, n_results: int = 5) -> Dict[str, Any]:
        """Retrieve relevant content for a given query"""
        
        # Perform hybrid search
        search_results = self.vector_store.search(query, n_results)
        with open("retrieval_results.json", "w", encoding="utf-8") as f:
            json.dump(search_results, f, ensure_ascii=False, indent=2)
        
        # Process and rank results
        processed_results = self.process_search_results(search_results, query, n_results)
        
        return processed_results
    
    def process_search_results(self, search_results: Dict, query: str,n_results:int) -> Dict[str, Any]:
        """Process and structure search results"""
        
        text_results = search_results.get('text_results', {})
        image_results = search_results.get('image_results', {})
        
        # Structure text results
        text_chunks = []
        if text_results.get('documents'):
            for i, (doc, metadata, distance) in enumerate(zip(
                text_results['documents'][0],
                text_results['metadatas'][0],
                text_results['distances'][0]
            )):
                text_chunks.append({
                    'content': doc,
                    'metadata': metadata,
                    'relevance_score': 1 - distance,  # Convert distance to similarity
                    'type': 'text'
                })
        
        # Structure image results
        image_items = []
        if image_results.get('metadatas'):
            for metadata, distance in zip(
                image_results['metadatas'][0],
                image_results['distances'][0]
            ):
                image_items.append({
                    'metadata': metadata,
                    'relevance_score': 1 - distance,
                    'type': 'image'
                })
        
        # Combine and sort by relevance
        all_results = text_chunks + image_items
        all_results.sort(key=lambda x: x['relevance_score'], reverse=True)
        
        return {
            'query': query,
            'results': all_results[:n_results],
            'total_found': len(all_results)
        }
    
    def get_context_for_llm(self, retrieval_results: Dict[str, Any]) -> str:
        """Format retrieval results as context for LLM"""
        
        context_parts = []
        context_parts.append(f"Query: {retrieval_results['query']}\n")
        context_parts.append("Relevant Information:\n")
        
        for i, result in enumerate(retrieval_results['results'][:5], 1):
            if result['type'] == 'text':
                context_parts.append(f"{i}. {result['content']}")
                if 'title' in result['metadata']:
                    context_parts.append(f"   Source: {result['metadata']['title']}")
            elif result['type'] == 'image':
                context_parts.append(f"{i}. Image: {result['metadata'].get('description', 'No description')}")
                if 'alt' in result['metadata']:
                    context_parts.append(f"   Alt text: {result['metadata']['alt']}")
                    
            context_parts.append("")  # Empty line for separation
        
        return "\n".join(context_parts)

In [8]:
retrive= MultimodalRetriever(vector_store=db)
query= "tell something about dinosaurs and lego cars and ai "
retrivial=retrive.retrieve(query, n_results=3)
print("Retrieval Results:")

print(retrivial)
context=retrive.get_context_for_llm(retrivial)
print("\nContext for LLM:")
print(context)

Retrieval Results:
{'query': 'tell something about dinosaurs and lego cars and ai ', 'results': [{'content': None, 'metadata': {'article_url': 'https://www.deeplearning.ai/the-batch/project-idea-a-car-for-dinosaurs', 'chunk_index': 1, 'processed_at': '2025-06-13 02:06:44.103029', 'publication_date': 'May 22, 2024', 'title': 'Project Idea — A Car for DinosaursAI projects don’t need to have a meaningful deliverable. Lower the bar and do something creative.', 'total_chunks': 3}, 'relevance_score': 0.6465099453926086, 'type': 'text'}, {'content': 'Title: Project Idea — A Car for DinosaursAI projects don’t need to have a meaningful deliverable. Lower the bar and do something creative. Content: ’ designs ( with permission ) and coming up with your own. as a parent, i try to celebrate both. ( to be honest, i celebrated the dinosaur car more.) when learning to build lego, it ’ s helpful to start by following a template. but eventually, building your own unique projects enriches your skills. as

## llm integration

In [None]:
from anthropic import Anthropic
import os
from openai import OpenAI
from typing import Dict, List, Optional
from dotenv import load_dotenv


class LLMIntegration:
    def __init__(self, provider="openai", model="gpt-3.5-turbo"):
        load_dotenv()
        self.provider = provider
        self.model = model
        
        if provider == "openai":
            self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        elif provider == "anthropic":
            self.anthropic = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
    
    def generate_response(self, query: str, context: str, max_tokens: int = 500) -> str:
        """Generate response using LLM"""
        
        system_prompt = """You are an AI assistant that helps users find information from The Batch news articles. 
        Use the provided context to answer questions accurately. If the context doesn't contain enough information 
        to answer the question, say so clearly. Always cite which articles or sources you're referencing."""
        
        user_prompt = f"""
        Context from The Batch articles:
        {context}
        
        User Question: {query}
        
        Please provide a comprehensive answer based on the context above.
        """
        
        if self.provider == "openai":
            return self._generate_openai_response(system_prompt, user_prompt, max_tokens)
        elif self.provider == "anthropic":
            return self._generate_anthropic_response(system_prompt, user_prompt, max_tokens)
        else:
            return "LLM provider not supported."
    
    def _generate_openai_response(self, system_prompt: str, user_prompt: str, max_tokens: int) -> str:
        """Generate response using OpenAI"""
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=max_tokens,
                temperature=0.3
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"Error generating response: {str(e)}"
    
    def _generate_anthropic_response(self, system_prompt: str, user_prompt: str, max_tokens: int) -> str:
        """Generate response using Anthropic Claude"""
        try:
            response = self.anthropic.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=max_tokens,
                temperature=0.3,
                system=system_prompt,
                messages=[
                    {"role": "user", "content": user_prompt}
                ]
            )
            return response.content[0].text
        except Exception as e:
            return f"Error generating response: {str(e)}"
    
    def summarize_article(self, article_content: str) -> str:
        """Generate summary of an article"""
        prompt = f"""
        Please provide a concise summary of the following article from The Batch:
        
        {article_content}
        
        Summary should be 2-3 sentences highlighting the key points.
        """
        
        return self.generate_response("Summarize this article", prompt, max_tokens=200)

In [None]:
llm= LLMIntegration()
llm_response = llm.generate_response(query, context)

In [None]:
print("\nLLM Response:")
print(llm_response)


LLM Response:
Based on the context provided from The Batch article "Project Idea — A Car for Dinosaurs," the article discusses the creative aspect of AI projects and the process of building unique projects using LEGO cars as an analogy. The author emphasizes the importance of celebrating unique creations in both LEGO building and software development. They mention how starting with templates is helpful but building one's own projects enriches skills in both LEGO and software development.

Furthermore, the author shares their personal experience of tinkering with AI applications, highlighting that not all projects need to have a meaningful deliverable. They mention an example of building a Streamlit app that interacts with Google Docs using a large language model, even though the project did not result in a useful outcome due to user interface issues.

In summary, the article encourages creativity and experimentation in AI projects, drawing parallels between building LEGO cars, celebra