In [None]:
import re
import pandas as pd
from typing import List, Dict, Optional
from datetime import datetime, timezone, timedelta
import time
import logging
import hashlib
from urllib.parse import urlparse
import feedparser
from bs4 import BeautifulSoup
import requests
from collections import Counter
import spacy


In [None]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('news_scraper.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Load English language model for NLP
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    logger.error("Spacy model 'en_core_web_sm' not found. Please install it first.")
    logger.info("Run: python -m spacy download en_core_web_sm")
    nlp = None

class NewsScraper:
    """A robust news scraper that collects articles from multiple RSS feeds and websites."""
####################################################################################################################################################









####################################################################################################################################################
    
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'TE': 'Trailers'
        }
        
        # Configure requests session
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.session.max_redirects = 5
        self.timeout = 15
        
        # News sources configuration
        self.news_sources = {
            'Hindustan Times': {
                'rss': 'https://www.hindustantimes.com/feeds/rss/latest-news/rssfeed.xml',
                'web': 'https://www.hindustantimes.com/latest-news',
                'language': 'en',
                'country': 'India'
            },
            'The Hindu': {
                'rss': 'https://www.thehindu.com/feeder/default.rss',
                'web': 'https://www.thehindu.com/latest-news/',
                'language': 'en',
                'country': 'India'
            },
            'Indian Express': {
                'rss': 'https://indianexpress.com/feed/',
                'web': 'https://indianexpress.com/latest-news/',
                'language': 'en',
                'country': 'India'
            },
            'BBC': {
                'rss': 'http://feeds.bbci.co.uk/news/rss.xml',
                'web': 'https://www.bbc.com/news',
                'language': 'en',
                'country': 'UK'
            },
            'CNN': {
                'rss': 'http://rss.cnn.com/rss/cnn_latest.rss',
                'web': 'https://edition.cnn.com/',
                'language': 'en',
                'country': 'USA'
            },
            'Reuters': {
                'rss': 'https://www.reutersagency.com/feed/?best-topics=tech&post_type=best',
                'web': 'https://www.reuters.com/',
                'language': 'en',
                'country': 'International'
            },
            'Al Jazeera': {
                'rss': 'https://www.aljazeera.com/xml/rss/all.xml',
                'web': 'https://www.aljazeera.com/news/',
                'language': 'en',
                'country': 'Qatar'
            },
            'Money Control': {
                'rss': 'https://www.moneycontrol.com/rss/latestnews.xml',
                'web': 'https://www.moneycontrol.com/news/',
                'language': 'en',
                'country': 'India'
            },
            'Aaj Tak': {
                'rss': 'https://www.aajtak.in/feeds/default.rss',
                'web': 'https://www.aajtak.in/',
                'language': 'hi',
                'country': 'India'
            },
            'Zee News': {
                'rss': 'https://zeenews.india.com/rss/india-national-news.xml',
                'web': 'https://zeenews.india.com/',
                'language': 'hi',
                'country': 'India'
            },
            'News18': {
                'rss': 'https://www.news18.com/rss/india.xml',
                'web': 'https://www.news18.com/',
                'language': 'en',
                'country': 'India'
            }
        }
        
        self.cache = set()

    def _get_domain(self, url: str) -> str:
        """Extract domain from URL."""
        parsed = urlparse(url)
        return parsed.netloc.replace('www.', '')

    def _make_request(self, url: str, max_retries: int = 3) -> Optional[requests.Response]:
        """Make HTTP request with retries."""
        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=self.timeout)
                response.raise_for_status()
                if response.status_code == 200:
                    return response
            except requests.exceptions.RequestException as e:
                if attempt == max_retries - 1:
                    logger.error(f"Request failed for {url}: {str(e)}")
                time.sleep(2 ** attempt)
        return None

    def _parse_date(self, date_str: str) -> datetime:
        """Parse various date formats."""
        if not date_str:
            return datetime.min.replace(tzinfo=timezone.utc)
        
        formats = [
            '%Y-%m-%dT%H:%M:%S%z',
            '%a, %d %b %Y %H:%M:%S %z',
            '%Y-%m-%d %H:%M:%S',
            '%a, %d %b %Y %H:%M:%S GMT',
            '%d %b %Y %H:%M:%S GMT',
            '%B %d, %Y %I:%M %p %Z',
            '%Y-%m-%d'
        ]
        
        for fmt in formats:
            try:
                dt = datetime.strptime(date_str, fmt)
                if dt.tzinfo is None:
                    return dt.replace(tzinfo=timezone.utc)
                return dt.astimezone(timezone.utc)
            except ValueError:
                continue
        
        logger.warning(f"Failed to parse date: {date_str}")
        return datetime.min.replace(tzinfo=timezone.utc)

    def _analyze_text(self, text: str) -> Dict[str, float]:
        """Perform NLP analysis on text to extract key topics."""
        if not text or not nlp:
            return {}
            
        doc = nlp(text.lower())
        
        # Extract relevant nouns and proper nouns
        nouns = [
            token.lemma_ for token in doc 
            if token.pos_ in ["NOUN", "PROPN"] 
            and not token.is_stop
            and len(token.text) > 2
        ]
        
        # Count noun frequencies and calculate weights
        noun_counts = Counter(nouns)
        total_nouns = max(1, sum(noun_counts.values()))  # Avoid division by zero
        
        return {
            noun: count/total_nouns 
            for noun, count in noun_counts.most_common(10)
        }

    def fetch_article_content(self, url: str) -> Dict[str, str]:
        """Extract full article content including paragraphs and images."""
        domain = self._get_domain(url)
        response = self._make_request(url)
        
        if not response:
            return {
                'summary': "",
                'image': None,
                'paragraphs': []
            }
            
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Domain-specific content selectors
        content_selectors = {
            'hindustantimes.com': 'div.story-details',
            'thehindu.com': 'div.articlebodycontent',
            'indianexpress.com': 'div.main-content',
            'bbc.com': 'div.article-body',
            'cnn.com': 'div.article__content',
            'reuters.com': 'div.article-body__content',
            'aljazeera.com': 'div.wysiwyg--all-content',
            'moneycontrol.com': 'div.article_section',
            'aajtak.in': 'div.story-with-main-sec',
            'zeenews.india.com': 'div.section-article',
            'news18.com': 'article_story_content'
        }
        
        # Extract paragraphs
        paragraphs = []
        content_selector = content_selectors.get(domain, 'body')
        
        try:
            content = soup.select_one(content_selector) or soup
            for p in content.find_all('p'):
                text = p.get_text(strip=True)
                if text and len(text.split()) > 5:
                    paragraphs.append(text)
        except Exception as e:
            logger.error(f"Error extracting paragraphs: {e}")
        
        # Extract main image
        image = None
        try:
            # Try OpenGraph image first
            og_image = soup.find('meta', property='og:image')
            if og_image and og_image.get('content'):
                image = og_image['content']
            else:
                # Fallback to first large image in content
                for img in soup.find_all('img'):
                    if int(img.get('width', 0)) > 300 or int(img.get('height', 0)) > 200:
                        image = img.get('src')
                        break
        except Exception as e:
            logger.error(f"Error extracting image: {e}")
        
        # Format the full content
        full_content = '\n\n'.join(paragraphs) if paragraphs else ""
        
        return {
            'summary': full_content[:2000] + ('...' if len(full_content) > 2000 else ''),
            'image': image,
            'paragraphs': paragraphs
        }

    def _generate_article_id(self, url: str, title: str) -> str:
        """Generate unique article ID."""
        return hashlib.md5(f"{url}_{title}".encode()).hexdigest()

    def fetch_rss_news(self, source_name: str, limit: int = 5) -> List[Dict]:
        """Fetch and process RSS feed entries."""
        if source_name not in self.news_sources:
            return []
            
        rss_url = self.news_sources[source_name]['rss']
        logger.info(f"Fetching {source_name} RSS feed")
        
        try:
            feed = feedparser.parse(rss_url)
            news_items = []
            
            for entry in feed.entries[:limit]:
                article_id = self._generate_article_id(entry.link, entry.title)
                if article_id in self.cache:
                    continue
                    
                self.cache.add(article_id)
                
                # Skip entries with "Today's news in 10 minutes" in title
                if "Today's news in 10 minutes" in entry.title:
                    continue
                
                # Get full article content
                article_content = self.fetch_article_content(entry.link)
                
                news_items.append({
                    'id': article_id,
                    'title': entry.title,
                    'link': entry.link,
                    'published': entry.get('published', ''),
                    'source': source_name,
                    'language': self.news_sources[source_name].get('language', 'en'),
                    'country': self.news_sources[source_name].get('country', 'Unknown'),
                    'summary': article_content['summary'],
                    'paragraphs': article_content['paragraphs'],
                    'image': article_content['image'],
                })

                
                time.sleep(0.5)  # Rate limiting
                
            return news_items
            
        except Exception as e:
            logger.error(f"Error processing {source_name} feed: {e}")
            return []

    def get_all_news(self, limit_per_source: int = 5) -> List[Dict]:
        """Aggregate news from all sources."""
        all_news = []
        for source in self.news_sources:
            try:
                all_news.extend(self.fetch_rss_news(source, limit_per_source))
            except Exception as e:
                logger.error(f"Error processing {source}: {e}")
                
        # Sort by publication date (newest first)
        all_news.sort(
            key=lambda x: self._parse_date(x.get('published', '')),
            reverse=True
        )
        return all_news

class NewsProcessor:
    """Processes and structures news data with built-in text cleaning and merging."""
    
    def __init__(self, scraper: NewsScraper):
        self.scraper = scraper
        self.df = pd.DataFrame(columns=[
            'Article_ID',
            'News',  # This will be our merged column
            'Source',
            'Language',
            'Country',
            'Published_Time',
            'Access_Time',
            'URL',
            'Image_URL',
        ])
    
    def _clean_and_merge_text(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean text columns and merge them into a single 'News' column."""
        if df.empty:
            return df
            
        # Define phrases to look for
        ignore_phrases = [
            "Stay updated with the latest",
            "Get the latest news",
            "Read more at",
            "For more updates",
            "To learn more",
            "Click here",
            "Disclaimer:",
            "For more lifestyle news",
            'A post shared by',
            'Clickhere to follow',
            '#WATCH',
                    ]
        
        # Text columns to process
        text_columns = ['Headline', 'Full_Content', 'Summary']
        
        # Compile regex pattern once
        pattern = re.compile(rf'({"|".join(re.escape(p) for p in ignore_phrases)}).*', flags=re.IGNORECASE)
        
        # Clean all text columns
        for col in text_columns:
            if col in df.columns:
                df[col] = (
                    df[col]
                    .replace(["NaN", pd.NA, None], " ")
                    .fillna(" ")
                    .astype(str)
                    .apply(lambda x: re.sub(pattern, "", x).strip())
                )
        
        # Merge columns into single 'News' column
        df["News"] = df[text_columns].agg(" ".join, axis=1)
        
        # Remove content after 3 or more consecutive line breaks
        df["News"] = df["News"].apply(lambda x: re.split(r'\n{3,}', x)[0].strip())
        
        return df

    def _generate_summary(self, paragraphs: List[str], max_paragraphs: int = 3) -> str:
        """Generate a summary from article paragraphs."""
        if not paragraphs:
            return ""
            
        summary_paragraphs = paragraphs[:max_paragraphs]
        summary = ' '.join(p.strip() for p in summary_paragraphs)
        return summary[:500] + ('...' if len(summary) > 500 else '')

    def _convert_to_ist(self, dt: datetime) -> datetime:
        """Convert datetime to Indian Standard Time (UTC+5:30)."""
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt.astimezone(timezone(timedelta(hours=5, minutes=30)))

    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean the dataframe by removing unwanted rows and empty content."""
        # Remove rows where Headline contains "Today's news in 10 minutes"
        if 'Headline' in df.columns:
            df = df[~df['Headline'].str.contains("Today's news in 10 minutes", case=False, na=False)]
        
        # Remove rows with empty News content
        if 'News' in df.columns:
            df = df[df['News'].str.strip().astype(bool)]
        
        return df

    def update_dataframe(self, news_items: List[Dict]) -> pd.DataFrame:
        """Update the DataFrame with new articles, clean text, and merge columns."""
        new_entries = []
        current_time = self._convert_to_ist(datetime.now(timezone.utc))
        
        for item in news_items:
            published_time = self._convert_to_ist(self.scraper._parse_date(item.get('published', '')))
            
            # Create entry with all original columns first
            entry = {
                'Article_ID': item['id'],
                'Headline': item['title'],
                'Full_Content': '\n\n'.join(item['paragraphs']),
                'Summary': self._generate_summary(item['paragraphs']),
                'Source': item['source'],
                'Language': item['language'],
                'Country': item['country'],
                'Published_Time': published_time,
                'Access_Time': current_time,
                'URL': item['link'],
                'Image_URL': item['image'],
            }
            new_entries.append(entry)
        
        if new_entries:
            new_df = pd.DataFrame(new_entries)
            # Clean and merge text columns
            new_df = self._clean_and_merge_text(new_df)
            # Remove temporary columns
            new_df.drop(columns=['Headline', 'Full_Content', 'Summary'], inplace=True, errors='ignore')
            # Apply additional cleaning
            new_df = self._clean_dataframe(new_df)
            # Merge with existing data
            self.df = pd.concat([self.df, new_df]).drop_duplicates('Article_ID')
            
        return self.df

    def get_latest_news_df(self, limit: int = 10) -> pd.DataFrame:
        """Get latest news and update DataFrame."""
        news_items = self.scraper.get_all_news(limit_per_source=max(1, limit//len(self.scraper.news_sources)))
        return self.update_dataframe(news_items[:limit])

class ContinuousNewsScraper:
    def __init__(self, scrape_interval_minutes=2):
        self.scraper = NewsScraper()
        self.processor = NewsProcessor(self.scraper)
        self.scrape_interval = scrape_interval_minutes * 60  # Convert to seconds
        self.df_news = pd.DataFrame()  # Master DataFrame
        self.seen_article_ids = set()  # Track duplicates

    def get_new_articles(self, limit=20):
        """Fetch new articles, excluding duplicates."""
        new_news = self.processor.get_latest_news_df(limit=limit)
        
        if new_news.empty:
            return pd.DataFrame()  # No new articles
        
        # Filter out already-seen articles
        new_news = new_news[~new_news['Article_ID'].isin(self.seen_article_ids)]
        
        if not new_news.empty:
            # Update seen articles
            self.seen_article_ids.update(new_news['Article_ID'].tolist())
            return new_news
        return pd.DataFrame()

    def run_continuous_scraping(self, max_iterations=None):
        """Run continuous scraping, appending only new news."""
        iteration = 0
        while True:
            if max_iterations and iteration >= max_iterations:
                break
            
            iteration += 1
            print(f"\n=== Iteration {iteration} | {datetime.now().strftime('%H:%M:%S')} ===")
            
            try:
                new_articles = self.get_new_articles(limit=20)
                
                if not new_articles.empty:
                    self.df_news = pd.concat([self.df_news, new_articles], ignore_index=True)
                    print(f"✅ Added {len(new_articles)} new articles.")
                    print("Latest news:")
                    print(new_articles[['Headline', 'Source', 'Published_Time']].tail(3))
                else:
                    print("🔄 No new articles found.")
                
                # Display stats
                print(f"\n📊 Total articles: {len(self.df_news)}")
                
            except Exception as e:
                print(f"❌ Error: {e}")
            
            time.sleep(self.scrape_interval)
        
        return self.df_news

# if __name__ == "__main__":
#     scraper = ContinuousNewsScraper(scrape_interval_minutes=2)
    
#     try:
#         # Run for 10 iterations (20 minutes)
#         final_df = scraper.run_continuous_scraping(max_iterations=20)
        
#         # Save final results (optional)
#         final_df.to_csv("latest_news_archive.csv", index=False)
#         print("\n✅ Scraping completed. Data saved to 'latest_news_archive.csv'")
    
#     except KeyboardInterrupt:
#         print("\n🛑 Manual stop detected. Saving current data...")
#         final_df = scraper.df_news
#         final_df.to_csv("partial_news_archive.csv", index=False)
#         print("💾 Partial data saved to 'partial_news_archive.csv'")

In [None]:
if __name__ == "__main__":
    scraper = ContinuousNewsScraper(scrape_interval_minutes=2)  # Initialize scraper
    
    try:
        # Run for **1 iteration only** (2 minutes)
        final_df = scraper.run_continuous_scraping(max_iterations=0)  # Typo fixed: max_iterations
        
        # Save results
        # final_df.to_csv("latest_news_archive.csv", index=False)
        # print("\n✅ Scraping completed (single run). Data saved to 'latest_news_archive.csv'")
        # print(final_df.head())  # Preview instead of full print
        
    except KeyboardInterrupt:
        # print("\n🛑 Manual stop detected. Saving partial data...")
        final_df = scraper.df_news
        final_df.to_csv("partial_news_archive.csv", index=False)
        # print("💾 Partial data saved to 'partial_news_archive.csv'")
    
    print("\n⏭️ Ready for the next step!")  # Explicit breakpoint

In [1]:
API_Key = "AIzaSyCO5DDWVA2gOUKsI3eJzu4tHo_yitnpWLU"

In [None]:
import google.generativeai as genai
import pandas as pd
import re
import time
import requests
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont
import textwrap
import random
from urllib.parse import urlparse

# Configuration
API_KEY = API_Key  # Replace with your actual API key
TEMPLATE_FOLDER = r"E:/Intapost_Templates/Square Templetes 2/"
LOGO_PATH = r"E:\Intapost_Templates\Square Templetes 2\9.png"

# Gemini AI Initialization
def initialize_gemini(api_key):
    """Initialize Gemini AI with API key and model selection."""
    genai.configure(api_key=api_key)

    model_names_to_try = [
        'gemini-1.5-flash',
        'gemini-1.5-pro',
        'gemini-pro'
    ]
    
    for model_name in model_names_to_try:
        try:
            model = genai.GenerativeModel(model_name)
            response = model.generate_content("Hello")
            if response.text:
                print(f"✅ Using model: {model_name}")
                return model
        except Exception as e:
            print(f"⚠️ Model {model_name} failed: {str(e)}")
    
    raise ValueError("❌ No working model found. Check API key/model availability.")

# Initialize model globally
try:
    model = initialize_gemini(API_KEY)
except Exception as e:
    print(f"Initialization failed: {e}")
    exit()

# News Processing
ENFORCED_PROMPT = """
Transform news snippets into concise 60-word articles with:

1. STRUCTURE:
Headline: [5-12 words with key terms]
Category: [Predefined category]
State: [Indian state or 'National' or 'International']
Subheading: [Core news in one sentence]
Content: [All key details within 60 words total]

2. CATEGORIES:
- Crime/Legal
- Politics/Government
- Business/Economy
- Health/Medicine
- Education/Research
- Technology/Science
- Environment/Climate
- International
- Human Interest
- Sports/Entertainment

3. RULES:
- Bold key terms: **Rs 500 crore scam**
- Preserve ALL critical facts (names, figures, locations)
- Neutral tone for news, analytical for opinions
- For investigations: Highlight methods → findings → consequences
- For opinions: Start with "PERSPECTIVE:"
"""

def process_news_dataframe(df):
    """End-to-end news processing pipeline."""
    
    def identify_state(text):
        try:
            response = model.generate_content(
                f"Analyze text and return ONLY:\nState: [Indian state or 'National' or 'International']\n\nText: {text}"
            )
            return response.text.strip() if response.text else "National"
        except:
            return "National"

    def identify_category(text):
        try:
            response = model.generate_content(
                f"Classify into: Crime/Legal, Politics/Government, Business/Economy, Health/Medicine, Education/Research, Technology/Science, Environment/Climate, International, Human Interest, Sports/Entertainment. Reply ONLY with category.\n\nText: {text}"
            )
            return response.text.strip() if response.text else "General"
        except:
            return "General"

    def enforce_word_limit(text, limit=60):
        return ' '.join(text.split()[:limit])

    def transform_snippet(snippet):
        try:
            category = identify_category(snippet)
            state = identify_state(snippet)
            time.sleep(1)  # Rate limiting

            response = model.generate_content(
                f"{ENFORCED_PROMPT}\n\nInput News:\n{snippet}\n\nCategory: {category}\nState: {state}"
            )
            time.sleep(1)

            if not response.text:
                return "Headline: \nCategory: \nState: \nSubheading: \nContent: "

            structured = response.text
            if "Content:" in structured:
                parts = structured.split("Content:")
                content = enforce_word_limit(parts[1].strip())
                return f"{parts[0].strip()}\nContent: {content}"
            return structured

        except Exception as e:
            print(f"Processing error: {e}")
            return "Headline: \nCategory: \nState: \nSubheading: \nContent: "

    results = []
    for snippet in df["News"]:
        transformed = transform_snippet(snippet)
        results.append(transformed)
        # print("Processed:", transformed[:100] + "...")

    def extract_fields(text):
        text = re.sub(r'\*+', '', text)
        fields = {
            "Headline": re.search(r'Headline:\s*(.*?)\n', text).group(1).strip() if re.search(r'Headline:', text) else "",
            "Category": re.search(r'Category:\s*(.*?)\n', text).group(1).strip() if re.search(r'Category:', text) else "",
            "State": re.search(r'State:\s*(.*?)\n', text).group(1).strip() if re.search(r'State:', text) else "",
            "Subheading": re.search(r'Subheading:\s*(.*?)\n', text).group(1).strip() if re.search(r'Subheading:', text) else "",
            "Content": re.search(r'Content:\s*(.*)', text).group(1).strip() if re.search(r'Content:', text) else ""
        }
        return fields

    extracted_data = [extract_fields(item) for item in results]
    final_df = pd.concat([df.reset_index(drop=True), pd.DataFrame(extracted_data)], axis=1)
    
    return final_df

# Image Processing Functions
def mm_to_pixels(mm, dpi=600):
    return int(mm * dpi / 25.4)

def load_image_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGBA")
    except Exception as e:
        print(f"Error loading image from URL: {e}")
        return None

def add_edge_transparency(img, fade_percent=20):
    width, height = img.size
    fade_width = int(width * fade_percent / 100)
    fade_height = int(height * fade_percent / 100)
    
    alpha = Image.new('L', (width, height), 255)
    draw = ImageDraw.Draw(alpha)
    
    for x in range(fade_width):
        opacity = int(255 * (x / fade_width))
        draw.line([(x, 0), (x, height)], fill=opacity)
    
    for x in range(width - fade_width, width):
        opacity = int(255 * ((width - x) / fade_width))
        draw.line([(x, 0), (x, height)], fill=opacity)
    
    for y in range(fade_height):
        opacity = int(255 * (y / fade_height))
        draw.line([(0, y), (width, y)], fill=opacity)
    
    for y in range(height - fade_height, height):
        opacity = int(255 * ((height - y) / fade_height))
        draw.line([(0, y), (width, y)], fill=opacity)
    
    img.putalpha(alpha)
    return img

def add_bottom_gradient(img, fade_height_percent=50):
    width, height = img.size
    fade_height = int(height * fade_height_percent / 100)
    
    gradient = Image.new('L', (width, fade_height))
    for y in range(fade_height):
        alpha = int(255 * (1.2 * y / fade_height)**2)
        gradient.paste(alpha, (0, y, width, y+1))
    
    black_layer = Image.new('RGB', (width, fade_height), (0, 0, 0))
    black_layer.putalpha(gradient)
    
    result = img.convert('RGBA')
    result.paste(black_layer, (0, height - fade_height), black_layer)
    return result

def update_instagram_post(image_path, output_path, headline, subheading, subsubheading, 
                         logo_path=None, image_url=None, source="", publish_date="", category="", state=""):
    img = Image.open(image_path).convert("RGB")
    width, height = img.size
    
    if image_url:
        url_image = load_image_from_url(image_url)
        if url_image:
            side_margin = mm_to_pixels(10)
            max_image_width = width - 2 * side_margin
            
            url_img_width, url_img_height = url_image.size
            scale_factor = min(2.5, max_image_width / url_img_width)
            new_width = int(url_img_width * scale_factor)
            new_height = int(url_img_height * scale_factor)
            
            url_image = url_image.resize((new_width, new_height), Image.Resampling.LANCZOS)
            url_image = add_edge_transparency(url_image)
            
            x_position = side_margin + (max_image_width - new_width) // 2
            y_position = (height - new_height) // 2
            
            url_layer = Image.new('RGBA', img.size, (0, 0, 0, 0))
            url_layer.paste(url_image, (x_position, y_position))
            
            img = Image.alpha_composite(img.convert('RGBA'), url_layer)
    
    img = add_bottom_gradient(img, 40)
    
    logo_margin = 5
    if logo_path:
        try:
            logo = Image.open(logo_path).convert("RGBA")
            logo_size = mm_to_pixels(15)
            logo_height = int(logo_size * (logo.size[1] / logo.size[0]))
            logo = logo.resize((logo_size, logo_height), Image.Resampling.LANCZOS)
            
            logo_layer = Image.new('RGBA', img.size, (0, 0, 0, 0))
            logo_layer.paste(logo, (mm_to_pixels(logo_margin), mm_to_pixels(logo_margin)), logo)
            
            img = Image.alpha_composite(img, logo_layer)
        except Exception as e:
            print(f"Error loading logo: {e}")

    draw = ImageDraw.Draw(img)
    
    State_Size = 60
    Category_Size = 120
    Head_Size = 80
    SbHead_Size = 75
    B_Size = 50
    Source_Size = 40
    
    state_color = "red"
    category_color = "white"
    headline_color = "white"
    subheading_color = 'red'
    subsubheading_color = "white" 
    source_color = (200, 200, 200)
    box_fill = (112, 128, 144, 0)
    box_outline = "#000000"
    box_outline_width = mm_to_pixels(0.08)
    text_stroke_width = 2

    category_headline_space = 30
    headline_subheading_space = 60
    subheading_subsubheading_space = 40
    line_spacing = 2.5
    margin_px = 100
    max_text_width = width - (1 + 2 * margin_px)
    radius_px = mm_to_pixels(1.5)
    box_margin = 25
    source_margin = 20

    def load_font(font_paths, size):
        for path in font_paths:
            try:
                return ImageFont.truetype(path, size)
            except:
                continue
        return ImageFont.load_default(size)

    font_paths = [
        "times.ttf",
        "arial.ttf",
        "arialbd.ttf"
    ]
    
    state_font = load_font(font_paths, State_Size)
    category_font = load_font(font_paths, Category_Size)
    headline_font = load_font(font_paths, Head_Size)
    subhead_font = load_font(font_paths, SbHead_Size)
    body_font = load_font(font_paths, B_Size)
    source_font = load_font(font_paths, Source_Size)

    if state:
        state_text = f"{state.capitalize()}"  # "Gujarat" instead of "GUJARAT"
        state_x = mm_to_pixels(logo_margin) + 170  # Aligns with logo's left edge (default: 5mm)
        state_y = mm_to_pixels(logo_margin) + mm_to_pixels(20) - 125  # Below logo
        
        draw.text(
            (state_x, state_y),
            state_text,
            font=state_font,
            fill=state_color,
            anchor="mm" # "la" # Left-aligned
        )

    def get_text_height(text, font):
        if not text or pd.isna(text):
            return 0
        lines = textwrap.wrap(str(text), width=int(max_text_width/(font.size*0.4)))
        return (font.size + line_spacing) * len(lines)

    category_lines = textwrap.wrap(category.upper(), width=int(max_text_width/(Category_Size*0.7)))
    category_height = (Category_Size + line_spacing) * len(category_lines)
    headline_lines = textwrap.wrap(headline.upper(), width=int(max_text_width/(Head_Size*0.7)))
    headline_height = (Head_Size + line_spacing) * len(headline_lines)
    subhead_height = get_text_height(subheading, subhead_font)
    subsub_lines = textwrap.wrap(subsubheading, width=int(max_text_width/(B_Size*0.5)))
    subsub_height = (B_Size + line_spacing) * len(subsub_lines)

    min_content_space = (category_height + category_headline_space +
                        headline_height + headline_subheading_space + 
                        subhead_height + subheading_subsubheading_space + 
                        subsub_height + 2*box_margin)
    
    gradient_top = height - int(height * 0.4)
    available_space = gradient_top - margin_px
    
    if min_content_space > available_space:
        scale_factor = available_space / min_content_space
        Category_Size = int(Category_Size * scale_factor * 0.9)
        Head_Size = int(Head_Size * scale_factor * 0.9)
        SbHead_Size = int(SbHead_Size * scale_factor * 0.9)
        B_Size = int(B_Size * scale_factor * 0.9)
        
        category_font = load_font(font_paths, Category_Size)
        headline_font = load_font(font_paths, Head_Size)
        subhead_font = load_font(font_paths, SbHead_Size)
        body_font = load_font(font_paths, B_Size)
        
        category_lines = textwrap.wrap(category.upper(), width=int(max_text_width/(Category_Size*0.7)))
        category_height = (Category_Size + line_spacing) * len(category_lines)
        headline_lines = textwrap.wrap(headline.upper(), width=int(max_text_width/(Head_Size*0.7)))
        headline_height = (Head_Size + line_spacing) * len(headline_lines)
        subhead_height = get_text_height(str(subheading), subhead_font) if subheading else 0
        subsub_lines = textwrap.wrap(str(subsubheading), width=int(max_text_width/(B_Size*0.5))) if subsubheading else []
        subsub_height = (B_Size + line_spacing) * len(subsub_lines)

    current_y = height - margin_px - (category_height + category_headline_space +
                                    headline_height + headline_subheading_space + 
                                    subhead_height + subheading_subsubheading_space + 
                                    subsub_height + 2*box_margin)
    
    min_y = mm_to_pixels(logo_margin) + (mm_to_pixels(25) if logo_path else 0) + 20
    current_y = max(current_y, min_y)

    box_x1 = margin_px - box_margin
    box_x2 = width - margin_px + box_margin
    box_y1 = current_y + category_height + category_headline_space + headline_height + headline_subheading_space + subhead_height + subheading_subsubheading_space - box_margin
    box_y2 = box_y1 + subsub_height + 2 * box_margin
    box_y2 = min(box_y2, height - 10)

    box_layer = Image.new('RGBA', img.size, (0, 0, 0, 0))
    box_draw = ImageDraw.Draw(box_layer)
    box_draw.rounded_rectangle(
        [box_x1, box_y1, box_x2, box_y2],
        radius=radius_px,
        fill=box_fill,
        outline=box_outline,
        width=box_outline_width
    )
    img = Image.alpha_composite(img, box_layer)
    draw = ImageDraw.Draw(img)

    def draw_text_with_stroke(x, y, text, font, fill, stroke_fill="black", stroke_width=2):
        anchor = 'lm'
        for dx in [-stroke_width, stroke_width]:
            for dy in [-stroke_width, stroke_width]:
                draw.text((x + dx, y + dy), text, font=font, fill=stroke_fill, anchor=anchor)
        draw.text((x, y), text, font=font, fill=fill, anchor=anchor)

    for line in category_lines:
        text_width = category_font.getlength(line)
        text_height = Category_Size
        
        bg_y1 = current_y - (text_height // 3) - 11
        bg_y2 = current_y + (text_height * 2) // 3 - 11
        bg_x1 = margin_px - 10
        bg_x2 = margin_px + text_width + 10
        
        draw.rectangle(
            [bg_x1, bg_y1, bg_x2, bg_y2],
            fill="lime",
            outline=None
        )
        
        draw_text_with_stroke(
            margin_px, current_y, line,
            font=category_font,
            fill=category_color,
            stroke_width=text_stroke_width
        )
        current_y += Category_Size + line_spacing
    
    current_y += category_headline_space

    for line in headline_lines:
        draw_text_with_stroke(
            margin_px, current_y, line,
            font=headline_font,
            fill=headline_color,
            stroke_width=text_stroke_width
        )
        current_y += Head_Size + line_spacing
    
    current_y += headline_subheading_space - line_spacing
    
    for line in textwrap.wrap(subheading, width=int(max_text_width/(SbHead_Size*0.5))):
        draw_text_with_stroke(
            margin_px, current_y, line,
            font=subhead_font,
            fill=subheading_color,
            stroke_width=text_stroke_width
        )
        current_y += SbHead_Size + line_spacing
    
    current_y += subheading_subsubheading_space
    
    def draw_justified(text, font, start_y, color, max_height, original_font_size):
        y = start_y
        current_font_size = original_font_size
        font = load_font(font_paths, current_font_size)
        line_height = current_font_size + line_spacing
        lines = []
        min_font_size = int(original_font_size * 0.6)
    
        wrapped_lines = textwrap.wrap(text, width=int(max_text_width / (current_font_size * 0.5)))
        required_height = len(wrapped_lines) * line_height
    
        if required_height <= max_height:
            lines = wrapped_lines
        else:
            while current_font_size >= min_font_size and required_height > max_height:
                current_font_size -= 1
                font = load_font(font_paths, current_font_size)
                line_height = current_font_size + line_spacing
                wrapped_lines = textwrap.wrap(text, width=int(max_text_width / (current_font_size * 0.5)))
                required_height = len(wrapped_lines) * line_height
    
            lines = wrapped_lines
    
        if required_height > max_height and lines:
            lines = wrapped_lines[:max_height // line_height]
            if lines:
                lines[-1] = lines[-1][:max(0, len(lines[-1])-3)] + "..."
    
        y = start_y
        for line in lines:
            words = line.split()
            if len(words) > 1:
                total_width = sum(font.getlength(word) for word in words)
                space_width = (max_text_width - total_width) / (len(words) - 1)
                x = margin_px
                for word in words[:-1]:
                    draw.text((x, y), word, font=font, fill=color, anchor='lm')
                    x += font.getlength(word) + space_width
                draw.text((x, y), words[-1], font=font, fill=color, anchor='lm')
            else:
                draw.text((margin_px, y), line, font=font, fill=color, anchor='lm')
            y += line_height
    
    draw_justified(subsubheading, body_font, current_y, subsubheading_color, 
                  box_y2 - current_y, B_Size)

    if source or publish_date:
        source_text = f"Source: {source}" if source else ""
        date_text = f"Published: {publish_date}" if publish_date else ""
        info_text = " | ".join(filter(None, [source_text, date_text]))
        
        text_width = source_font.getlength(info_text)
        text_x = width - margin_px - text_width
        text_y = height - margin_px // 2
        
        draw.text(
            (text_x, text_y),
            info_text,
            font=source_font,
            fill=source_color,
            anchor="lm"
        )

    if output_path:
        img.convert("RGB").save(output_path, quality=95)
    return img

# Main Execution
def process_and_generate_images(df):
    # Process news data
    processed_df = process_news_dataframe(df)
    
    # Generate images for each news item
    for i in range(processed_df.shape[0]):
        headline = str(processed_df['Headline'].iloc[i]) if pd.notna(processed_df['Headline'].iloc[i]) else ""
        subheading = str(processed_df['Subheading'].iloc[i]) if pd.notna(processed_df['Subheading'].iloc[i]) else ""
        subsubheading = str(processed_df['Content'].iloc[i]) if pd.notna(processed_df['Content'].iloc[i]) else ""
        source = str(processed_df['Source'].iloc[i]) if pd.notna(processed_df['Source'].iloc[i]) else ""
        publish_date = str(processed_df['Published_Time'].iloc[i]) if pd.notna(processed_df['Published_Time'].iloc[i]) else ""
        category = str(processed_df['Category'].iloc[i]) if pd.notna(processed_df['Category'].iloc[i]) else ""
        state = str(processed_df['State'].iloc[i]) if pd.notna(processed_df['State'].iloc[i]) else ""
        
        # Check if URL is valid
        image_url = processed_df['Image_URL'].iloc[i]
        is_valid_url = False
        if pd.notna(image_url):
            try:
                parsed = urlparse(str(image_url))
                is_valid_url = all([parsed.scheme in ['http','https'], parsed.netloc])
            except:
                is_valid_url = False
        
        # Generate random template path
        template_path = f"{TEMPLATE_FOLDER}{random.randint(1, 6)}.png"
        
        # Create Instagram post
        update_instagram_post(        
            image_path=template_path,
            output_path=f"output_image_{i}.jpg",
            category=category,
            headline=headline,
            subheading=subheading,
            subsubheading=subsubheading,
            logo_path=LOGO_PATH,
            image_url=image_url if is_valid_url else None,
            source=source,
            publish_date=publish_date,
            state=state
        )
        print(f"Generated image for news item {i+1}")

# Example usage (assuming df is your input DataFrame)
# process_and_generate_images(df)

In [None]:
process_and_generate_images(final_df)

## New Model

In [2]:
import google.generativeai as genai
import pandas as pd
import re
import time
import requests
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont
import textwrap
import random
from urllib.parse import urlparse
import logging
import feedparser
from bs4 import BeautifulSoup
import spacy
from collections import Counter
from datetime import datetime, timezone, timedelta
from pytz import timezone
import hashlib
from typing import Dict, List, Optional


In [3]:

# Configuration
API_KEY =  API_Key#"YOUR_API_KEY"  # Replace with your actual API key
TEMPLATE_FOLDER = "E:/Intapost_Templates/Square Templetes 2/"
LOGO_PATH = r"E:\Intapost_Templates\Square Templetes 2\9.png"

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('news_scraper.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Load English language model for NLP
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    logger.error("Spacy model 'en_core_web_sm' not found. Please install it first.")
    logger.info("Run: python -m spacy download en_core_web_sm")
    nlp = None

# Gemini AI Initialization
def initialize_gemini(api_key):
    """Initialize Gemini AI with API key and model selection."""
    genai.configure(api_key=api_key)

    model_names_to_try = [
        'gemini-1.5-flash',
        'gemini-1.5-pro',
        'gemini-pro'
    ]
    
    for model_name in model_names_to_try:
        try:
            model = genai.GenerativeModel(model_name)
            response = model.generate_content("Hello")
            if response.text:
                print(f"✅ Using model: {model_name}")
                return model
        except Exception as e:
            print(f"⚠️ Model {model_name} failed: {str(e)}")
    
    raise ValueError("❌ No working model found. Check API key/model availability.")

# Initialize model globally
try:
    model = initialize_gemini(API_KEY)
except Exception as e:
    print(f"Initialization failed: {e}")
    exit()

# News Processing
ENFORCED_PROMPT = """
Transform news snippets into concise 60-word articles with:

1. STRUCTURE:
Headline: [5-12 words with key terms]
Category: [Predefined category]
State: [Indian state or 'National' or 'International']
Subheading: [Core news in one sentence]
Content: [All key details within 60 words total]

2. CATEGORIES:
- Crime/Legal
- Politics/Government
- Business/Economy
- Health/Medicine
- Education/Research
- Technology/Science
- Environment/Climate
- International
- Human Interest
- Sports/Entertainment

3. RULES:
- Bold key terms: **Rs 500 crore scam**
- Preserve ALL critical facts (names, figures, locations)
- Neutral tone for news, analytical for opinions
- For investigations: Highlight methods → findings → consequences
- For opinions: Start with "PERSPECTIVE:"
"""

class NewsScraper:
    """A robust news scraper that collects articles from multiple RSS feeds."""
    
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'TE': 'Trailers'
        }
        
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.session.max_redirects = 5
        self.timeout = 15
        
        self.news_sources = {
            'Hindustan Times': {'rss': 'https://www.hindustantimes.com/feeds/rss/latest-news/rssfeed.xml'},
            'The Hindu': {'rss': 'https://www.thehindu.com/feeder/default.rss'},
            'Indian Express': {'rss': 'https://indianexpress.com/feed/'},
            'BBC': {'rss': 'http://feeds.bbci.co.uk/news/rss.xml'},
            'CNN': {'rss': 'http://rss.cnn.com/rss/cnn_latest.rss'},
            'Reuters': {'rss': 'https://www.reutersagency.com/feed/?best-topics=tech&post_type=best'},
            'Al Jazeera': {'rss': 'https://www.aljazeera.com/xml/rss/all.xml'},
            'Money Control': {'rss': 'https://www.moneycontrol.com/rss/latestnews.xml'},
            'News18': {'rss': 'https://www.news18.com/rss/india.xml'}
        }
        
        self.cache = set()

    def _get_domain(self, url: str) -> str:
        """Extract domain from URL."""
        parsed = urlparse(url)
        return parsed.netloc.replace('www.', '')

    def _make_request(self, url: str, max_retries: int = 3) -> Optional[requests.Response]:
        """Make HTTP request with retries."""
        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=self.timeout)
                response.raise_for_status()
                if response.status_code == 200:
                    return response
            except requests.exceptions.RequestException as e:
                if attempt == max_retries - 1:
                    logger.error(f"Request failed for {url}: {str(e)}")
                time.sleep(2 ** attempt)
        return None

    def fetch_article_content(self, url: str) -> Dict[str, str]:
        """Extract full article content including paragraphs and images."""
        domain = self._get_domain(url)
        response = self._make_request(url)
        
        if not response:
            return {'summary': "", 'image': None, 'paragraphs': []}
            
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract paragraphs
        paragraphs = []
        try:
            for p in soup.find_all('p'):
                text = p.get_text(strip=True)
                if text and len(text.split()) > 5:
                    paragraphs.append(text)
        except Exception as e:
            logger.error(f"Error extracting paragraphs: {e}")
        
        # Extract main image
        image = None
        try:
            og_image = soup.find('meta', property='og:image')
            if og_image and og_image.get('content'):
                image = og_image['content']
            else:
                for img in soup.find_all('img'):
                    if int(img.get('width', 0)) > 300 or int(img.get('height', 0)) > 200:
                        image = img.get('src')
                        break
        except Exception as e:
            logger.error(f"Error extracting image: {e}")
        
        full_content = '\n\n'.join(paragraphs) if paragraphs else ""
        
        return {
            'summary': full_content[:2000] + ('...' if len(full_content) > 2000 else ''),
            'image': image,
            'paragraphs': paragraphs
        }

    def _generate_article_id(self, url: str, title: str) -> str:
        """Generate unique article ID."""
        return hashlib.md5(f"{url}_{title}".encode()).hexdigest()

    def fetch_rss_news(self, source_name: str, limit: int = 5) -> List[Dict]:
        """Fetch and process RSS feed entries."""
        if source_name not in self.news_sources:
            return []
            
        rss_url = self.news_sources[source_name]['rss']
        logger.info(f"Fetching {source_name} RSS feed")
        
        try:
            feed = feedparser.parse(rss_url)
            news_items = []
            
            for entry in feed.entries[:limit]:
                article_id = self._generate_article_id(entry.link, entry.title)
                if article_id in self.cache:
                    continue
                    
                self.cache.add(article_id)
                
                if "Today's news in 10 minutes" in entry.title:
                    continue
                
                article_content = self.fetch_article_content(entry.link)
                
                news_items.append({
                    'id': article_id,
                    'title': entry.title,
                    'link': entry.link,
                    'published': entry.get('published', ''),
                    'source': source_name,
                    'summary': article_content['summary'],
                    'paragraphs': article_content['paragraphs'],
                    'image': article_content['image'],
                })
                
                time.sleep(0.5)
                
            return news_items
            
        except Exception as e:
            logger.error(f"Error processing {source_name} feed: {e}")
            return []

    def get_all_news(self, limit_per_source: int = 5) -> List[Dict]:
        """Aggregate news from all sources."""
        all_news = []
        for source in self.news_sources:
            try:
                all_news.extend(self.fetch_rss_news(source, limit_per_source))
            except Exception as e:
                logger.error(f"Error processing {source}: {e}")
                
        return all_news

def process_news_data(news_items: List[Dict]) -> pd.DataFrame:
    """Process raw news items into structured DataFrame."""
    processed_news = []
    
    def identify_state(text):
        try:
            response = model.generate_content(
                f"Analyze text and return ONLY:\nState: [Indian state or 'National' or 'International']\n\nText: {text}"
            )
            return response.text.strip() if response.text else "National"
        except:
            return "National"

    def identify_category(text):
        try:
            response = model.generate_content(
                f"Classify into: Crime/Legal, Politics/Government, Business/Economy, Health/Medicine, Education/Research, Technology/Science, Environment/Climate, International, Human Interest, Sports/Entertainment. Reply ONLY with category.\n\nText: {text}"
            )
            return response.text.strip() if response.text else "General"
        except:
            return "General"

    def enforce_word_limit(text, limit=60):
        return ' '.join(text.split()[:limit])

    def transform_snippet(snippet):
        try:
            category = identify_category(snippet)
            state = identify_state(snippet)
            time.sleep(1)  # Rate limiting

            response = model.generate_content(
                f"{ENFORCED_PROMPT}\n\nInput News:\n{snippet}\n\nCategory: {category}\nState: {state}"
            )
            time.sleep(1)

            if not response.text:
                return "Headline: \nCategory: \nState: \nSubheading: \nContent: "

            structured = response.text
            if "Content:" in structured:
                parts = structured.split("Content:")
                content = enforce_word_limit(parts[1].strip())
                return f"{parts[0].strip()}\nContent: {content}"
            return structured

        except Exception as e:
            print(f"Processing error: {e}")
            return "Headline: \nCategory: \nState: \nSubheading: \nContent: "

    for item in news_items:
        snippet = f"{item['title']}\n\n{item['summary']}"
        transformed = transform_snippet(snippet)
        
        def extract_fields(text):
            text = re.sub(r'\*+', '', text)
            fields = {
                "Headline": re.search(r'Headline:\s*(.*?)\n', text).group(1).strip() if re.search(r'Headline:', text) else "",
                "Category": re.search(r'Category:\s*(.*?)\n', text).group(1).strip() if re.search(r'Category:', text) else "",
                "State": re.search(r'State:\s*(.*?)\n', text).group(1).strip() if re.search(r'State:', text) else "",
                "Subheading": re.search(r'Subheading:\s*(.*?)\n', text).group(1).strip() if re.search(r'Subheading:', text) else "",
                "Content": re.search(r'Content:\s*(.*)', text).group(1).strip() if re.search(r'Content:', text) else "",
                "Source": item['source'],
                "Published_Time": item['published'],
                "Image_URL": item['image']
            }
            return fields
        
        processed_news.append(extract_fields(transformed))
    
    return pd.DataFrame(processed_news)

# Image Processing Functions
def mm_to_pixels(mm, dpi=600):
    return int(mm * dpi / 25.4)

def load_image_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGBA")
    except Exception as e:
        print(f"Error loading image from URL: {e}")
        return None

def add_edge_transparency(img, fade_percent=20):
    width, height = img.size
    fade_width = int(width * fade_percent / 100)
    fade_height = int(height * fade_percent / 100)
    
    alpha = Image.new('L', (width, height), 255)
    draw = ImageDraw.Draw(alpha)
    
    for x in range(fade_width):
        opacity = int(255 * (x / fade_width))
        draw.line([(x, 0), (x, height)], fill=opacity)
    
    for x in range(width - fade_width, width):
        opacity = int(255 * ((width - x) / fade_width))
        draw.line([(x, 0), (x, height)], fill=opacity)
    
    for y in range(fade_height):
        opacity = int(255 * (y / fade_height))
        draw.line([(0, y), (width, y)], fill=opacity)
    
    for y in range(height - fade_height, height):
        opacity = int(255 * ((height - y) / fade_height))
        draw.line([(0, y), (width, y)], fill=opacity)
    
    img.putalpha(alpha)
    return img

def add_bottom_gradient(img, fade_height_percent=50):
    width, height = img.size
    fade_height = int(height * fade_height_percent / 100)
    
    gradient = Image.new('L', (width, fade_height))
    for y in range(fade_height):
        alpha = int(255 * (1.2 * y / fade_height)**2)
        gradient.paste(alpha, (0, y, width, y+1))
    
    black_layer = Image.new('RGB', (width, fade_height), (0, 0, 0))
    black_layer.putalpha(gradient)
    
    result = img.convert('RGBA')
    result.paste(black_layer, (0, height - fade_height), black_layer)
    return result

def create_instagram_post(template_path, headline, subheading, subsubheading, 
                         logo_path=None, image_url=None, source="", publish_date="", category="", state=""):
    """Create and display Instagram post without saving to file."""
    img = Image.open(template_path).convert("RGB")
    width, height = img.size
    
    if image_url:
        url_image = load_image_from_url(image_url)
        if url_image:
            side_margin = mm_to_pixels(10)
            max_image_width = width - 2 * side_margin
            
            url_img_width, url_img_height = url_image.size
            scale_factor = min(2.5, max_image_width / url_img_width)
            new_width = int(url_img_width * scale_factor)
            new_height = int(url_img_height * scale_factor)
            
            url_image = url_image.resize((new_width, new_height), Image.Resampling.LANCZOS)
            url_image = add_edge_transparency(url_image)
            
            x_position = side_margin + (max_image_width - new_width) // 2
            y_position = (height - new_height) // 2
            
            url_layer = Image.new('RGBA', img.size, (0, 0, 0, 0))
            url_layer.paste(url_image, (x_position, y_position))
            
            img = Image.alpha_composite(img.convert('RGBA'), url_layer)
    
    img = add_bottom_gradient(img, 40)
    
    logo_margin = 5
    if logo_path:
        try:
            logo = Image.open(logo_path).convert("RGBA")
            logo_size = mm_to_pixels(15)
            logo_height = int(logo_size * (logo.size[1] / logo.size[0]))
            logo = logo.resize((logo_size, logo_height), Image.Resampling.LANCZOS)
            
            logo_layer = Image.new('RGBA', img.size, (0, 0, 0, 0))
            logo_layer.paste(logo, (mm_to_pixels(logo_margin), mm_to_pixels(logo_margin)), logo)
            
            img = Image.alpha_composite(img, logo_layer)
        except Exception as e:
            print(f"Error loading logo: {e}")

    draw = ImageDraw.Draw(img)
    
    State_Size = 65
    Category_Size = 120
    Head_Size = 80
    SbHead_Size = 75
    B_Size = 50
    Source_Size = 40
    
    state_color = "white"
    category_color = "white"
    headline_color = "white"
    subheading_color = 'red'
    subsubheading_color = "white" 
    source_color = (200, 200, 200)
    box_fill = (112, 128, 144, 0)
    box_outline = "#000000"
    box_outline_width = mm_to_pixels(0.08)
    text_stroke_width = 2

    category_headline_space = 30
    headline_subheading_space = 60
    subheading_subsubheading_space = 40
    line_spacing = 2.5
    margin_px = 100
    max_text_width = width - (1 + 2 * margin_px)
    radius_px = mm_to_pixels(1.5)
    box_margin = 25
    source_margin = 20

    def load_font(font_paths, size):
        for path in font_paths:
            try:
                return ImageFont.truetype(path, size)
            except:
                continue
        return ImageFont.load_default(size)

    font_paths = [
        "times.ttf",
        "arial.ttf",
        "arialbd.ttf"
    ]
    
    state_font = load_font(font_paths, State_Size)
    category_font = load_font(font_paths, Category_Size)
    headline_font = load_font(font_paths, Head_Size)
    subhead_font = load_font(font_paths, SbHead_Size)
    body_font = load_font(font_paths, B_Size)
    source_font = load_font(font_paths, Source_Size)

    if state:
        state_text = f"{state.capitalize()}"
        state_x = mm_to_pixels(logo_margin) + 170
        state_y = mm_to_pixels(logo_margin) + mm_to_pixels(20) - 125
        
        draw.text(
            (state_x, state_y),
            state_text,
            font=state_font,
            fill=state_color,
            anchor="mm"
        )

    def get_text_height(text, font):
        if not text or pd.isna(text):
            return 0
        lines = textwrap.wrap(str(text), width=int(max_text_width/(font.size*0.4)))
        return (font.size + line_spacing) * len(lines)

    category_lines = textwrap.wrap(category.upper(), width=int(max_text_width/(Category_Size*0.7)))
    category_height = (Category_Size + line_spacing) * len(category_lines)
    headline_lines = textwrap.wrap(headline.upper(), width=int(max_text_width/(Head_Size*0.7)))
    headline_height = (Head_Size + line_spacing) * len(headline_lines)
    subhead_height = get_text_height(subheading, subhead_font)
    subsub_lines = textwrap.wrap(subsubheading, width=int(max_text_width/(B_Size*0.5)))
    subsub_height = (B_Size + line_spacing) * len(subsub_lines)

    min_content_space = (category_height + category_headline_space +
                        headline_height + headline_subheading_space + 
                        subhead_height + subheading_subsubheading_space + 
                        subsub_height + 2*box_margin)
    
    gradient_top = height - int(height * 0.4)
    available_space = gradient_top - margin_px
    
    if min_content_space > available_space:
        scale_factor = available_space / min_content_space
        Category_Size = int(Category_Size * scale_factor * 0.9)
        Head_Size = int(Head_Size * scale_factor * 0.9)
        SbHead_Size = int(SbHead_Size * scale_factor * 0.9)
        B_Size = int(B_Size * scale_factor * 0.9)
        
        category_font = load_font(font_paths, Category_Size)
        headline_font = load_font(font_paths, Head_Size)
        subhead_font = load_font(font_paths, SbHead_Size)
        body_font = load_font(font_paths, B_Size)
        
        category_lines = textwrap.wrap(category.upper(), width=int(max_text_width/(Category_Size*0.7)))
        category_height = (Category_Size + line_spacing) * len(category_lines)
        headline_lines = textwrap.wrap(headline.upper(), width=int(max_text_width/(Head_Size*0.7)))
        headline_height = (Head_Size + line_spacing) * len(headline_lines)
        subhead_height = get_text_height(str(subheading), subhead_font) if subheading else 0
        subsub_lines = textwrap.wrap(str(subsubheading), width=int(max_text_width/(B_Size*0.5))) if subsubheading else []
        subsub_height = (B_Size + line_spacing) * len(subsub_lines)

    current_y = height - margin_px - (category_height + category_headline_space +
                                    headline_height + headline_subheading_space + 
                                    subhead_height + subheading_subsubheading_space + 
                                    subsub_height + 2*box_margin)
    
    min_y = mm_to_pixels(logo_margin) + (mm_to_pixels(25) if logo_path else 0) + 20
    current_y = max(current_y, min_y)

    box_x1 = margin_px - box_margin
    box_x2 = width - margin_px + box_margin
    box_y1 = current_y + category_height + category_headline_space + headline_height + headline_subheading_space + subhead_height + subheading_subsubheading_space - box_margin
    box_y2 = box_y1 + subsub_height + 2 * box_margin
    box_y2 = min(box_y2, height - 10)

    box_layer = Image.new('RGBA', img.size, (0, 0, 0, 0))
    box_draw = ImageDraw.Draw(box_layer)
    box_draw.rounded_rectangle(
        [box_x1, box_y1, box_x2, box_y2],
        radius=radius_px,
        fill=box_fill,
        outline=box_outline,
        width=box_outline_width
    )
    img = Image.alpha_composite(img, box_layer)
    draw = ImageDraw.Draw(img)

    def draw_text_with_stroke(x, y, text, font, fill, stroke_fill="black", stroke_width=2):
        anchor = 'lm'
        for dx in [-stroke_width, stroke_width]:
            for dy in [-stroke_width, stroke_width]:
                draw.text((x + dx, y + dy), text, font=font, fill=stroke_fill, anchor=anchor)
        draw.text((x, y), text, font=font, fill=fill, anchor=anchor)

    for line in category_lines:
        text_width = category_font.getlength(line)
        text_height = Category_Size
        
        bg_y1 = current_y - (text_height // 3) - 11
        bg_y2 = current_y + (text_height * 2) // 3 - 11
        bg_x1 = margin_px - 10
        bg_x2 = margin_px + text_width + 10
        
        draw.rectangle(
            [bg_x1, bg_y1, bg_x2, bg_y2],
            fill="lime",
            outline=None
        )
        
        draw_text_with_stroke(
            margin_px, current_y, line,
            font=category_font,
            fill=category_color,
            stroke_width=text_stroke_width
        )
        current_y += Category_Size + line_spacing
    
    current_y += category_headline_space

    for line in headline_lines:
        draw_text_with_stroke(
            margin_px, current_y, line,
            font=headline_font,
            fill=headline_color,
            stroke_width=text_stroke_width
        )
        current_y += Head_Size + line_spacing
    
    current_y += headline_subheading_space - line_spacing
    
    for line in textwrap.wrap(subheading, width=int(max_text_width/(SbHead_Size*0.5))):
        draw_text_with_stroke(
            margin_px, current_y, line,
            font=subhead_font,
            fill=subheading_color,
            stroke_width=text_stroke_width
        )
        current_y += SbHead_Size + line_spacing
    
    current_y += subheading_subsubheading_space
    
    def draw_justified(text, font, start_y, color, max_height, original_font_size):
        y = start_y
        current_font_size = original_font_size
        font = load_font(font_paths, current_font_size)
        line_height = current_font_size + line_spacing
        lines = []
        min_font_size = int(original_font_size * 0.6)
    
        wrapped_lines = textwrap.wrap(text, width=int(max_text_width / (current_font_size * 0.5)))
        required_height = len(wrapped_lines) * line_height
    
        if required_height <= max_height:
            lines = wrapped_lines
        else:
            while current_font_size >= min_font_size and required_height > max_height:
                current_font_size -= 1
                font = load_font(font_paths, current_font_size)
                line_height = current_font_size + line_spacing
                wrapped_lines = textwrap.wrap(text, width=int(max_text_width / (current_font_size * 0.5)))
                required_height = len(wrapped_lines) * line_height
    
            lines = wrapped_lines
    
        if required_height > max_height and lines:
            lines = wrapped_lines[:max_height // line_height]
            if lines:
                lines[-1] = lines[-1][:max(0, len(lines[-1])-3)] + "..."
    
        y = start_y
        for line in lines:
            words = line.split()
            if len(words) > 1:
                total_width = sum(font.getlength(word) for word in words)
                space_width = (max_text_width - total_width) / (len(words) - 1)
                x = margin_px
                for word in words[:-1]:
                    draw.text((x, y), word, font=font, fill=color, anchor='lm')
                    x += font.getlength(word) + space_width
                draw.text((x, y), words[-1], font=font, fill=color, anchor='lm')
            else:
                draw.text((margin_px, y), line, font=font, fill=color, anchor='lm')
            y += line_height
    
    draw_justified(subsubheading, body_font, current_y, subsubheading_color, 
                  box_y2 - current_y, B_Size)

    if source or publish_date:
        source_text = f"Source: {source}" if source else ""
        date_text = f"Published: {publish_date}" if publish_date else ""
        info_text = " | ".join(filter(None, [source_text, date_text]))
        
        text_width = source_font.getlength(info_text)
        text_x = width - margin_px - text_width
        text_y = height - margin_px // 2
        
        draw.text(
            (text_x, text_y),
            info_text,
            font=source_font,
            fill=source_color,
            anchor="lm"
        )

    img.show()
    img.save(f"E:\\Intapost_Templates\\Download\\{random.randint(1, 50)}.png")

def generate_images_from_news():
    """Main function to scrape news, process it, and generate images."""
    scraper = NewsScraper()
    news_items = scraper.get_all_news(limit_per_source=2)  # Get 2 articles per source
    
    if not news_items:
        print("No news articles found.")
        return
    
    processed_df = process_news_data(news_items)
    
    for i in range(processed_df.shape[0]):
        headline = str(processed_df['Headline'].iloc[i]) if pd.notna(processed_df['Headline'].iloc[i]) else ""
        subheading = str(processed_df['Subheading'].iloc[i]) if pd.notna(processed_df['Subheading'].iloc[i]) else ""
        subsubheading = str(processed_df['Content'].iloc[i]) if pd.notna(processed_df['Content'].iloc[i]) else ""
        source = str(processed_df['Source'].iloc[i]) if pd.notna(processed_df['Source'].iloc[i]) else ""
        publish_date = str(processed_df['Published_Time'].iloc[i]) if pd.notna(processed_df['Published_Time'].iloc[i]) else ""
        category = str(processed_df['Category'].iloc[i]) if pd.notna(processed_df['Category'].iloc[i]) else ""
        state = str(processed_df['State'].iloc[i]) if pd.notna(processed_df['State'].iloc[i]) else ""
        
        # Check if URL is valid
        image_url = processed_df['Image_URL'].iloc[i]
        is_valid_url = False
        if pd.notna(image_url):
            try:
                parsed = urlparse(str(image_url))
                is_valid_url = all([parsed.scheme in ['http','https'], parsed.netloc])
            except:
                is_valid_url = False
        
        # Generate random template path
        template_path = f"{TEMPLATE_FOLDER}{random.randint(1, 6)}.png"
        
        print(f"\nGenerating image for news item {i+1}: {headline[:50]}...")
        
        # Create and display Instagram post
        # create_instagram_post(
        #     image_path=template_path,
        #     category=category,
        #     headline=headline,
        #     subheading=subheading,
        #     subsubheading=subsubheading,
        #     logo_path=LOGO_PATH,
        #     image_url=image_url if is_valid_url else None,
        #     source=source,
        #     publish_date=publish_date,
        #     state=state
        # )

        create_instagram_post(
            template_path=template_path,  # Changed from image_path to template_path
            category=category,
            headline=headline,
            subheading=subheading,
            subsubheading=subsubheading,
            logo_path=LOGO_PATH,
            image_url=image_url if is_valid_url else None,
            source=source,
            publish_date=publish_date,
            state=state
        )





# API_KEY =  API_Key#"YOUR_API_KEY"  # Replace with your actual API key
# TEMPLATE_FOLDER = r"E:\Intapost_Templates\Square Templetes 2"
# LOGO_PATH = r"E:\Intapost_Templates\Square Templetes 2\9.png"
# "E:\Intapost_Templates\Square Templetes 2\0.png"

✅ Using model: gemini-1.5-flash


In [None]:
if __name__ == "__main__":
    generate_images_from_news()

In [4]:
import threading
import time
from queue import Queue

# Add this class to manage the scraping process
class ScrapingController:
    def __init__(self):
        self.should_run = False
        self.scraping_queue = Queue()
        self.scraper = NewsScraper()
    
    def start_scraping(self):
        self.should_run = True
        print("\n🚀 News scraping started! Generating images every minute...")
        threading.Thread(target=self._continuous_scraping, daemon=True).start()
    
    def stop_scraping(self):
        self.should_run = False
        print("\n🛑 Scraping stopped. No new images will be generated.")
    
    def _continuous_scraping(self):
        while self.should_run:
            try:
                news_items = self.scraper.get_all_news(limit_per_source=2)
                if news_items:
                    processed_df = process_news_data(news_items)
                    self._generate_images(processed_df)
            except Exception as e:
                print(f"⚠️ Error during scraping: {e}")
            
            # Wait for 1 minute before next scrape
            for _ in range(60):
                if not self.should_run:
                    return
                time.sleep(1)
    
    def _generate_images(self, processed_df):
        for i in range(processed_df.shape[0]):
            headline = str(processed_df['Headline'].iloc[i]) if pd.notna(processed_df['Headline'].iloc[i]) else ""
            subheading = str(processed_df['Subheading'].iloc[i]) if pd.notna(processed_df['Subheading'].iloc[i]) else ""
            subsubheading = str(processed_df['Content'].iloc[i]) if pd.notna(processed_df['Content'].iloc[i]) else ""
            source = str(processed_df['Source'].iloc[i]) if pd.notna(processed_df['Source'].iloc[i]) else ""
            publish_date = str(processed_df['Published_Time'].iloc[i]) if pd.notna(processed_df['Published_Time'].iloc[i]) else ""
            category = str(processed_df['Category'].iloc[i]) if pd.notna(processed_df['Category'].iloc[i]) else ""
            state = str(processed_df['State'].iloc[i]) if pd.notna(processed_df['State'].iloc[i]) else ""
            
            image_url = processed_df['Image_URL'].iloc[i]
            is_valid_url = False
            if pd.notna(image_url):
                try:
                    parsed = urlparse(str(image_url))
                    is_valid_url = all([parsed.scheme in ['http','https'], parsed.netloc])
                except:
                    is_valid_url = False
            
            template_path = f"{TEMPLATE_FOLDER}{random.randint(1, 6)}.png"
            
            print(f"\n📰 Generated image for: {headline[:50]}...")
            
            create_instagram_post(
                template_path=template_path,
                category=category,
                headline=headline,
                subheading=subheading,
                subsubheading=subsubheading,
                logo_path=LOGO_PATH,
                image_url=image_url if is_valid_url else None,
                source=source,
                publish_date=publish_date,
                state=state
            )

# Replace the if __name__ == "__main__" block with this:
if __name__ == "__main__":
    controller = ScrapingController()
    
    print("📰 News Scraper with Image Generator")
    print("Type 'start' to begin scraping and 'stop' to end")
    print("Images will be generated every minute while running\n")
    
    while True:
        command = input("Enter command (start/stop/exit): ").strip().lower()
        
        if command == "start":
            if not controller.should_run:
                controller.start_scraping()
            else:
                print("Scraping is already running!")
        
        elif command == "stop":
            if controller.should_run:
                controller.stop_scraping()
            else:
                print("Scraping isn't currently running")
        
        elif command == "exit":
            controller.stop_scraping()
            print("👋 Exiting program...")
            break
        
        else:
            print("Invalid command. Please use 'start', 'stop', or 'exit'")

📰 News Scraper with Image Generator
Type 'start' to begin scraping and 'stop' to end
Images will be generated every minute while running



Enter command (start/stop/exit):  start



🚀 News scraping started! Generating images every minute...


2025-07-16 06:34:53,394 - INFO - Fetching Hindustan Times RSS feed
2025-07-16 06:34:54,375 - INFO - Fetching The Hindu RSS feed
2025-07-16 06:34:56,646 - INFO - Fetching Indian Express RSS feed
2025-07-16 06:34:59,912 - INFO - Fetching BBC RSS feed


Enter command (start/stop/exit):  stop



🛑 Scraping stopped. No new images will be generated.


2025-07-16 06:35:03,852 - INFO - Fetching CNN RSS feed


Enter command (start/stop/exit):  exit



🛑 Scraping stopped. No new images will be generated.
👋 Exiting program...


2025-07-16 06:35:10,961 - INFO - Fetching Reuters RSS feed
2025-07-16 06:35:12,785 - INFO - Fetching Al Jazeera RSS feed
2025-07-16 06:35:15,311 - INFO - Fetching Money Control RSS feed
2025-07-16 06:35:19,291 - ERROR - Request failed for https://www.moneycontrol.com/news/recommendations/buy-hdfc-bank-targetrs-1850-icici-securities_17531671.html: 403 Client Error: Forbidden for url: https://www.moneycontrol.com/news/recommendations/buy-hdfc-bank-targetrs-1850-icici-securities_17531671.html
2025-07-16 06:35:27,069 - ERROR - Request failed for https://www.moneycontrol.com/news/recommendations/buy-tejas-networks-targetrs-1100-emkay-global-financial_17531621.html: 403 Client Error: Forbidden for url: https://www.moneycontrol.com/news/recommendations/buy-tejas-networks-targetrs-1100-emkay-global-financial_17531621.html
2025-07-16 06:35:31,575 - INFO - Fetching News18 RSS feed



📰 Generated image for: Jaishankar Condemns Terrorism at SCO Meeting; UIDA...

📰 Generated image for: Kamaraj's Electoral Triumphs in Tamil Nadu Bye-Ele...

📰 Generated image for: Bombay HC Rejects Plea to Reopen Mumbai's Kabootar...

📰 Generated image for: Rajasthan Court Suspends Sentences of Congress MLA...

📰 Generated image for: MasterChef Host John Torode Sacked After Racist Re...

📰 Generated image for: UK's Secret Afghan Resettlement: 4,500 Relocated A...

📰 Generated image for: NH Mom Wins License Plate Battle: "PB4WEGO"...

📰 Generated image for: Russia-Ukraine War: Day 1238 Key Events...

📰 Generated image for: AI-Fueled Disinformation Intensifies Philippine Po...

📰 Generated image for: ICICI Securities Recommends HDFC Bank Purchase; Rs...

📰 Generated image for: Emkay Global Recommends Tejas Networks Buy, Target...

📰 Generated image for: Supreme Court Upholds Life Sentences in 2003 Benga...

📰 Generated image for: Maharashtra Couple Arrested for Throwing Newborn f...


In [None]:
import gc
from PIL import Image
import numpy as np
from IPython.display import display, clear_output
import tracemalloc

class MemorySafeScraper:
    def __init__(self):
        self.is_running = False
        self.image_counter = 0
        self.max_images = 10  # Limit before cleanup
        tracemalloc.start()
        
    async def generate_safe_images(self, df):
        """Memory-optimized image generation pipeline"""
        for i in range(len(df)):
            if not self.is_running:
                break
                
            try:
                # Process data first
                row = df.iloc[i]
                template_path = f"{TEMPLATE_FOLDER}{random.randint(1, 6)}.png"
                
                # Create image with manual memory control
                img = await self._create_image_with_cleanup(
                    template_path=template_path,
                    category=str(row['Category']),
                    headline=str(row['Headline']),
                    subheading=str(row['Subheading']),
                    subsubheading=str(row['Content']),
                    logo_path=LOGO_PATH,
                    image_url=str(row['Image_URL']) if pd.notna(row['Image_URL']) else None,
                    source=str(row['Source']),
                    publish_date=str(row['Published_Time']),
                    state=str(row['State'])
                )
                
                # Controlled display
                await self._safe_display(img)
                self.image_counter += 1
                
                # Proactive cleanup
                if self.image_counter % 3 == 0:
                    await self._deep_cleanup()
                    
            except Exception as e:
                print(f"⚠️ Error on item {i}: {str(e)[:100]}...")
                await self._emergency_cleanup()

    async def _create_image_with_cleanup(self, **kwargs):
        """Wrapper with memory safeguards"""
        try:
            # Create in separate thread with memory limit
            return await asyncio.to_thread(
                self._create_image, 
                **kwargs
            )
        finally:
            # Immediate cleanup of temporary objects
            gc.collect()
            
    def _create_image(self, **kwargs):
        """Original image creation with pixel buffer management"""
        # Convert template to numpy array for better memory control
        with Image.open(kwargs['template_path']) as template:
            img_array = np.array(template)
            
        # Process image through all original steps
        img = create_instagram_post(**kwargs)
        
        # Convert back to PIL Image and downscale if too large
        if img.size[0] * img.size[1] > 2000*2000:  # If over 4MP
            img = img.resize((int(img.size[0]*0.8), (int(img.size[1]*0.8)), 
                          Image.Resampling.LANCZOS) ) 
        
        return img

    async def _safe_display(self, img):
        """Memory-efficient display"""
        try:
            # Convert to JPEG to reduce memory footprint
            with BytesIO() as buffer:
                img.save(buffer, format='JPEG', quality=85)
                buffer.seek(0)
                display_img = Image.open(buffer)
                
            clear_output(wait=True)
            display(display_img)
            print(f"📰 Displayed image {self.image_counter}")
            
        except Exception as e:
            print(f"🚨 Display error: {str(e)[:100]}...")
        finally:
            del img
            gc.collect()

    async def _deep_cleanup(self):
        """Aggressive memory cleanup"""
        print("🧹 Performing deep cleanup...")
        gc.collect()
        # Clear matplotlib and IPython outputs
        plt.close('all')
        clear_output(wait=False)
        
        # Check memory usage
        current, peak = tracemalloc.get_traced_memory()
        print(f"Memory: {current/10**6:.1f}MB (Peak: {peak/10**6:.1f}MB)")
        
        # Reset counter periodically
        if self.image_counter >= self.max_images:
            print("♻️ Resetting image counter")
            self.image_counter = 0

    async def _emergency_cleanup(self):
        """Critical memory recovery"""
        print("🚨 EMERGENCY MEMORY CLEANUP")
        gc.collect()
        clear_output(wait=False)
        await asyncio.sleep(1)  # Let system recover

# Modified control function
def jupyter_control():
    controller = MemorySafeScraper()
    
    async def start_scraping():
        controller.is_running = True
        scraper = NewsScraper()
        while controller.is_running:
            try:
                news_items = await asyncio.to_thread(
                    scraper.get_all_news, 
                    limit_per_source=1
                )
                if news_items:
                    processed_df = await asyncio.to_thread(
                        process_news_data, 
                        news_items
                    )
                    await controller.generate_safe_images(processed_df)
                    
                await asyncio.sleep(60)  # Throttle scraping
            except Exception as e:
                print(f"🔴 Main error: {str(e)[:100]}...")
                await controller._emergency_cleanup()

    def start(b):
        asyncio.create_task(start_scraping())
    
    def stop(b):
        controller.is_running = False
        print("🛑 Stopping safely...")
        asyncio.create_task(controller._deep_cleanup())

    # Create Jupyter widgets
    from ipywidgets import Button, HBox
    start_btn = Button(description="Start", button_style='success')
    stop_btn = Button(description="Stop", button_style='danger')
    
    start_btn.on_click(start)
    stop_btn.on_click(stop)
    
    display(HBox([start_btn, stop_btn]))

# Run this in Jupyter
jupyter_control()