In [1]:
# AgTech Link Summarizer


# @title Step 1: Install Required Libraries
# The "!" prefix runs each line as a shell command from the notebook;
# -q keeps pip's output short.
# Core packages: tabular data, HTTP requests, HTML parsing, and .xlsx writing.
!pip install pandas requests beautifulsoup4 openpyxl -q
# Summarization library, pinned to an exact release.
!pip install sumy==0.11.0 -q  # Specific version that works reliably
# SerpAPI client package; it provides the `serpapi` module imported in Step 2.
!pip install google-search-results -q
# Tokenizer model data for NLTK.
!python -m nltk.downloader punkt  # Download NLTK tokenizer data

# @title Step 2: Import Libraries with NLTK Setup
import pandas as pd
from serpapi import GoogleSearch
from bs4 import BeautifulSoup
import requests
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from datetime import datetime
from google.colab import files
import re
import nltk

# Verify NLTK tokenizer data is properly downloaded.
# 'punkt' is the classic tokenizer model. Recent NLTK releases look up
# 'punkt_tab' instead, so fetch both; without the one the installed NLTK
# expects, tokenization raises LookupError and every summary silently drops to
# the truncated-text fallback.
# NOTE(review): on older NLTK versions 'punkt_tab' may not exist in the
# download index; nltk.download() then reports the problem and returns False
# without raising, which is harmless here.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        # The first attempt left nothing usable; force a fresh download.
        nltk.download(_resource, force=True)

# @title Step 3: Define Functions
def get_agtech_links():
    """Fetch up to five AgTech news links via SerpAPI, with a static fallback.

    The SerpAPI key is read from the ``SERPAPI_API_KEY`` environment variable
    instead of being embedded in the notebook, so the secret is not shared
    along with the code.
    NOTE(review): any key that was previously committed in this file should be
    treated as leaked and revoked.

    Returns:
        list[str]: Up to five result URLs. The built-in fallback list is
        returned when no API key is configured, when the search call raises,
        or when the response holds no usable organic results (an API-level
        error such as a rejected key is presumably reported inside the
        response body rather than raised -- confirm against SerpAPI docs).
    """
    import os  # local import keeps this block self-contained

    max_links = 5
    fallback_links = [
        "https://www.agritechtomorrow.com",
        "https://www.agriculture.com/technology",
        "https://www.precisionag.com",
        "https://www.agweb.com/news/business/technology",
        "https://agtechnews.com",
    ]

    api_key = os.environ.get("SERPAPI_API_KEY", "").strip()
    if not api_key:
        print("  SERPAPI_API_KEY is not set; using the fallback link list.")
        return fallback_links

    params = {
        "q": "AgTech news OR precision agriculture OR smart farming",
        "hl": "en",
        "gl": "us",
        "num": str(max_links),
        "api_key": api_key,
    }

    try:
        results = GoogleSearch(params).get_dict()
    except Exception as exc:
        # Best-effort: any client or network failure falls back to the static list.
        print(f"  Search failed ({str(exc)[:100]}); using the fallback link list.")
        return fallback_links

    organic = results.get("organic_results", []) if isinstance(results, dict) else []
    # Skip malformed entries instead of discarding the whole result set.
    links = [
        item["link"]
        for item in organic
        if isinstance(item, dict) and item.get("link")
    ]
    return links[:max_links] or fallback_links

def clean_text(text):
    """Clean and normalize text so it can be written safely to an Excel cell.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields ``""``.

    Returns:
        str: Printable-ASCII text with single spaces between words, no leading
        or trailing whitespace, and at most 3000 characters.
    """
    if not isinstance(text, str):
        return ""
    # Replace non-ASCII *and* ASCII control characters with a space; control
    # characters such as \x00 are rejected by openpyxl when writing the sheet.
    text = re.sub(r'[^\x20-\x7E]+', ' ', text)
    # Collapse whitespace after the substitution so the inserted spaces cannot
    # leave double spaces behind.
    text = ' '.join(text.split())
    # Limit length; strip again because the cut may land right after a space.
    return text[:3000].rstrip()

def summarize_url(url):
    """Download a web page and return a short summary of its paragraph text.

    The text of the first ten ``<p>`` elements (after dropping script, style,
    nav, footer and iframe elements) is cleaned with ``clean_text`` and reduced
    to at most two sentences with sumy's ``LsaSummarizer``.

    Args:
        url: Address of the page to summarize.

    Returns:
        str: The summary. If summarization fails or yields nothing, the cleaned
        text is returned, cut to 500 characters with a trailing "..." when it
        was longer. If the page cannot be fetched or parsed, or contains no
        paragraph text, a message starting with "Could not process:" is
        returned. This function does not raise for ordinary failures.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        # Treat HTTP 4xx/5xx as failures instead of summarizing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'footer', 'iframe']):
            element.decompose()

        # Get main content: first 10 paragraphs only
        paragraphs = soup.find_all('p')[:10]
        text = clean_text(' '.join(p.get_text() for p in paragraphs))
    except Exception as e:
        return f"Could not process: {str(e)[:100]}"

    if not text:
        return "Could not process: no paragraph text found"

    # Summarization is best-effort: missing tokenizer data or a degenerate
    # input must not lose the page, so any failure falls through to the
    # truncated-text fallback below.
    try:
        parser = PlaintextParser.from_string(text, Tokenizer("english"))
        summarizer = LsaSummarizer()
        sentences = summarizer(parser.document, sentences_count=2)
        summary = ' '.join(str(s) for s in sentences)
    except Exception:
        summary = ""

    if summary:
        return summary

    # Fallback: leading text; add an ellipsis only when something was cut off.
    if len(text) > 500:
        return text[:500] + "..."
    return text

# @title Step 4: Run the Summarizer
# Gather the article links first, then build one result row per link.
print("🔍 Fetching AgTech articles...")
links = get_agtech_links()

print("\n📝 Generating summaries...")
results = []
for i, url in enumerate(links, start=1):
    # Progress line shows at most the first 60 characters of the URL.
    print(f"  {i}/{len(links)} Processing: {url[:60]}...")
    raw_summary = summarize_url(url)
    row = {'URL': url}
    row['Summary'] = clean_text(raw_summary)
    row['Date'] = datetime.now().strftime('%Y-%m-%d')
    results.append(row)

# @title Step 5: View and Download Results
# Turn the collected rows into a table and preview the first three.
df = pd.DataFrame(results)
print("\n✅ Done! First 3 results:")
preview = df.head(3)
display(preview)

# Write the table to a timestamped Excel workbook, then hand it to the browser.
timestamp = datetime.now().strftime('%Y%m%d_%H%M')
filename = f"AgTech_Summaries_{timestamp}.xlsx"
df.to_excel(filename, index=False)
print(f"\n💾 Downloading {filename}...")
files.download(filename)

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for google-search-results (setup.py) ... [?25l[?25hdone
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
🔍 Fetching AgTech articles...

📝 Generating summaries...
  1/5 Processing: https://igrownews.com/...
  2/5 Processing: https://www.numberanalytics.com/blog/precision-farming-moder...
  3/5 Processing: https://agtechnews.com/Precision-Ag-News/...
  4/5 Processing: https://www.gpsworld.com/prec

Unnamed: 0,URL,Summary,Date
0,https://igrownews.com/,From groundbreaking advancements in precision ...,2025-04-04
1,https://www.numberanalytics.com/blog/precision...,Easy statistical analysis tool Learn more Clos...,2025-04-04
2,https://agtechnews.com/Precision-Ag-News/,View more information ...,2025-04-04



💾 Downloading AgTech_Summaries_20250404_0816.xlsx...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>