In [1]:
!pip -q install requests beautifulsoup4 lxml pandas dateparser tqdm


[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import re, time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import dateparser

BASE = "https://www.123telugu.com"
HEADERS = {"User-Agent":"Mozilla/5.0 (compatible; Latest123TeluguScraper/1.0)"}
TIMEOUT = 20

def soup(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``HEADERS`` and ``TIMEOUT``.
    ``raise_for_status()`` is called on the response, so HTTP error statuses
    surface as exceptions instead of being parsed as articles.
    """
    response = requests.get(url, timeout=TIMEOUT, headers=HEADERS)
    response.raise_for_status()
    markup = response.text
    return BeautifulSoup(markup, "lxml")

def clean(s):
    """Collapse every run of whitespace in *s* to one space and trim the ends.

    Falsy input (``None`` or ``""``) yields an empty string.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def parse_date(text):
    """Parse a date string with ``dateparser`` and return it in ISO format.

    Returns ``None`` when *text* is empty/``None`` or when ``dateparser``
    cannot interpret it.
    """
    if not text:
        return None
    parsed = dateparser.parse(text)
    if parsed:
        return parsed.isoformat()
    return None

def discover_latest_links(max_n=15):
    """Collect up to *max_n* unique article URLs from the site's listing pages.

    The homepage and the ``/mnews`` listing are fetched in that order and
    every ``<a href>`` is inspected in document order.  A link is kept when
    it lives on this site (``BASE`` followed by ``/``), ends with ``.html``
    and contains one of ``/news/``, ``/mnews/`` or ``/reviews/``.

    Args:
        max_n: Maximum number of URLs to return.  Values ``<= 0`` return an
            empty list without making any request.

    Returns:
        A list of absolute URLs in first-seen order, without duplicates.
        A seed page that fails to load is reported with a ``[warn]`` line
        and skipped; it never aborts discovery.
    """
    if max_n <= 0:
        # Without this guard the in-loop limit check would only fire after
        # the first link had already been appended.
        return []

    seeds = [BASE, urljoin(BASE, "/mnews")]
    # Require the "/" after the host so look-alike hosts that merely share
    # the BASE prefix (e.g. "<BASE>.example.org") are not treated as on-site.
    site_prefix = BASE + "/"
    sections = ("/news/", "/mnews/", "/reviews/")

    seen, ordered = set(), []
    for index, seed in enumerate(seeds):
        if index:
            time.sleep(0.3)  # be polite between listing-page requests
        try:
            sp = soup(seed)
        except Exception as e:
            # Best effort: one unreachable listing page should not stop us
            # from using the others.
            print(f"[warn] {seed}: {e}")
            continue

        # typical article links end with .html and live under /news/ or /mnews/ or /reviews/
        for a in sp.select("a[href]"):
            href = a.get("href", "").strip()
            if href.startswith("/"):
                href = urljoin(BASE, href)
            if href in seen:
                continue
            is_article = (
                href.startswith(site_prefix)
                and href.endswith(".html")
                and any(section in href for section in sections)
            )
            if not is_article:
                continue
            seen.add(href)
            ordered.append(href)
            if len(ordered) >= max_n:
                return ordered
    return ordered

def _attr_of(tag, name):
    """Return attribute *name* of *tag*, or None when *tag* was not found."""
    return tag.get(name) if tag is not None else None


def _text_of(tag):
    """Return the text content of *tag*, or None when *tag* was not found."""
    return tag.get_text() if tag is not None else None


def parse_article(url):
    """Fetch one article page and extract its main fields.

    Each field is resolved through a chain of fallbacks; a missing element
    simply moves on to the next fallback instead of raising.

    Args:
        url: Absolute URL of the article page.

    Returns:
        A dict with the keys ``title``, ``url``, ``published`` (ISO string
        or ``None``), ``author`` and ``content_preview`` (at most 400
        characters of body text, followed by an ellipsis when truncated).

    Raises:
        Whatever ``soup(url)`` raises when the page cannot be fetched.
    """
    sp = soup(url)

    # Title: Open Graph -> Twitter card -> first <h1> -> <title>.
    title = (
        _attr_of(sp.find("meta", property="og:title"), "content")
        or _attr_of(sp.find("meta", attrs={"name": "twitter:title"}), "content")
        or _text_of(sp.find("h1"))
        or (sp.title.string if sp.title else None)
    )
    title = clean(title)

    # Publication date: article meta -> <time datetime=...> -> <time> text.
    time_tag = sp.find("time")
    published = (
        _attr_of(sp.find("meta", property="article:published_time"), "content")
        or _attr_of(time_tag, "datetime")
        or _text_of(time_tag)
    )
    # clean() maps None to "", which parse_date() turns back into None.
    published = parse_date(clean(published))

    # Author: meta tag -> rel="author" link -> any element whose class
    # mentions "author".
    author = (
        _attr_of(sp.find("meta", attrs={"name": "author"}), "content")
        or _text_of(sp.find("a", rel="author"))
        or _text_of(sp.find(class_=re.compile("author", re.I)))
    )
    author = clean(author)

    # content blocks (fallback-friendly), most specific selectors first
    candidates = [
        "article .entry-content p",
        ".td-post-content p",
        ".post-content p",
        ".entry-content p",
        "article p",
        "div[itemprop='articleBody'] p",
    ]
    nodes = []
    for sel in candidates:
        found = sp.select(sel)
        # Remember the best match so far so that a short article (fewer
        # than 3 paragraphs) is not thrown away by a later empty selector.
        if len(found) > len(nodes):
            nodes = found
        if len(nodes) >= 3:
            break
    if not nodes:
        # Last resort: every paragraph on the page.
        nodes = sp.select("p")

    paras = []
    for p in nodes:
        txt = clean(p.get_text(" "))
        # Skip short fragments and copyright footers.
        if len(txt) >= 30 and "copyright" not in txt.lower():
            paras.append(txt)
    content = "\n\n".join(paras).strip()

    return {
        "title": title,
        "url": url,
        "published": published,
        "author": author,
        "content_preview": (content[:400] + "…") if len(content) > 400 else content,
    }

# ---- RUN ----
# Discover the newest article links, then parse and print each one.
# Successfully parsed articles are collected in ``articles``.
links = discover_latest_links(max_n=15)
articles = []

for i, link in enumerate(links, start=1):
    try:
        art = parse_article(link)
        articles.append(art)
        # One report per article: header, URL, metadata, rule, preview, rule.
        print(
            f"\n#{i}. {art['title']}",
            f"URL: {art['url']}",
            f"Published: {art['published'] or 'N/A'} | Author: {art['author'] or 'N/A'}",
            "-" * 80,
            art["content_preview"] or "[No content found]",
            "=" * 80,
            sep="\n",
        )
    except Exception as e:
        # A single bad article should not stop the remaining ones.
        print(f"[warn] failed: {link} -> {e}")
    time.sleep(0.8)  # polite delay



#1. Kantara Chapter 1’s Hindi version achieves a crucial milestone
URL: https://www.123telugu.com/mnews/kantara-chapter-1s-hindi-version-achieves-a-crucial-milestone-prp.html
Published: 2025-10-14T18:33:11+00:00 | Author: 123telugu
--------------------------------------------------------------------------------
Rishab Shetty’s Kantara: Chapter 1 is continuing its strong momentum at the box office. In its second weekend, the action drama witnessed a solid jump in revenues, proving the massive acceptance among the audience.

While the Kannada version is storming the ticket windows, the Hindi version is also holding well. The latest update reveals that Kantara: Chapter 1 has entered the Rs. 150 crore nett c…

#2. I want to make a love story with Mahesh Babu says sensational young director
URL: https://www.123telugu.com/mnews/i-want-to-make-a-love-story-with-mahesh-babu-says-sensational-young-director-avd.html
Published: 2025-10-14T19:30:23+00:00 | Author: 123telugu
----------------------