# Data Scraping:

In [10]:
import requests
from bs4 import BeautifulSoup
import time
import random
from urllib.parse import urljoin

In [5]:
# main index page of Bharathiyar Poems on TVA
index_url = "https://www.tamilvu.org/library/l9100/html/l9100ba1.htm"
base_url = "https://www.tamilvu.org/library/l9100/html/"

In [16]:
def scrape_bharathi_links():
    print("Connecting to Tamil Virtual Academy...")
    response = requests.get(index_url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    poem_links = []
    
    # We look for all links that contain 'l9100pd1.jsp'
    # These are the actual poem pages
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        
        if 'l9100pd1.jsp' in href:
            # These links start with /slet/, so we join them to the domain root
            full_url = urljoin(base_url, href)
            if full_url not in poem_links:
                poem_links.append(full_url)
    return poem_links

poem_links = scrape_bharathi_links()
print(f"Found {len(poem_links)} poem links.")
print("Sample links:", poem_links[:15] if poem_links else "No links found")

Connecting to Tamil Virtual Academy...
Found 331 poem links.
Sample links: ['https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=1', 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=2', 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=3', 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=4', 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=5', 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=6', 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=7', 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=8', 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=9', 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=10', 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=11', 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=12', 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=13', 'https://www.tamilvu.org/slet/l9100/

In [17]:
def scrape_bharathi_poems(poem_links):
    poems_data = []
    for idx, link in enumerate(poem_links):
        print(f"Scraping poem {idx+1}/{len(poem_links)}: {link}")
        try:
            response = requests.get(link)
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract the poem title
            title_tag = soup.find('h2')
            title = title_tag.get_text(strip=True) if title_tag else "Untitled"

            # Extract the poem content
            content_div = soup.find('div', class_='poem-content')
            if not content_div:
                content_div = soup.find('div', class_='poem')  # fallback

            content = content_div.get_text(separator='\n', strip=True) if content_div else "No content found"

            poems_data.append({
                'title': title,
                'content': content,
                'url': link
            })
        except Exception as e:
            print(f"Error scraping {link}: {e}")
        
        # Be polite and avoid overwhelming the server
        time.sleep(random.uniform(0.5, 1.5))
    
    return poems_data

In [19]:
# 1. Get the data using your modular function
all_poems = scrape_bharathi_poems(poem_links)

# 2. Open the file to write the training data
with open('input.txt', 'w', encoding='utf-8') as f:
    for poem in all_poems:
        # We only want to write if there is actual content
        if poem['content'] != "No content found":
            # Optional: Write the title as a header for the model to see
            f.write(f"{poem['title']}\n") 
            f.write(poem['content'])
            # Add extra newlines to separate poems clearly
            f.write("\n\n")

print(f"File 'input.txt' created successfully with {len(all_poems)} poems!")

Scraping poem 1/331: https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=1
Scraping poem 2/331: https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=2
Scraping poem 3/331: https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=3
Scraping poem 4/331: https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=4
Scraping poem 5/331: https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=5
Scraping poem 6/331: https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=6
Scraping poem 7/331: https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=7
Scraping poem 8/331: https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=8
Scraping poem 9/331: https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=9
Scraping poem 10/331: https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=10
Scraping poem 11/331: https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=11
Scraping poem 12/331: https://www.tamilvu.org/slet

In [20]:
all_poems[:2]  # Show the first 2 poems as a sample

[{'title': 'Untitled',
  'content': 'No content found',
  'url': 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=1'},
 {'title': 'Untitled',
  'content': 'No content found',
  'url': 'https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=2'}]

In [22]:
test_link = "https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=2"

def test_single_scrape(url):
    print(f"Testing Scrape on: {url}\n" + "-"*30)
    
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # 1. Test Title Extraction
    title_tag = soup.find('font', color="#990000")
    title = title_tag.get_text(strip=True) if title_tag else "Title Not Found"
    print(f"TITLE FOUND: {title}")

    # 2. Test Content Extraction
    print("\nCONTENT FOUND:")
    poem_lines = []
    
    # We look for the font tags used for poem text
    for font_tag in soup.find_all('font', face="GIST-TMOTChanakya"):
        text = font_tag.get_text(separator='\n', strip=True)
        
        # Skip the title if it repeats in the font tags
        if title in text:
            continue
            
        # Skip purely numeric verse numbers (like '1', '2', '3')
        if text.isdigit():
            continue
            
        poem_lines.append(text)
        print(f"\n{text}")

    if not poem_lines:
        print("FAILED: No poem lines extracted. Check the 'face' attribute in HTML.")

# Run the test
test_single_scrape(test_link)

Testing Scrape on: https://www.tamilvu.org/slet/l9100/l9100pd1.jsp?bookid=145&pno=2
------------------------------
TITLE FOUND: பாமாலை 
          : பக்தி பாடல்கள்

CONTENT FOUND:

தோத்திரப் பாடல்கள்
ஆறு துணை

ஓம்சக்தி ஓம்சக்தி ஓம் 
      -- பராசக்தி
ஓம்சக்தி ஓம்சக்தி ஓம்.
ஓம்சக்தி ஓம்சக்தி ஓம்சக்தி -- ஓம்சக்தி
ஓம்சக்தி ஓம்சக்தி ஓம்.

கணபதி ராயன் -- அவனிரு
காலைப் பிடித் திடுவோம்
குணமுயர்ந் திடவே -- விடுதலை
கூடி மகிழ்ந் திடவே

(
ஓம்சக்தி 
      ஓம்சக்தி ஓம்
)

சொல்லுக் கடங்காவே -- பராசக்தி
சூரத் தனங்க ளெல்லாம்;
வல்லமை தந்திடுவாள் -- பராசக்தி
வாழி யென்றே துதிப்போம்.

(
ஓம்சக்தி 
      ஓம்சக்தி ஓம்
)

வெற்றி வடிவேலன் -- அவனுடை
வீரத்தினைப் புகழ்வோம்;
சுற்றிநில் லாதேபோ! -- பகையே!
துள்ளி வருகுதுவேல்.

(
ஓம்சக்தி 
      ஓம்சக்தி ஓம்
)

தாமரைப் பூவினிலே -- சுருதியைத்
தனியிருந் துரைப்பாள்
பூமணித் தாளினையே -- கண்ணிலொற்றிப்
புண்ணிய மெய்திடுவோம்.

(
ஓம்சக்தி 
      ஓம்சக்தி ஓம்
)

பாம்புத் தலைமேலே -- நடஞ் செயும்
பாதத்தினைப் புகழ் வோம்
மாம்பழ வாயினிலே -- குழலிடஞ
வண்மை புகழ்ந்திடுவோம்.

(
ஓம்சக்தி 
 