In [None]:
import calendar

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrape the daily archive pages of the Medium publication "swlh" for the
# first three months of SCRAPE_YEAR and store one record per story preview.
SCRAPE_YEAR = 2019
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request blocks forever

# Column order of the output CSV. The two trailing columns were previously
# computed and then dropped; they are appended last so that the positions of
# the original six columns are unchanged.
STORY_COLUMNS = [
    'date', 'title', 'subtitle', 'claps', 'responses', 'story_url',
    'author_url', 'reading_time',
]

stories_data = []

# Loop through each month and day
for month in range(1, 4):
    # monthrange returns (weekday of the 1st, number of days) and is leap-year aware.
    n_days = calendar.monthrange(SCRAPE_YEAR, month)[1]

    for day in range(1, n_days + 1):
        # Format month and day to two digits
        month_str = f'{month:02}'
        day_str = f'{day:02}'

        date = f'{month_str}/{day_str}/{SCRAPE_YEAR}'
        url = f'https://medium.com/swlh/archive/{SCRAPE_YEAR}/{month_str}/{day_str}'

        # Request the page. A failure on one day is reported and skipped so it
        # does not discard the records already collected for the other days.
        try:
            page = requests.get(url, timeout=REQUEST_TIMEOUT)
            page.raise_for_status()
        except requests.RequestException as exc:
            print(f'Skipping {date}: {exc}')
            continue

        soup = BeautifulSoup(page.text, 'html.parser')

        # Find all stories
        stories = soup.find_all('div', class_='streamItem streamItem--postPreview js-streamItem')

        for story in stories:
            # Author box holds both the author link and the reading-time span.
            author_box = story.find('div', class_='postMetaInline u-floatLeft u-sm-maxWidthFullWidth')
            author_link = author_box.find('a') if author_box is not None else None
            reading_span = (
                author_box.find('span', class_='readingTime')
                if author_box is not None else None
            )

            # .get() avoids a crash when the tag exists but lacks the attribute.
            author_url = author_link.get('href', 'N/A') if author_link is not None else 'N/A'

            # Keep only the first token of the reading-time title attribute.
            reading_title = reading_span.get('title') if reading_span is not None else None
            reading_parts = reading_title.split() if reading_title else []
            reading_time = reading_parts[0] if reading_parts else 'N/A'

            # Get title and subtitle (look each tag up once)
            title_tag = story.find('h3')
            subtitle_tag = story.find('h4')
            title = title_tag.text if title_tag is not None else '-'
            subtitle = subtitle_tag.text if subtitle_tag is not None else '-'

            # Get claps
            claps_button = story.find('button', class_='button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents')
            claps = claps_button.text if claps_button is not None else '0'

            # Get responses: keep the first token of the link text, '0' when
            # the link is missing or its text is empty.
            responses_button = story.find('a', class_='button button--chromeless u-baseColor--buttonNormal')
            responses_parts = responses_button.text.split() if responses_button is not None else []
            responses = responses_parts[0] if responses_parts else '0'

            # Get story URL
            story_element = story.find('a', class_='button button--smaller button--chromeless u-baseColor--buttonNormal')
            story_url = story_element.get('href', 'N/A') if story_element is not None else 'N/A'

            # Append the story data to the list
            stories_data.append({
                'date': date,
                'title': title,
                'subtitle': subtitle,
                'claps': claps,
                'responses': responses,
                'story_url': story_url,
                'author_url': author_url,
                'reading_time': reading_time,
            })

# Convert the list of stories to a DataFrame; explicit columns keep the CSV
# header intact even when no story was collected.
stories_df = pd.DataFrame(stories_data, columns=STORY_COLUMNS)

# Save to a CSV file
stories_df.to_csv('medium_stories_2019.csv', index=False)

print("Scraping completed. Data saved to medium_stories_2019.csv.")

Scraping completed. Data saved to medium_stories_2019.csv.


In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

# Download one Project Madurai page, keep only its Tamil-script text and
# save it to tamil.csv with one row per source line that contains Tamil.
url = 'https://www.projectmadurai.org/pm_etexts/utf8/pmuni0001.html'

# Send a GET request; the timeout keeps the cell from hanging on a stalled connection
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract text from the webpage
    text = soup.get_text()

    # Runs of characters from the Unicode Tamil block (U+0B80..U+0BFF)
    tamil_pattern = re.compile(r'[\u0B80-\u0BFF]+')

    # Extract the Tamil runs line by line. Joining every match first and then
    # calling splitlines() cannot work: the joined string contains no line
    # breaks, so it always produced a single row.
    tamil_text_list = [
        ' '.join(runs)
        for runs in (tamil_pattern.findall(line) for line in text.splitlines())
        if runs
    ]

    # Same space-separated string as before, kept for the printed output
    tamil_text = ' '.join(tamil_text_list)

    # Print the extracted Tamil text
    print(tamil_text)

    # One DataFrame row per line of Tamil text
    stories_df = pd.DataFrame(tamil_text_list, columns=['Tamil Text'])

    # Save the DataFrame to a CSV file
    stories_df.to_csv('tamil.csv', index=False)

    print("Scraping completed. Data saved to tamil.csv.")
else:
    print(f'Failed to retrieve data: {response.status_code}')

Scraping completed. Data saved to tamil.csv.


In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

# Page to scrape: a Project Madurai e-text served as UTF-8 HTML
url = 'https://www.projectmadurai.org/pm_etexts/utf8/pmuni0001.html'

# Fetch the page
response = requests.get(url)

if response.status_code != 200:
    # Anything other than 200 is reported and nothing is written
    print(f'Failed to retrieve data: {response.status_code}')
else:
    # Reduce the HTML document to its plain text
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()

    # Matches runs of characters in the Unicode Tamil block
    tamil_pattern = re.compile(r'[\u0B80-\u0BFF]+')

    # One entry per source line that contains at least one Tamil run;
    # the runs found on a line are joined with single spaces.
    tamil_lines = []
    for line in text.splitlines():
        tamil_letters = tamil_pattern.findall(line)
        if not tamil_letters:
            continue  # line has no Tamil characters
        tamil_lines.append(' '.join(tamil_letters))

    # Single-column table, written without the index column
    stories_df = pd.DataFrame({'Tamil Text': tamil_lines})
    stories_df.to_csv('tamil_lines.csv', index=False)

    print("Scraping completed. Data saved to tamil_lines.csv.")

[]