In [1]:
import csv
import requests
from bs4 import BeautifulSoup

# Function to scrape webpage content
def scrape_webpage(url, timeout=30):
    """Fetch a drug page and return its sections as a dict keyed by column name.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the run indefinitely.

    Returns:
        A dict with the keys 'DRUG NAME', 'INTRODUCTION', 'USES', ... ,
        'AUTHOR DETAILS'. Every value is the stripped text of the matching
        page element, except 'FAQs', which is the dict built by
        ``extract_faq``. ``None`` is returned (after printing a message) when
        the response status is not 200, when the drug-name element is
        missing, or when any exception is raised while fetching or parsing.
    """
    # Column name -> id of the page element holding that section, in the
    # order the columns are emitted.
    section_ids = (
        ('INTRODUCTION', 'np_tab1'),
        ('USES', 'np_tab3'),
        ('HOW IT WORKS', 'np_tab5'),
        ('DIRECTIONS FOR USE', 'np_tab6'),
        ('SIDE EFFECTS', 'np_tab7'),
        ('HOW TO MANAGE SIDE EFFECTS', 'np_tab8'),
        ('WARNING & PRECAUTIONS', 'np_tab9'),
        ('INTERACTIONS', 'np_tab11'),
        ('SYNOPSIS', 'np_tab12'),
        ('MORE INFORMATION', 'np_tab13'),
        ('FAQs', 'np_tab14'),
        ('REFERENCES', 'np_tab15'),
        ('USEFUL DIAGNOSTIC TESTS', 'np_tab17'),
        ('AUTHOR DETAILS', 'np_tab18'),
    )
    try:
        # Without a timeout requests waits forever on an unresponsive host.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {url}. Status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # The drug name is the one mandatory element; report its absence
        # explicitly instead of failing with an opaque AttributeError.
        name_tag = soup.find(class_='prodName')
        if name_tag is None:
            print(f"Error scraping {url}: no element with class 'prodName' found")
            return None

        record = {'DRUG NAME': name_tag.text.strip()}
        for column, section_id in section_ids:
            # FAQs are parsed into question/answer pairs; everything else is
            # taken as plain text.
            extractor = extract_faq if column == 'FAQs' else extract_section
            record[column] = extractor(soup, section_id)
        return record
    except Exception as e:
        # Deliberate best-effort boundary: one bad URL must not abort the
        # whole batch, so the error is reported and the caller gets None.
        print(f"Error scraping {url}: {e}")
        return None

# Function to extract section from webpage
def extract_section(soup, section_id):
    """Return the stripped text of the element whose id is ``section_id``.

    An empty string is returned when ``soup.find`` yields no usable element
    for that id.
    """
    node = soup.find(id=section_id)
    return node.text.strip() if node else ''

# Function to extract FAQs from webpage
def extract_faq(soup, section_id):
    """Collect question/answer pairs from the FAQ element with id ``section_id``.

    Every ``<p>`` inside the section that contains a ``<strong>`` element is
    treated as a question; the text of its next ``<p>`` sibling is the answer.

    Args:
        soup: Parsed document (or element) to search.
        section_id: Value of the ``id`` attribute of the FAQ container.

    Returns:
        A dict mapping stripped question text to stripped answer text. A
        question with no following ``<p>`` sibling maps to an empty string.
        An empty dict is returned when the section is not found.
    """
    faq_section = soup.find(id=section_id)
    if not faq_section:
        return {}

    faqs = {}
    for faq_item in faq_section.find_all('p'):
        strong = faq_item.find('strong')
        if not strong:
            continue  # not a question paragraph
        answer_tag = faq_item.find_next_sibling('p')
        # A trailing question has no sibling paragraph; keep the question
        # rather than raising AttributeError and losing the whole page.
        answer = answer_tag.text.strip() if answer_tag is not None else ''
        faqs[strong.text.strip()] = answer
    return faqs

# Function to read URLs from CSV file and scrape webpages
def scrape_from_csv(csv_file, output_file):
    """Scrape every URL listed in ``csv_file`` and write the results to ``output_file``.

    The input is read with ``csv.DictReader`` and each URL is taken from its
    ``source2`` column. The output is a UTF-8 CSV with a header row followed
    by one row per input row: the dict returned by ``scrape_webpage`` when it
    is truthy, otherwise a row of empty strings so that output rows stay
    aligned with input rows.
    """
    columns = ['DRUG NAME', 'INTRODUCTION', 'USES', 'HOW IT WORKS', 'DIRECTIONS FOR USE', 'SIDE EFFECTS',
               'HOW TO MANAGE SIDE EFFECTS', 'WARNING & PRECAUTIONS', 'INTERACTIONS', 'SYNOPSIS',
               'MORE INFORMATION', 'FAQs', 'REFERENCES', 'USEFUL DIAGNOSTIC TESTS', 'AUTHOR DETAILS']

    with open(csv_file, 'r', newline='') as source, \
            open(output_file, 'w', newline='', encoding='utf-8') as sink:
        out = csv.DictWriter(sink, fieldnames=columns)
        out.writeheader()

        for record in csv.DictReader(source):
            url = record['source2']
            print(f"Scraping {url}...")
            scraped = scrape_webpage(url)
            # A failed scrape still produces a (blank) row.
            out.writerow(scraped if scraped else dict.fromkeys(columns, ''))

# Example usage
if __name__ == "__main__":
    # Raw string: the Windows path contains backslashes that would otherwise
    # be read as (invalid) escape sequences such as \M and \d.
    input_csv = r'D:\ThinkByte_project\Medibuddy_project\data\med_urls.csv'  # CSV file containing URLs
    output_csv = 'source2_extracted_data.csv'  # Output CSV file to save extracted data
    scrape_from_csv(input_csv, output_csv)
    # Report the file that was actually written.
    print(f"Extraction completed. Data saved to {output_csv}")


Scraping https://www.netmeds.com/prescriptions/nikoran-5mg-tablet-20-s...
Scraping https://www.netmeds.com/prescriptions/benadryl-cough-formula-syrup-450ml...
Scraping https://www.netmeds.com/prescriptions/benadryl-cough-formula-syrup-150ml...
Extraction completed. Data saved to extracted_data.csv


: 