# In [7]:
import re
import unicodedata

import pandas as pd
import requests
from bs4 import BeautifulSoup

def fetch_webpage(url, timeout=30):
    """Download *url* and return the raw response body.

    A desktop-browser User-Agent header is sent with the request.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` waits indefinitely on a stalled
            connection, which would hang a whole batch run.

    Returns:
        The response body as ``bytes`` on success, or the empty string
        ``''`` when the request fails (the failure is printed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses
    except requests.exceptions.RequestException as e:
        # RequestException is the common base of HTTPError, ConnectionError
        # and Timeout, so every network-level failure gets the same
        # "report and return empty" treatment.
        print(f"Failed to fetch {url}: {e}")
        return ''
    return response.content

def extract_text_from_html(html_content):
    """Return the visible text of the page's ``entry-content`` element.

    The markup is parsed with ``html.parser`` and the first element whose
    class is ``entry-content`` is located. ``<script>`` and ``<style>``
    tags inside it are removed, and the remaining text fragments are
    stripped and joined with single spaces.

    Returns an empty string when no such element exists.
    """
    container = BeautifulSoup(html_content, 'html.parser').find(class_='entry-content')
    if not container:
        return ''

    # Drop non-visible content before collecting text.
    for unwanted in container.find_all(['script', 'style']):
        unwanted.decompose()

    fragments = container.stripped_strings
    return ' '.join(fragments)

def count_words(text):
    """Return the number of words in *text*.

    A word is a maximal run of ``\\w`` characters (letters, digits and
    underscore, Unicode-aware).

    Args:
        text: The string to count words in.

    Returns:
        The word count as an ``int``; ``0`` for an empty string.
    """
    # Compose base letters with their combining marks first. ``\w`` does
    # not match a stand-alone combining mark, so decomposed text such as
    # Vietnamese "e" + dot-below + circumflex would be cut into several
    # "words" at every accent.
    composed = unicodedata.normalize('NFC', text)
    # No case folding: it cannot change how many words there are, and
    # str.lower() can itself emit combining marks (e.g. for U+0130),
    # which would split a word.
    return len(re.findall(r'\w+', composed))

def process_urls(urls):
    """Fetch each URL, count the words in its main content, and report.

    For every entry in *urls* the page is downloaded, its text extracted
    and its words counted; the result is printed and recorded. If any
    step raises, the error is printed and the URL is recorded with a
    count of 0, so one bad page does not abort the batch.

    Returns a list of ``(url, word_count)`` tuples.
    """
    results = []
    for link in urls:
        try:
            total = count_words(extract_text_from_html(fetch_webpage(link)))
            results.append((link, total))
            print(f"{link}: {total}")
        except Exception as err:
            print(f"Failed to process {link}: {err}")
            results.append((link, 0))
    return results

if __name__ == "__main__":
    # Pages whose word counts are collected and exported.
    urls = [
        "https://phamconsult.com/thoi-han-ke-khai-thue-tncn-tu-dau-tu-von-nam-2023/?lang=vi",
        "https://phamconsult.com/gioi-thieu-cong-ty/?lang=vi",
        "https://phamconsult.com/outsourced-bookkeeping/",
        "https://phamconsult.com/?lang=vi",
        "https://phamconsult.com/tax-agent/",
        "https://phamconsult.com/quy-dinh-thue-gtgt-trong-xay-dung/?lang=vi",
        "https://phamconsult.com/payroll-service/",
        "https://phamconsult.com/quyet-toan-thue-tncn-cho-nnn/?lang=vi",
        "https://phamconsult.com/tuyen-dung/?lang=vi",
        "https://phamconsult.com/pit-finalization-for-foreigners/",
        "https://phamconsult.com/",
        "https://phamconsult.com/trinh-tu-thu-tuc-dong-cua-van-phong-dai-dien-cua-thuong-nhan-nuoc-ngoai/?lang=vi",
        "https://phamconsult.com/co-can-quyet-toan-lai-so-thue-tncn-da-phan-bo-cho-chi-nhanh-ngoai-tinh/?lang=vi",
        "https://phamconsult.com/quyet-dinh-970-qd-tct-thay-moi-quy-trinh-kiem-tra-thue/?lang=vi",
        "https://phamconsult.com/gia-han-thoi-gian-gop-von-cho-nha-dau-tu-nuoc-ngoai-nhu-the-nao-2/?lang=vi",
        "https://phamconsult.com/work-permits-for-foreigners-2023/",
        "https://phamconsult.com/procedures-for-applying-for-tt-temporary-residence-card-in-vietnam/",
        "https://phamconsult.com/retail-business-in-vietnam-for-foreign-investors/",
        "https://phamconsult.com/nguoi-lao-dong-vi-pham-quy-dinh-ve-thong-bao-truoc-khi-nghi-viec-thi-cong-ty-co-duoc-khau-tru-tien-luong-khong/?lang=vi",
        "https://phamconsult.com/dich-vu/lap-chien-luoc-va-ke-hoach-kinh-doanh/?lang=vi",
        "https://phamconsult.com/lien-he/?lang=vi",
        "https://phamconsult.com/tax-services/",
        "https://phamconsult.com/about-us/",
        "https://phamconsult.com/procedures-for-temporarily-suspending-business-operations/",
        "https://phamconsult.com/jobs/accounting-and-tax-manager/",
        "https://phamconsult.com/khoan-dong-bao-hiem-khong-duoc-coi-la-khoan-giam-tru-khi-xac-dinh-thu-nhap-tinh-thue-doi-voi-tien-luong-tien-cong/?lang=vi",
        "https://phamconsult.com/dich-vu/dich-vu-ke-toan/?lang=vi",
        "https://phamconsult.com/accounting-services/",
        "https://phamconsult.com/kiem-ke-la-gi-don-vi-ke-toan-phai-kiem-ke-tai-san-trong-truong-hop-nao/?lang=vi",
        "https://phamconsult.com/procedure-for-amendment-chief-of-representative-office-of-foreign-traders/",
        "https://phamconsult.com/does-performance-bonus-include-compulsory-social-insurance-and-personal-income-tax/",
        "https://phamconsult.com/services/new-procedures-for-opening-representative-office/",
        "https://phamconsult.com/services/tax-consulting-services/",
        "https://phamconsult.com/some-notes-when-finalizing-personal-income-tax-in-2022-2/",
        "https://phamconsult.com/the-minimum-storage-term-of-accounting-vouchers-is-in-accordance-with-current-law/",
        "https://phamconsult.com/wp-content/uploads/2018/12/1-Tax-Alert-the-changes-on-mandatory-social-insurance.pdf",
        "https://phamconsult.com/training-services/?lang=vi",
        "https://phamconsult.com/dich-vu/nhung-diem-dang-luu-y-khi-dong-cua-van-phong-dai-dien-cua-thuong-nhan-nuoc-ngoai/?lang=vi",
        "https://phamconsult.com/thu-tuc-xin-cap-the-tam-tru-cho-nha-dau-tu-nuoc-ngoai-tai-viet-nam/?lang=vi",
        "https://phamconsult.com/jobs/nhan-vien-ho-tro-tuyen-dung-va-hanh-chinh/",
        "https://phamconsult.com/jobs/nhan-vien-thuc-tap-vien-cb/?lang=vi",
        "https://phamconsult.com/huy-chung-tu-ke-toan-dien-tu-la-gi-viec-huy-chung-tu-ke-toan-dien-tu-tai-ngan-hang-nha-nuoc-duoc-thuc-hien-nhu-the-nao/?lang=vi",
        "https://phamconsult.com/doanh-nghiep-nop-bao-cao-tai-chinh-nam-sau-ngay-31-thang-12-cua-nam-duong-lich-trong-thoi-han-bao-lau-thi-khong-bi-phat/?lang=vi",
        "https://phamconsult.com/mot-so-luu-y-khi-quyet-toan-tndn-nam-2022/?lang=vi",
        "https://phamconsult.com/how-much-is-the-fee-for-applying-for-registration-of-industrial-property-rights-from-january-1-2024-according-to-the-regulations/",
        "https://phamconsult.com/what-are-the-steps-involved-in-the-audit-process-of-corporate-financial-statements/",
        "https://phamconsult.com/thong-bao-thay-doi-dia-chi-van-phong/?lang=vi",
        "https://phamconsult.com/forms-conditions-and-procedures-for-investment-in-the-form-of-capital-contribution-share-purchase-purchase-of-capital-contribution-from-foreign-investors/",
        "https://phamconsult.com/accounting-firms/",
        "https://phamconsult.com/contact-us/?lang=vi",
        "https://phamconsult.com/dich-vu/tu-van-tai-cau-truc-doanh-nghiep/?lang=vi",
        "https://phamconsult.com/wp-content/uploads/2018/12/Form-chuyen-tien_Mat-sau_New.pdf",
        "https://phamconsult.com/nguoi-duoc-cap-chung-chi-hanh-nghe-dich-vu-lam-thu-tuc-ve-thue-khong-bat-buoc-phai-cap-nhat-kien-thuc-trong-truong-hop-nao/?lang=vi",
        "https://phamconsult.com/quy-trinh-quan-ly-rui-ro-ve-rua-tien-va-phan-loai-khach-hang-theo-muc-do-rui-ro-ve-rua-tien/?lang=vi",
        "https://phamconsult.com/jobs/tro-ly-phap-ly-legal-assistant/",
        "https://phamconsult.com/how-to-do-when-foreign-workers-lost-work-permits/",
        "https://phamconsult.com/jobs/hr-admin-trainee/",
        "https://phamconsult.com/thu-tuc-tam-ngung-hoat-dong-doanh-nghiep/?lang=vi",
        "https://phamconsult.com/muc-thue-suat-thue-tndn-2023/?lang=vi",
        "https://phamconsult.com/procedures-for-applying-for-temporary-residence-card-for-foreign-investors-in-vietnam/",
        "https://phamconsult.com/personal-income-tax-refund-cases/",
        "https://phamconsult.com/work-permit-exemption/",
        "https://phamconsult.com/jobs/senior-legal-consultant-legal-counsel/",
        "https://phamconsult.com/procedures-for-extending-work-permits-for-foreigners-in-2024/",
        "https://phamconsult.com/thong-diep-va-cam-ket/?lang=vi",
        "https://phamconsult.com/job-category/payroll-intern/",
        "https://phamconsult.com/jobs/tax-consultant-senior/?lang=vi",
        "https://phamconsult.com/cong-ty-chung-khoan-moi-thanh-lap-thi-phai-mo-so-ke-toan-vao-thoi-diem-nao/?lang=vi",
        "https://phamconsult.com/services/the-best-guidance-for-ma-procedures/",
        "https://phamconsult.com/what-is-continuous-order-matching-on-what-principles-does-the-stock-exchange-organize-stock-trading-according-to-the-centralized-order-matching-method/",
        "https://phamconsult.com/jobs/legal-assistant/?lang=vi",
        "https://phamconsult.com/decree-11-2024-nd-cd-regulations-on-corporate-income-tax-exemptions-and-personal-income-tax-exemptions-in-hcmc-2/",
        "https://phamconsult.com/khi-don-vi-ke-toan-thay-doi-nguoi-lam-ke-toan-thi-nguoi-nay-co-phai-chiu-trach-nhiem-voi-don-vi-nua-khong/?lang=vi",
        "https://phamconsult.com/form_chuyen-tien-quoc-te-vietcombank/",
        "https://phamconsult.com/converting-loan-capital-into-contributed-capital/",
        "https://phamconsult.com/jobs/pham-consult-accounting-internship/",
        "https://phamconsult.com/ca-nhan-cho-thue-tai-san-khong-phai-nop-thue-thu-nhap-ca-nhan-thue-gia-tri-gia-tang-trong-truong-hop-nao/?lang=vi",
        "https://phamconsult.com/tin-tuc/?lang=vi",
        "https://phamconsult.com/truong-hop-hoan-thue-tncn/?lang=vi",
        "https://phamconsult.com/procedures-for-applying-for-an-investment-visa-for-foreign-investors-in-vietnam/",
        "https://phamconsult.com/how-to-extend-capital-contribution-time-for-foreign-investors-2/",
        "https://phamconsult.com/contact-us/",
        "https://phamconsult.com/decision-970-qd-tct-replaces-tax-inspection-process/",
        "https://phamconsult.com/job-opportunity/",
        "https://phamconsult.com/jobs/business-development-executive/?lang=vi",
        "https://phamconsult.com/are-business-required-to-bonus-women-employees-on-vietnamese-womens-day-october-20/",
        "https://phamconsult.com/jobs/payroll-specialist/?lang=vi",
        "https://phamconsult.com/services/highlight-factors-for-closing-representative-offices-of-foreign-investors/",
        "https://phamconsult.com/increase-the-contribution-capital-of-two-members-limited-liability-company-with-foreign-investment-capital/",
        "https://phamconsult.com/cac-diem-moi-khi-xin-giay-phep-lao-dong-cho-lao-dong-nuoc-ngoai/?lang=vi",
        "https://phamconsult.com/tien-thuong-hieu-qua-cong-viec-co-tinh-dong-bao-hiem-xa-hoi-bat-buoc-va-thue-thu-nhap-ca-nhan-hay-khong/?lang=vi",
        "https://phamconsult.com/some-notes-when-finalizing-corporate-income-in-2022/",
        "https://phamconsult.com/regulations-on-vat-of-construction/",
        "https://phamconsult.com/services/professional-business-consulting-advisory-services/",
        "https://phamconsult.com/doanh-nghiep-khong-du-dieu-kien-kinh-doanh-dich-vu-kiem-toan-thi-khong-duoc-su-dung-cum-tu-kiem-toan-trong-ten-goi-khong/?lang=vi",
        "https://phamconsult.com/services/corporate-finance/",
        "https://phamconsult.com/document-application-for-identification-of-foreign-workers-working-in-vietnam-not-subject-to-work-permit/",
        "https://phamconsult.com/jobs/pham-consult-auditing-internship/",
        "https://phamconsult.com/forms-of-foreign-investors-into-vietnam/",
        "https://phamconsult.com/state-bank-requirements-for-reducing-loans-interest-from-1-5-2-year/",
    ]

    # The counts must be computed here: with this call commented out,
    # `word_counts` is undefined and the lines below raise NameError
    # whenever the file is run as a script.
    word_counts = process_urls(urls)

    # process_urls records one entry per URL, including failures.
    counted_urls = len(word_counts)
    print(f"Number of URLs counted: {counted_urls}")

    # Export to Excel
    df = pd.DataFrame(word_counts, columns=['URL', 'Word Count'])
    df.to_excel('word_counts.xlsx', index=False)
    print("Word counts have been exported to word_counts.xlsx")


# Recorded notebook output:
# Number of URLs counted: 100
# Word counts have been exported to word_counts.xlsx
