In [None]:
import pandas as pd
import os
import glob
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import re

In [None]:
# Specify the directory containing Excel files
directory = r"C:\Users\Nasip-PC\Desktop\Dosyalar\Kodlar\NLP\Data\Raw"

# Get all Excel files in the directory
xlsx_files = glob.glob(os.path.join(directory, "*.xlsx"))

In [None]:
# Function to fetch webpage text
def fetch_webpage_text(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}  # Some websites block requests without user-agent
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract text from the page
        text = soup.get_text(separator=" ", strip=True)
        return text

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None  # Return None if there was an error
    

def extract_values(text, keys):
    if not isinstance(text, str):  # Ensure it's a string
        return {key: None for key in keys}

    extracted_data = {}

    # Normalize text by removing extra whitespace (optional but helpful)
    text = re.sub(r'\s+', ' ', text)

    for i, key in enumerate(keys):
        next_key = keys[i + 1] if i + 1 < len(keys) else None

        # Create pattern to extract value between key and next_key
        if next_key:
            pattern = rf"{re.escape(key)}\s*(.*?)(?={re.escape(next_key)})"
        else:
            pattern = rf"{re.escape(key)}\s*(.*)"

        match = re.search(pattern, text, re.DOTALL)
        value = match.group(1).strip() if match else None

        extracted_data[key] = value

    return extracted_data


In [None]:
# Process each Excel file
for xlsx_file in tqdm(xlsx_files, desc="Processing Excel Files"):
    try:
        # Read the Excel file
        df = pd.read_excel(xlsx_file)

        # Check if 'WEB_ADDRESS' column exists
        if "WEB_ADDRESS" not in df.columns:
            print(f"Skipping {xlsx_file} (No 'WEB_ADDRESS' column found)")
            continue

        # Enable tqdm for pandas
        tqdm.pandas()

        # Fetch webpage text
        df["PAGE_TEXT"] = df["WEB_ADDRESS"].progress_apply(fetch_webpage_text)

        # Define new file name
        new_file_name = os.path.join(r"C:\Users\Nasip-PC\Desktop\Dosyalar\Kodlar\NLP\Data\Processed",xlsx_file.split("\\")[-1])

        # Save to Excel
        df.to_excel(new_file_name, index=False)
        print(f"Saved processed file: {new_file_name}")

    except Exception as e:
        print(f"Error processing {xlsx_file}: {e}")

print("Processing complete!")


In [56]:
# Get all Excel files in the directory
xlsx_files = glob.glob(os.path.join(r"C:\Users\Nasip-PC\Desktop\Dosyalar\Kodlar\NLP\Data\Processed", "*.xlsx"))
# Define the keys to extract
keys = [
    "Date Initiated by Firm",
    "Date Posted",
    "Recall Status",
    "Recall Number",
    "Recall Event ID",
    "510(K) Number",
    "Product Classification",
    "Product",
    "Code Information",
    "Recalling Firm/Manufacturer",
    "Manufacturer Reason for Recall",
    "FDA Determined Cause",
    "Action",
    "Quantity in Commerce",
    "Distribution",
    "Total Product Life Cycle",
    "Page Last Updated"
]

In [57]:
# Process each Excel file
for xlsx_file in tqdm(xlsx_files, desc="Processing Excel Files"):
    try:
        # Read the Excel file
        df = pd.read_excel(xlsx_file)

        # Apply the extraction function to PAGE_TEXT
        df_extracted = df["PAGE_TEXT"].apply(lambda text: extract_values(text, keys)).apply(pd.Series)

        # Merge extracted columns back
        df = pd.concat([df, df_extracted], axis=1)

        # Save to Excel
        df.to_excel(xlsx_file, index=False)
        print(f"Saved processed file: {new_file_name}")

    except Exception as e:
        print(f"Error processing {xlsx_file}: {e}")

print("Processing complete!")


Processing Excel Files:  33%|███▎      | 2/6 [00:00<00:00,  5.87it/s]

Saved processed file: C:\Users\Nasip-PC\Desktop\Dosyalar\Kodlar\NLP\Data\Raw\SoftwreChangeControl_processed.xlsx
Saved processed file: C:\Users\Nasip-PC\Desktop\Dosyalar\Kodlar\NLP\Data\Raw\SoftwreChangeControl_processed.xlsx


Processing Excel Files:  67%|██████▋   | 4/6 [00:01<00:00,  3.85it/s]

Saved processed file: C:\Users\Nasip-PC\Desktop\Dosyalar\Kodlar\NLP\Data\Raw\SoftwreChangeControl_processed.xlsx
Saved processed file: C:\Users\Nasip-PC\Desktop\Dosyalar\Kodlar\NLP\Data\Raw\SoftwreChangeControl_processed.xlsx


Processing Excel Files: 100%|██████████| 6/6 [00:01<00:00,  4.91it/s]

Saved processed file: C:\Users\Nasip-PC\Desktop\Dosyalar\Kodlar\NLP\Data\Raw\SoftwreChangeControl_processed.xlsx
Saved processed file: C:\Users\Nasip-PC\Desktop\Dosyalar\Kodlar\NLP\Data\Raw\SoftwreChangeControl_processed.xlsx


Processing Excel Files: 100%|██████████| 6/6 [00:01<00:00,  4.49it/s]

Processing complete!





In [58]:
df = pd.read_excel(r"C:\Users\Nasip-PC\Desktop\Dosyalar\Kodlar\NLP\Data\Processed\SoftwareDesign_processed.xlsx")

In [59]:
df

Unnamed: 0,WEB_ADDRESS,RECALL_NUMBER,PRODUCT_DESCRIPTION,TRADE_NAME,RECALL_CLASS,CENTER,CENTER_CLASSIFICATION_DT,POSTED_INTERNET_DT,TERMINATION_DT,FIRM_NAME,...,Product,Code Information,Recalling Firm/Manufacturer,Manufacturer Reason for Recall,FDA Determined Cause,Action,Quantity in Commerce,Distribution,Total Product Life Cycle,Page Last Updated
0,http://www.accessdata.fda.gov/scripts/cdrh/cfd...,Z-1421-2025,"Baxter TruSystem 7500 Hybrid MR IMRIS, Product...",Baxter,2,CDRH,2025-03-21,2025-03-21 21:06:30,NaT,Baxter Healthcare Corporation,...,"s Vaccines, Blood & Biologics Animal & Veterin...",,,There is a software issue which causes the upp...,2 Software design,Baxter issued an UREGENT MEDICAL DEVICE CORREC...,24 units,US Nationwide distribution.,TPLC Device Report 1 A record in this database...,: 03/21/2025 Note: If you need help accessing ...
1,http://www.accessdata.fda.gov/scripts/cdrh/cfd...,Z-1419-2025,"Baxter TruSystem 7500 Hybrid Plus (FC), Produc...",TS 7500,2,CDRH,2025-03-21,2025-03-21 21:06:30,NaT,Baxter Healthcare Corporation,...,"s Vaccines, Blood & Biologics Animal & Veterin...",,,There is a software issue which causes the upp...,2 Software design,Baxter issued an UREGENT MEDICAL DEVICE CORREC...,93 units,US Nationwide distribution.,TPLC Device Report 1 A record in this database...,: 03/21/2025 Note: If you need help accessing ...
2,http://www.accessdata.fda.gov/scripts/cdrh/cfd...,Z-1411-2025,"Baxter Floor mounting column TruSystem 7500, P...",TS 7500,2,CDRH,2025-03-21,2025-03-21 21:06:30,NaT,Baxter Healthcare Corporation,...,"s Vaccines, Blood & Biologics Animal & Veterin...",,,There is a software issue which causes the upp...,2 Software design,Baxter issued an UREGENT MEDICAL DEVICE CORREC...,112 units,US Nationwide distribution.,TPLC Device Report 1 A record in this database...,: 03/21/2025 Note: If you need help accessing ...
3,http://www.accessdata.fda.gov/scripts/cdrh/cfd...,Z-1414-2025,"Baxter Stationary column TruSystem 7500 U, Pro...",TS 7500,2,CDRH,2025-03-21,2025-03-21 21:06:30,NaT,Baxter Healthcare Corporation,...,"s Vaccines, Blood & Biologics Animal & Veterin...",,,There is a software issue which causes the upp...,2 Software design,Baxter issued an UREGENT MEDICAL DEVICE CORREC...,2 units,US Nationwide distribution.,TPLC Device Report 1 A record in this database...,: 03/21/2025 Note: If you need help accessing ...
4,http://www.accessdata.fda.gov/scripts/cdrh/cfd...,Z-1415-2025,"Baxter Floor mounting column TS 7500 U, Produc...",TS 7500,2,CDRH,2025-03-21,2025-03-21 21:06:30,NaT,Baxter Healthcare Corporation,...,"s Vaccines, Blood & Biologics Animal & Veterin...",,,There is a software issue which causes the upp...,2 Software design,Baxter issued an UREGENT MEDICAL DEVICE CORREC...,33 units,US Nationwide distribution.,TPLC Device Report 1 A record in this database...,: 03/21/2025 Note: If you need help accessing ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,http://www.accessdata.fda.gov/scripts/cdrh/cfd...,Z-1685-2021,Multitom RAX with software version VF10,Multitom RAX,2,CDRH,2021-06-02,NaT,2023-11-08 10:03:16,"Siemens Medical Solutions USA, Inc",...,,,,,,,,,,
496,http://www.accessdata.fda.gov/scripts/cdrh/cfd...,Z-1772-2021,MiniMed 670G Insulin Pump:\nModels:\nPUMP MMT-...,MiniMed 670G Insulin Pump,2,CDRH,2021-06-01,NaT,NaT,Medtronic Minimed,...,,,,,,,,,,
497,http://www.accessdata.fda.gov/scripts/cdrh/cfd...,Z-1771-2021,MiniMed 780G Insulin Pump:\nModels:\nPUMP MMT-...,"MiniMed"" 780G Insulin Pump",2,CDRH,2021-06-01,NaT,NaT,Medtronic Minimed,...,,,,,,,,,,
498,http://www.accessdata.fda.gov/scripts/cdrh/cfd...,Z-1692-2021,"Critical Care Ventilator, Catalog Number(s): 8...",Babylog VN600 Ventilator,2,CDRH,2021-05-28,NaT,2024-02-06 08:42:57,"Draeger Medical, Inc.",...,,,,,,,,,,


In [53]:
df["WEB_ADDRESS"].value_counts()

WEB_ADDRESS
http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfres/res.cfm?id=212489    1
http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfres/res.cfm?id=193552    1
http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfres/res.cfm?id=192851    1
http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfres/res.cfm?id=192627    1
http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfres/res.cfm?id=192103    1
                                                                            ..
http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfres/res.cfm?id=207143    1
http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfres/res.cfm?id=207133    1
http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfres/res.cfm?id=207144    1
http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfres/res.cfm?id=207141    1
http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfres/res.cfm?id=187116    1
Name: count, Length: 500, dtype: int64