# In [None]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
import os
import re
from collections import Counter

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

# Read the list of company websites to classify.
df = pd.read_csv('Dataset_Company_Links.csv')

# Extract the 'Website' column as a list of clean strings.
# Empty cells are read by pandas as NaN (a float), which would crash the
# string handling below, so they are dropped; surrounding whitespace is
# stripped and entries that end up empty are skipped.
websites = [
    site.strip()
    for site in df['Website'].dropna().astype(str)
    if site.strip()
]

# Prefix "https://" only when the entry has no scheme yet. The full
# "http://" / "https://" prefix is checked case-insensitively, because a bare
# startswith("http") also matches host names such as "httpbin.org" and misses
# upper-case schemes such as "HTTP://".
formatted_websites = [
    site if site.lower().startswith(("http://", "https://")) else f"https://{site}"
    for site in websites
]

# Keyword lists, one per result category. classify_website lower-cases every
# entry before matching, so mixed-case entries such as "PCOS" or "IBS" are
# matched case-insensitively. One entry per line keeps future edits to a
# single-line diff.

# Counted toward the "F and B" column.
keywords_f_and_b = [
    "food",
    "beverage",
    "catering",
    "culinary",
    "restaurant",
    "cereal",
    "milk",
    "dairy",
    "snacks",
    "meal",
    "menu",
    "nutrition",
    "organic food",
    "fast food",
    "packaged food",
    "soft drink",
    "dessert",
    "gourmet",
    "hospitality",
    "cuisine",
]

# Counted toward the "Brand" column.
keywords_brands = [
    "brand",
    "logo",
    "trademark",
    "branding",
    "name recognition",
    "marketing",
    "product name",
    "advertising",
    "brand value",
    "brand identity",
    "brand equity",
    "endorsement",
    "slogan",
    "brand reputation",
    "campaign",
    "label",
    "icon",
]

# Counted toward the "Manufacturer" column.
keywords_manufacturers = [
    "manufacture",
    "production",
    "factory",
    "assembly",
    "plant",
    "industrial",
    "fabrication",
    "processing",
    "machinery",
    "equipment",
    "assembly line",
    "mass production",
    "workshop",
    "automated systems",
    "engineering",
    "raw materials",
    "production line",
    "supplier network",
    "batch production",
]

# Counted toward the "Distributor" column.
keywords_distributors = [
    "distributor",
    "distribution",
    "wholesale",
    "supplier",
    "logistics",
    "shipment",
    "supply chain",
    "delivery",
    "warehouse",
    "inventory",
    "transportation",
    "freight",
    "retailer",
    "stockist",
    "merchandise",
    "cargo",
    "logistics network",
    "fulfillment",
    "packing",
    "courier",
]

# Counted toward the "Women's Health" column.
keywords_womens_health = [
    "women's health",
    "female health",
    "gynecology",
    "reproductive health",
    "maternal health",
    "hormonal health",
    "menstrual cycle",
    "menopause",
    "pregnancy",
    "fertility",
    "breast health",
    "ovarian health",
    "uterine health",
    "childbirth",
    "prenatal care",
    "postpartum",
    "sexual health",
    "contraception",
    "PCOS",
    "endometriosis",
]

# Counted toward the "Gut Health" column.
keywords_gut_health = [
    "gut health",
    "digestive health",
    "probiotics",
    "prebiotics",
    "microbiome",
    "intestinal health",
    "digestive system",
    "gut flora",
    "gut bacteria",
    "colon health",
    "stomach health",
    "IBS",
    "digestive enzymes",
    "fiber",
    "bowel movement",
    "gastric health",
    "digestive balance",
    "intestinal lining",
    "acid reflux",
    "dietary fiber",
]

# Counted toward the "Cognitive Health" column.
keywords_cognitive_health = [
    "cognitive health",
    "brain health",
    "mental health",
    "memory",
    "cognition",
    "neurological health",
    "brain function",
    "learning",
    "focus",
    "attention",
    "neuroplasticity",
    "Alzheimer's",
    "dementia",
    "mental clarity",
    "IQ",
    "thinking",
    "problem-solving",
    "decision making",
    "concentration",
    "psychological health",
]

# Counted toward the "Probiotics" column.
keywords_probiotics = [
    "probiotics",
    "strains of bacteria",
    "lactobacillus",
    "bifidobacterium",
    "fermentation",
    "gut-friendly bacteria",
    "good bacteria",
    "yogurt cultures",
    "microbial strains",
    "live bacteria",
    "fermented food",
    "digestive aid",
    "beneficial microorganisms",
    "prebiotic synergy",
    "gut microbiota",
    "gut-friendly strains",
]

# Counted toward the "Fortification" column.
keywords_fortification = [
    "fortification",
    "nutrient enrichment",
    "vitamin fortification",
    "mineral fortification",
    "fortified food",
    "iron fortification",
    "calcium fortification",
    "DHA fortification",
    "nutritional additives",
    "supplemented food",
    "enhanced food",
    "food enrichment",
    "fortified milk",
    "iodized salt",
    "fortified flour",
    "omega-3",
    "nutritional benefits",
    "functional food",
    "healthier food",
    "food innovation",
]

# Start the single Chrome session that every page load below reuses.
driver = webdriver.Chrome()

# Directory that receives one saved HTML file per visited page; created on
# first run and reused afterwards.
output_folder = "webpage_htmls"
os.makedirs(output_folder, exist_ok=True)

# Empty results table: the site URL, one column per keyword category, and a
# final "Relevant" column. Rows are appended by classify_website.
results_df = pd.DataFrame(
    columns=[
        "Website",
        "F and B",
        "Brand",
        "Manufacturer",
        "Distributor",
        "Women's Health",
        "Gut Health",
        "Cognitive Health",
        "Probiotics",
        "Fortification",
        "Relevant",
    ]
)

def _snapshot_path(url):
    """Return the path under ``output_folder`` where the HTML of *url* is saved.

    ``https://`` is removed as before. Every remaining character that is not
    a word character, dot or hyphen becomes ``_``, so ``http://`` URLs, ports
    and query strings no longer yield names containing ``:``, ``?`` or ``*``,
    which some filesystems (e.g. on Windows) reject.
    """
    stem = re.sub(r"[^\w.-]", "_", url.replace("https://", ""))
    # Cap the length so typical names stay under the common 255-byte limit
    # for a single file name, leaving room for the ".html" suffix.
    return os.path.join(output_folder, f"{stem[:200]}.html")


def _searchable_text(page_source):
    """Return the lower-cased text of the loaded page used for keyword matching.

    Matching against raw markup makes terms such as "label", "icon" or "menu"
    hit tag and attribute names on almost every page, so the page title and
    the rendered text of ``<body>`` are used instead. If the document has no
    ``<body>`` element the raw *page_source* is used as a fallback.
    """
    body_elements = driver.find_elements(By.TAG_NAME, "body")
    visible_text = body_elements[0].text if body_elements else page_source
    return f"{driver.title}\n{visible_text}".lower()


def classify_website(url):
    """Load *url*, save its HTML and append its classification to ``results_df``.

    The page is opened with the shared Selenium ``driver`` and its raw HTML
    is written to ``output_folder``. A category is flagged ``True`` when at
    least one of its keywords occurs, as a case-insensitive substring, in the
    page title or visible body text. ``Relevant`` is ``True`` when any
    category matched.

    Args:
        url: Absolute URL of the page to classify.

    Returns:
        None. On success one row is appended to ``results_df``. If any
        ``Exception`` is raised while loading, saving or classifying, the
        error is printed and no row is added for this URL.
    """
    # Insertion order matches the column order of results_df (after
    # "Website", before "Relevant"); the row below relies on it.
    keywords_by_category = {
        "F and B": keywords_f_and_b,
        "Brand": keywords_brands,
        "Manufacturer": keywords_manufacturers,
        "Distributor": keywords_distributors,
        "Women's Health": keywords_womens_health,
        "Gut Health": keywords_gut_health,
        "Cognitive Health": keywords_cognitive_health,
        "Probiotics": keywords_probiotics,
        "Fortification": keywords_fortification,
    }

    try:
        driver.get(url)

        # Save the webpage HTML exactly as the browser reports it.
        page_source = driver.page_source
        with open(_snapshot_path(url), "w", encoding="utf-8") as file:
            file.write(page_source)

        # Flag each category whose keywords appear in the page text.
        text = _searchable_text(page_source)
        flags = {
            category: any(keyword.lower() in text for keyword in keywords)
            for category, keywords in keywords_by_category.items()
        }

        results_df.loc[len(results_df)] = [
            url,
            *flags.values(),
            any(flags.values()),
        ]
    except Exception as e:
        # Best-effort per site: report the failure and move on to the next URL.
        print(f"Error processing {url}: {e}")

# Classify every site. classify_website handles ordinary per-site errors
# itself, but an interruption (e.g. Ctrl-C) still propagates out of the loop,
# so cleanup and saving are done in ``finally`` blocks: the browser is always
# closed and the rows collected so far are always written.
try:
    for website in formatted_websites:
        classify_website(website)
finally:
    try:
        # Close the WebDriver
        driver.quit()
    finally:
        # Save the results DataFrame to a CSV file, even if quitting the
        # browser failed.
        results_df.to_csv('Data_companies.csv', index=False)