# Probiotic Data Miner: Insights into F&B and Healthcare Giants - 

# [ Company_DeepThought ]

- This project demonstrates web scraping and analysis of company websites to extract valuable insights into the probiotics market. 

- The goal is to identify trends, collaborations and innovations across F&B, pharmaceutical and healthcare companies.


In [None]:
#  **Project Overview**

#  **Objective** :
    
# To analyze data from company websites to uncover:

- Product offerings related to probiotics.
- Company profiles, partnerships, and collaborations.
- Research innovations and marketing efforts in the health segments.

In [35]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [37]:
# Function to scrape website data

In [75]:
def scrape_website(url, company_name):
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract general information
        title = soup.title.string if soup.title else "No Title"
        content = soup.get_text().lower()

        # Keywords for analysis
        product_keywords = ['product', 'ingredients', 'supplement', 'probiotic', 'fortification']
        profile_keywords = ['about us', 'vision', 'mission', 'history']
        partnership_keywords = ['partner', 'collaboration', 'supplier', 'distributor']
        research_keywords = ['research', 'study', 'innovation', 'patent', 'scientific']
        marketing_keywords = ['campaign', 'testimonial', 'customer', 'social media']
        
        # Information extraction
        product_info = any(keyword in content for keyword in product_keywords)
        company_profile = any(keyword in content for keyword in profile_keywords)
        partnerships = any(keyword in content for keyword in partnership_keywords)
        research_innovations = any(keyword in content for keyword in research_keywords)
        marketing_promotions = any(keyword in content for keyword in marketing_keywords)

        # Categorization
        category = 'Relevant' if product_info or company_profile else 'Not Relevant'
        
        return {
            'Company': company_name,
            'Website': url,
            'Title': title,
            'Category': category,
            'Product Info': "Yes" if product_info else "No",
            'Company Profile': "Yes" if company_profile else "No",
            'Partnerships': "Yes" if partnerships else "No",
            'Research & Innovations': "Yes" if research_innovations else "No",
            'Marketing & Promotions': "Yes" if marketing_promotions else "No"
        }
    except Exception as e:
        return {'Company': company_name, 'Website': url, 'Error': str(e)}

# List of companies and their websites

In [77]:
companies = [
    {"name": "Nestle", "website": "https://www.nestle.com"},
    {"name": "Dr. Reddy's Laboratories", "website": "https://www.drreddys.com"},
    {"name": "Coca-Cola", "website": "https://www.coca-colacompany.com"},
    {"name": "Pfizer", "website": "https://www.pfizer.com"},
    {"name": "PepsiCo", "website": "https://www.pepsico.com"},
    {"name": "Johnson & Johnson", "website": "https://www.jnj.com"},
    {"name": "Danone", "website": "https://www.danone.com"},
    {"name": "Bayer", "website": "https://www.bayer.com"},
    {"name": "General Mills", "website": "https://www.generalmills.com"},
    {"name": "GlaxoSmithKline (GSK)", "website": "https://www.gsk.com"},
    {"name": "Kellogg’s", "website": "https://www.kelloggs.com/en_US/home.html"},
    {"name": "Merck & Co.", "website": "https://www.merck.com"},
    {"name": "Unilever", "website": "https://www.unilever.com"},
    {"name": "Roche", "website": "https://www.roche.com"},
    {"name": "Nestle Waters", "website": "https://www.nestlewaters.com"},
    {"name": "Sanofi", "website": "https://www.sanofi.com"},
    {"name": "Mondelez International", "website": "https://www.mondelezinternational.com"},
    {"name": "Novartis", "website": "https://www.novartis.com"},
    {"name": "Kraft Heinz", "website": "https://www.kraftheinzcompany.com"},
    {"name": "Eli Lilly and Company", "website": "https://www.lilly.com"},
    {"name": "Tyson Foods", "website": "https://www.tysonfoods.com"},
    {"name": "Teva Pharmaceuticals", "website": "https://www.tevapharm.com"},
    {"name": "Mars, Incorporated", "website": "https://www.mars.com"},
    {"name": "AbbVie", "website": "https://www.abbvie.com"},
    {"name": "Campbell Soup Company", "website": "https://www.campbellsoupcompany.com"},
    {"name": "Amgen", "website": "https://www.amgen.com"},
    {"name": "Conagra Brands", "website": "https://www.conagrabrands.com"},
    {"name": "AstraZeneca", "website": "https://www.astrazeneca.com"},
    {"name": "Molson Coors", "website": "https://www.molsoncoors.com"},
    {"name": "Boehringer Ingelheim", "website": "https://www.boehringer-ingelheim.com"},
    {"name": "AB InBev", "website": "https://www.abinbev.com"},
    {"name": "BASF", "website": "https://www.basf.com"},
    {"name": "Diageo", "website": "https://www.diageo.com"},
    {"name": "Procter & Gamble (P&G)", "website": "https://www.pg.com"},
    {"name": "Heineken", "website": "https://www.theheinekencompany.com"},
    {"name": "Medtronic", "website": "https://www.medtronic.com"},
    {"name": "McKesson",  "website": "https://www.mckesson.com"},
    {"name":"AmerisourceBergen","website": "https://www.amerisourcebergen.com"}, 
    {"name":"Cardinal Health","website": "https://www.cardinalhealth.com/en.html"}, 
    {"name":"Medline Industries","website": "https://www.medline.com"} 
]

# Scrape data for each company

In [79]:
scraped_data = [scrape_website(company['website'], company['name']) for company in companies]

In [82]:
pd.DataFrame(scraped_data)

Unnamed: 0,Company,Website,Title,Category,Product Info,Company Profile,Partnerships,Research & Innovations,Marketing & Promotions,Error
0,Nestle,https://www.nestle.com,Just a moment...,Not Relevant,No,No,No,No,No,
1,Dr. Reddy's Laboratories,https://www.drreddys.com,Dr. Reddy’s Laboratories – Good Health Can’t Wait,Not Relevant,No,No,No,No,No,
2,Coca-Cola,https://www.coca-colacompany.com,\n The Coca-Cola Company: Refresh the W...,Relevant,Yes,Yes,Yes,Yes,Yes,
3,Pfizer,https://www.pfizer.com,Pfizer: One of the world's premier biopharmace...,Relevant,Yes,Yes,Yes,Yes,No,
4,PepsiCo,https://www.pepsico.com,No Title,Not Relevant,No,No,No,No,No,
5,Johnson & Johnson,https://www.jnj.com,Johnson & Johnson: Changing health for humanity,Relevant,Yes,Yes,Yes,Yes,Yes,
6,Danone,https://www.danone.com,World food company - Danone,Relevant,Yes,Yes,Yes,Yes,Yes,
7,Bayer,https://www.bayer.com,Site Maintenance,Not Relevant,No,No,No,No,No,
8,General Mills,https://www.generalmills.com,General Mills: A U.S. based food company. - Ge...,Relevant,Yes,Yes,Yes,No,No,
9,GlaxoSmithKline (GSK),https://www.gsk.com,Home | GSK,Relevant,Yes,Yes,Yes,Yes,Yes,


# Saving results to a DataFrame (Df)

In [85]:
df = pd.DataFrame(scraped_data)

In [87]:
df

Unnamed: 0,Company,Website,Title,Category,Product Info,Company Profile,Partnerships,Research & Innovations,Marketing & Promotions,Error
0,Nestle,https://www.nestle.com,Just a moment...,Not Relevant,No,No,No,No,No,
1,Dr. Reddy's Laboratories,https://www.drreddys.com,Dr. Reddy’s Laboratories – Good Health Can’t Wait,Not Relevant,No,No,No,No,No,
2,Coca-Cola,https://www.coca-colacompany.com,\n The Coca-Cola Company: Refresh the W...,Relevant,Yes,Yes,Yes,Yes,Yes,
3,Pfizer,https://www.pfizer.com,Pfizer: One of the world's premier biopharmace...,Relevant,Yes,Yes,Yes,Yes,No,
4,PepsiCo,https://www.pepsico.com,No Title,Not Relevant,No,No,No,No,No,
5,Johnson & Johnson,https://www.jnj.com,Johnson & Johnson: Changing health for humanity,Relevant,Yes,Yes,Yes,Yes,Yes,
6,Danone,https://www.danone.com,World food company - Danone,Relevant,Yes,Yes,Yes,Yes,Yes,
7,Bayer,https://www.bayer.com,Site Maintenance,Not Relevant,No,No,No,No,No,
8,General Mills,https://www.generalmills.com,General Mills: A U.S. based food company. - Ge...,Relevant,Yes,Yes,Yes,No,No,
9,GlaxoSmithKline (GSK),https://www.gsk.com,Home | GSK,Relevant,Yes,Yes,Yes,Yes,Yes,


In [89]:
df.head()

Unnamed: 0,Company,Website,Title,Category,Product Info,Company Profile,Partnerships,Research & Innovations,Marketing & Promotions,Error
0,Nestle,https://www.nestle.com,Just a moment...,Not Relevant,No,No,No,No,No,
1,Dr. Reddy's Laboratories,https://www.drreddys.com,Dr. Reddy’s Laboratories – Good Health Can’t Wait,Not Relevant,No,No,No,No,No,
2,Coca-Cola,https://www.coca-colacompany.com,\n The Coca-Cola Company: Refresh the W...,Relevant,Yes,Yes,Yes,Yes,Yes,
3,Pfizer,https://www.pfizer.com,Pfizer: One of the world's premier biopharmace...,Relevant,Yes,Yes,Yes,Yes,No,
4,PepsiCo,https://www.pepsico.com,No Title,Not Relevant,No,No,No,No,No,


In [91]:
df.tail()

Unnamed: 0,Company,Website,Title,Category,Product Info,Company Profile,Partnerships,Research & Innovations,Marketing & Promotions,Error
35,Medtronic,https://www.medtronic.com,Home | Medtronic,Relevant,Yes,Yes,Yes,Yes,Yes,
36,McKesson,https://www.mckesson.com,"McKesson | Medical Supplies, Pharmaceuticals H...",Relevant,Yes,Yes,Yes,Yes,Yes,
37,AmerisourceBergen,https://www.amerisourcebergen.com,AmerisourceBergen | United in our responsibili...,Relevant,Yes,Yes,Yes,Yes,No,
38,Cardinal Health,https://www.cardinalhealth.com/en.html,,,,,,,,"('Connection aborted.', RemoteDisconnected('Re..."
39,Medline Industries,https://www.medline.com,"Medline.com | Medline Industries, Inc.",Relevant,Yes,No,No,No,No,


# Save results to an Excel file

- Specified the full path for the output file below is the path

In [93]:
output_file = r"C:\Users\DELL\Music\Web Scraping F&B\company_analysis_extended.xlsx"

# Saved the DataFrame Final ouctome to the specified location

In [95]:
df.to_excel(output_file, index=False)

print(f"Analysis complete! Results saved to '{output_file}'")

Analysis complete! Results saved to 'C:\Users\DELL\Music\Web Scraping F&B\company_analysis_extended.xlsx'
