<a href="https://colab.research.google.com/github/ShlokArora2709/ESG-Project/blob/main/ArticleScrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import pandas as pd
import string
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import os
from googlesearch import search
from bs4 import BeautifulSoup
import aiohttp
import asyncio
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Load the table of company names and ESG ratings from the runtime's
# /content directory, then preview the first rows.
df = pd.read_csv('/content/esg_data.csv')
df.head()

Unnamed: 0,Company Name,ESG Rating
0,AAC TECHNOLOGIES HOLDINGS INC.,BBB
1,APPLE INC.,BBB
2,EQT AB,AA
3,Addtech AB,AA
4,Skanska AB,A


In [None]:
# Search topics grouped by ESG pillar: "E" (environmental), "S" (social) and
# "G" (governance). Each topic is combined with a company name to build one
# web-search query.
esg_factors = dict(
    E=["Carbon footprint", "Climate policies", "Waste byproducts"],
    S=["Social vulnerability", "Consumer protections", "Health and demographic risk"],
    G=["Business ethics", "Executive compensation", "Pay ratios"],
)

In [None]:
async def fetch(session, url):
    """Download ``url`` through ``session`` and return the body as text.

    This is a best-effort helper: every failure (HTTP error status, timeout,
    connection problem, ...) is reported with ``print`` and converted into a
    ``None`` return so the caller can simply move on to the next URL.

    Args:
        session: An ``aiohttp.ClientSession`` (or any object exposing the same
            ``get`` interface).
        url: Address of the page to download.

    Returns:
        The decoded response body, or ``None`` when the request failed or the
        response is declared as something other than text/HTML.
    """
    try:
        # The whole request (connect + read) is capped at 15 seconds.
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as response:
            response.raise_for_status()  # Raise an error for bad responses

            # Responses that declare a non-text payload (PDF, office files,
            # images, ...) cannot yield article text, so do not decode them.
            content_type = response.headers.get("Content-Type", "").lower()
            is_textual = content_type.startswith("text/") or "html" in content_type
            if content_type and not is_textual:
                print(f"Error fetching {url}: unsupported content type {content_type!r}")
                return None

            # errors="replace" keeps a page with a few undecodable bytes
            # usable instead of discarding it with a UnicodeDecodeError.
            return await response.text(errors="replace")
    except Exception as e:
        # Some exceptions (e.g. timeouts) have an empty message, so the class
        # name is included to keep the log line informative.
        print(f"Error fetching {url}: {type(e).__name__}: {e}")
        return None

def _is_pdf_url(url):
    """Return True when the URL path ends in ``.pdf``, ignoring case, query and fragment."""
    path = url.partition("#")[0].partition("?")[0]
    return path.lower().endswith(".pdf")


def _visible_text(html):
    """Return the text nodes of ``html`` joined by single spaces, skipping non-content tags."""
    soup = BeautifulSoup(html, 'html.parser')
    skipped_parents = ("style", "script", "head", "meta", "[document]")
    pieces = (
        node.strip()
        for node in soup.find_all(string=True)
        if node.parent.name not in skipped_parents
    )
    return ' '.join(piece for piece in pieces if piece)


async def get_top_articles(company_name, factor, max_results=2):
    """Collect the text of the first few web pages found for a company/factor pair.

    The search query is ``"<company_name> + <factor>"``. Result URLs are tried
    in the order the search yields them; PDF links and pages that cannot be
    downloaded are skipped and do not count towards ``max_results``.

    Args:
        company_name: Company to search for.
        factor: ESG topic appended to the query.
        max_results: Number of successfully downloaded pages to collect.

    Returns:
        A list with at most ``max_results`` strings. Each entry is the page
        text followed by ``"..."``, or ``"No content available"`` when the
        page contained no text.
    """
    search_query = f"{company_name} + {factor}"
    text_data = []

    async with aiohttp.ClientSession() as session:
        for url in search(search_query, lang="en"):
            if len(text_data) >= max_results:
                break

            # Skip PDF links, including ".PDF" and ".pdf?download=1" forms
            # that a plain case-sensitive suffix test would let through.
            if _is_pdf_url(url):
                continue

            html = await fetch(session, url)
            if not html:
                # Failed or empty download: try the next search hit instead.
                continue

            filtered_text = _visible_text(html)
            text_data.append(filtered_text + "..." if filtered_text else "No content available")

    return text_data

In [None]:
# Create the three article columns, initially empty (None), in E, S, G order.
# They are filled per company with lists of scraped article texts.
df["E"] = df["S"] = df["G"] = None

In [None]:
async def fetch_all_articles_for_company(company_name, esg_factors):
    """Gather article texts for one company, grouped by ESG category.

    Every factor of every category in ``esg_factors`` is searched one after
    another with ``get_top_articles``. A progress line is printed after each
    factor. No HTTP session is opened here because ``get_top_articles``
    manages its own.

    Args:
        company_name: Company to search for.
        esg_factors: Mapping of category key to a list of factor strings.
            Results are kept only for the keys ``"E"``, ``"S"`` and ``"G"``;
            factors under any other key are still searched but discarded.

    Returns:
        A 3-tuple ``(environmental, social, governance)`` of lists of article
        texts.
    """
    articles_by_category = {"E": [], "S": [], "G": []}

    for category, factors in esg_factors.items():
        bucket = articles_by_category.get(category)
        for factor in factors:
            articles = await get_top_articles(company_name, factor)
            if bucket is not None:
                bucket.extend(articles)

            print(f"Processed {company_name} for category {category}")

    return (
        articles_by_category["E"],
        articles_by_category["S"],
        articles_by_category["G"],
    )

In [None]:
def update_articles_in_df(df, start_idx, end_idx, esg_factors):
    """Fill the "E", "S" and "G" columns of ``df`` for a positional range of rows.

    For each row in ``df.iloc[start_idx:end_idx]`` the company's articles are
    fetched with ``fetch_all_articles_for_company`` and stored as lists in the
    three columns. ``df`` is modified in place, so rows completed before an
    interruption keep their values.

    Args:
        df: DataFrame with a "Company Name" column.
        start_idx: Position of the first row to process.
        end_idx: Position one past the last row to process (exclusive).
        esg_factors: Mapping of ESG category to factor strings, passed through
            to ``fetch_all_articles_for_company``.

    Returns:
        None.
    """
    article_columns = ("E", "S", "G")

    # Create any missing target column so the function does not rely on
    # another notebook cell having been run first. Initialising with None
    # gives an object column, which can hold a list per cell.
    for column in article_columns:
        if column not in df.columns:
            df[column] = None

    company_names = df["Company Name"].iloc[start_idx:end_idx]
    for idx, company_name in company_names.items():
        # Fetch articles asynchronously; one event-loop run per company.
        articles = asyncio.run(fetch_all_articles_for_company(company_name, esg_factors))

        # Update DataFrame: the tuple is ordered (E, S, G).
        for column, column_articles in zip(article_columns, articles):
            df.at[idx, column] = column_articles


In [None]:
# Scrape one batch of companies. The end index is exclusive, so this covers
# positional rows 301-399.
# NOTE(review): confirm that neighbouring batches do not leave a gap at row 300 or 400.
update_articles_in_df(df, start_idx=301, end_idx=400, esg_factors=esg_factors)

Processed GEM Co., Ltd. for category E
Processed GEM Co., Ltd. for category E
Processed GEM Co., Ltd. for category E
Error fetching https://www.globalquakemodel.org/gem-maps/global-earthquake-social-vulnerability-map: 500, message='Internal Server Error', url='https://www.globalquakemodel.org/gem-maps/global-earthquake-social-vulnerability-map'
Error fetching https://www.researchgate.net/publication/265915480_Social_Vulnerability_And_Integrated_Risk_Assessment_Within_The_Global_Earthquake_Model: 403, message='Forbidden', url='https://www.researchgate.net/publication/265915480_Social_Vulnerability_And_Integrated_Risk_Assessment_Within_The_Global_Earthquake_Model'
Processed GEM Co., Ltd. for category S
Processed GEM Co., Ltd. for category S
Processed GEM Co., Ltd. for category S
Processed GEM Co., Ltd. for category G
Error fetching https://www.comparably.com/companies/gem-com/executive-salaries: 403, message='Forbidden', url='https://www.comparably.com/companies/gem-com/executive-salarie



Processed IDEX CORPORATION for category E




Processed IDEX CORPORATION for category E




Processed IDEX CORPORATION for category E
Processed IDEX CORPORATION for category S
Processed IDEX CORPORATION for category S




Processed IDEX CORPORATION for category S
Processed IDEX CORPORATION for category G
Error fetching https://www.comparably.com/companies/idex-corp/executive-salaries: 403, message='Forbidden', url='https://www.comparably.com/companies/idex-corp/executive-salaries'
Error fetching https://investors.idexcorp.com/static-files/a04ff26d-99ba-4f74-a63d-8fa0d96b9cc0: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte
Error fetching https://www.sec.gov/Archives/edgar/data/832101/000083210122000016/iex-20220331xex101.htm: 403, message='Forbidden', url='https://www.sec.gov/Archives/edgar/data/832101/000083210122000016/iex-20220331xex101.htm'
Processed IDEX CORPORATION for category G
Error fetching https://www.indeed.com/cmp/Idex-Corporation/salaries: 403, message='Forbidden', url='https://www.indeed.com/cmp/Idex-Corporation/salaries'
Error fetching https://www.nasdaq.com/market-activity/stocks/iex/price-earnings-peg-ratios: 
Error fetching https://www.glassdoor.com/Salar

In [None]:
# Save the rows at positions 301-399 (end index is exclusive) to a CSV file
# in the current working directory.
df.iloc[301:400].to_csv("ESG_w_cont4.csv")