In [5]:
import pandas as pd

# Load your final ESG dataset
df = pd.read_csv("final_esg_dataset_labeled.csv")


In [7]:
# Define structured AI Insight generator
def generate_structured_insight(row):
    score = row['Total_ESG_Risk_Score']
    exposure = row['ESG_Risk_Exposure']
    mgmt = row['ESG_Risk_Management']
    controversy = row['Controversy_Level']
    sector = row['Sector']
    industry = row['Industry']
    level = row['ESG_Risk_Label']
    predicted = row['Predicted_ESG_Score']

    trend = "improve" if predicted < score else "decline"

    insight = (
        f"The company operates in the {sector} sector ({industry}). "
        f"It has a **{level} ESG risk level**, with a current risk score of {score}. "
        f"Management is rated as '{mgmt}' with '{exposure}' exposure. "
        f"Controversy level is {controversy}, and ESG performance is expected to {trend}."
    )
    return insight



In [9]:
# Generate insights for entire dataset
df['AI_ESG_Insight'] = df.apply(generate_structured_insight, axis=1)


# AI insight generation with distilgpt2

In [45]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

# Load data
df = pd.read_csv("final_esg_dataset_with_structured_insights.csv")
df["Description"] = df["Description"].fillna("No description available.")

# Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Set padding token for GPT-2
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Insight generation function
def generate_esg_insight(description, risk_level):
    prompt = f"ESG Risk Level: {risk_level}. Description: {description}. Insight:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
    
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=40,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text.split("Insight:")[-1].strip() if "Insight:" in generated_text else generated_text.strip()

# Add ESG_AI_Insight column using progress bar
tqdm.pandas(desc="🔍 Generating ESG AI Insights")
df["ESG_AI_Insight"] = df.progress_apply(lambda row: generate_esg_insight(row["Description"], row["ESG_Risk_Level"]), axis=1)



🔍 Generating ESG AI Insights: 100%|█████████████████████████████████████████████████| 903/903 [29:53<00:00,  1.99s/it]


In [48]:
# Save to CSV
df.to_csv("final_esg_dataset_with_gpt2_insights.csv", index=False)
print("✅ ESG insights generated and saved with progress bar.")

✅ ESG insights generated and saved with progress bar.


# Sentiment Analysis of Description and Controversy 

In [62]:
from textblob import TextBlob

df['Description_Sentiment'] = df['Description'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['Description_Sentiment_Label'] = df['Description_Sentiment'].apply(
    lambda x: 'Positive' if x > 0.1 else ('Negative' if x < -0.1 else 'Neutral')
)


In [63]:
#controversy
# If you’ve already added controversy terms, skip this. Otherwise, ensure it's included.
# Example fallback if not extracted yet:
df['Top_Controversy_Terms'] = df['Description'].apply(lambda x: ["no major controversies found"] if pd.isna(x) else [x])

# --- 🧠 NLP Insight Generator Function (Template-Based) ---
def generate_esg_insight(row):
    risk = row['ESG_Risk_Level']
    terms = row['Top_Controversy_Terms']
    
    # If string, try splitting it into a list
    if isinstance(terms, str):
        terms = [term.strip() for term in terms.split(",")]

    # Build short explanation
    if risk == 'High':
        return f"The company faces high ESG risk due to major concerns like {', '.join(terms[:2])}."
    elif risk == 'Medium':
        return f"The company faces moderate ESG risk driven by issues such as {', '.join(terms[:2])}."
    else:
        return f"The company shows low ESG risk with minor concerns like {', '.join(terms[:2])}."

# --- 🧠 Apply the Insight Generator to the DataFrame ---
df['ESG_AI_Insight'] = df.apply(generate_esg_insight, axis=1)


In [66]:
# Save the new dataset with insights
df.to_csv("final_esg_dataset_with_structured_insights.csv", index=False)
print("✅ Structured AI ESG insights saved successfully!")


✅ Structured AI ESG insights saved successfully!


In [164]:
pd.read_csv("final_esg_dataset_with_structured_insights.csv").head(2)

Unnamed: 0,Symbol,Company Name,Sector,Industry,Description,Total_ESG_Risk_Score,Predicted_ESG_Score,ESG_Risk_Exposure,ESG_Risk_Management,ESG_Risk_Level,...,Controversy_Score,ESG_Risk_Label,Sector_encoded,Industry_encoded,Controversy_Level_encoded,AI_ESG_Insight,Description_Sentiment,Description_Sentiment_Label,Top_Controversy_Terms,ESG_AI_Insight
0,EMN,Eastman Chemical Company,Basic Materials,Specialty Chemicals,Eastman Chemical Company operates as a special...,25.3,15.548391,32.795002,42.033837,Medium,...,2.0,Medium,0,99,5,The company operates in the Basic Materials se...,0.166667,Positive,"[""Eastman Chemical Company operates as a speci...",The company faces moderate ESG risk driven by ...
1,DPZ,Domino's Pizza Inc.,Consumer Cyclical,Restaurants,"Domino's Pizza, Inc., through its subsidiaries...",29.2,23.613286,62.572422,33.762561,Medium,...,2.0,Medium,2,90,5,The company operates in the Consumer Cyclical ...,-0.214286,Negative,"[""Domino's Pizza, Inc., through its subsidiari...",The company faces moderate ESG risk driven by ...


# RAKE

In [166]:
#!pip install rake-nltk

In [120]:
from rake_nltk import Rake
from textblob import TextBlob
import pandas as pd

# Initialize RAKE
r = Rake()

# Clean descriptions
df['Description'] = df['Description'].apply(lambda x: x if isinstance(x, str) and x.strip() != '' else 'No ESG description available')

# Sentiment Analysis
df['Description_Sentiment'] = df['Description'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['Description_Sentiment_Label'] = df['Description_Sentiment'].apply(
    lambda x: 'Positive' if x > 0.1 else ('Negative' if x < -0.1 else 'Neutral')
)

# Keyword Extraction using RAKE
def extract_keywords_rake(text):
    try:
        r.extract_keywords_from_text(text)
        keywords = r.get_ranked_phrases()[:3]
        return keywords if keywords else ['no major controversies found']
    except:
        return ['no major controversies found']

df['Top_Controversy_Terms'] = df['Description'].apply(extract_keywords_rake)

# AI ESG Insight Generation
def generate_esg_insight(row):
    risk = row.get('ESG_Risk_Level', 'Medium')
    terms = row.get('Top_Controversy_Terms', [])
    if not isinstance(terms, list):
        terms = [terms]
    terms = [term for term in terms if isinstance(term, str) and len(term.strip()) > 0]
    top_terms = ', '.join(terms[:2]) if terms else "no specific issues"

    if risk == 'High':
        return f"The company faces high ESG risk due to major concerns like {top_terms}."
    elif risk == 'Medium':
        return f"The company faces moderate ESG risk driven by issues such as {top_terms}."
    else:
        return f"The company shows low ESG risk with minor concerns like {top_terms}."

df['ESG_AI_Insight'] = df.apply(generate_esg_insight, axis=1)


In [122]:
df.head()  # Shows the top rows with all new columns

# OR preview only the most relevant columns
df[['Company Name', 'Description_Sentiment_Label', 'Top_Controversy_Terms', 'ESG_AI_Insight']].head(10)


Unnamed: 0,Company Name,Description_Sentiment_Label,Top_Controversy_Terms,ESG_AI_Insight
0,Eastman Chemical Company,Positive,[functional products segment offers amine deri...,The company faces moderate ESG risk driven by ...
1,Domino's Pizza Inc.,Negative,"[pepperoni stuffed cheesy breads, soft drink p...",The company faces moderate ESG risk driven by ...
2,Davita Inc.,Neutral,"[company operates kidney dialysis centers, com...",The company faces moderate ESG risk driven by ...
3,"Darden Restaurants, Inc.",Neutral,"[capital burger brand names, inc ., together, ...",The company faces moderate ESG risk driven by ...
4,Zoetis Inc.,Neutral,[company commercializes products primarily acr...,The company shows low ESG risk with minor conc...
5,"Zimmer Biomet Holdings, Inc.",Neutral,"[thoracic products comprising face, medical te...",The company faces moderate ESG risk driven by ...
6,"Yum! Brands, Inc.",Neutral,[franchises quick service restaurants worldwid...,The company faces moderate ESG risk driven by ...
7,Xylem Inc,Neutral,[control solutions segment offers smart meters...,The company shows low ESG risk with minor conc...
8,"Xcel Energy, Inc.",Neutral,"[xcel energy inc .,, leases natural gas pipeli...",The company faces moderate ESG risk driven by ...
9,Wynn Resorts Ltd,Neutral,[encore boston harbor segment operates casino ...,The company faces moderate ESG risk driven by ...


In [124]:
# Save the new dataset with insights
df.to_csv("final_esg_dataset_with_structured_insights_withsentiments.csv", index=False)
print("✅ saved successfully!")

✅ saved successfully!


# USING SPACY

In [150]:
#pip install spacy
#python -m spacy download en_core_web_sm


In [152]:
import spacy
from tqdm import tqdm

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Keyword extractor using noun chunks (phrases)
def extract_keywords_spacy(text):
    try:
        text = str(text).strip()
        if not text or text.lower() in ['nan', 'none', 'null']:
            return ['no major controversies found']

        doc = nlp(text)
        # Extract noun phrases as keywords
        keywords = list(set(chunk.text.strip() for chunk in doc.noun_chunks if len(chunk.text.strip()) > 2))
        return keywords[:3] if keywords else ['no major controversies found']

    except Exception as e:
        return [f'error: {str(e)}']


In [154]:
tqdm.pandas(desc="🧠 Extracting with spaCy")
df['Top_Controversy_Terms'] = df['Description'].progress_apply(extract_keywords_spacy)


🧠 Extracting with spaCy: 100%|██████████████████████████████████████████████████████| 903/903 [00:26<00:00, 34.13it/s]


In [172]:
# --- 🧠 ESG Insight Generator (Based on spaCy Keywords) ---
def generate_esg_insight_spacy(row):
    risk = row['ESG_Risk_Level']
    terms = row['Top_Controversy_Terms']

    # Ensure terms is a list, if it's a string convert it
    if isinstance(terms, str):
        try:
            terms = eval(terms)
        except:
            terms = [terms]

    # Fallback if empty
    if not terms or not isinstance(terms, list):
        terms = ["general ESG issues"]

    # Format controversy terms
    key_terms = ', '.join(terms[:2])  # take top 2 terms

    if risk == 'High':
        return f"The company faces high ESG risk due to major concerns like {key_terms}."
    elif risk == 'Medium':
        return f"The company faces moderate ESG risk driven by issues such as {key_terms}."
    else:
        return f"The company shows low ESG risk with minor concerns like {key_terms}."


In [174]:
df['ESG_AI_Insight_spacy'] = df.apply(generate_esg_insight_spacy, axis=1)

In [176]:
# Save the new dataset with insights
df.to_csv("final_esg_dataset_with_spacy_insights.csv", index=False)
print("✅ saved successfully!")

✅ saved successfully!


In [178]:
df.head()

Unnamed: 0,Symbol,Company Name,Sector,Industry,Description,Total_ESG_Risk_Score,Predicted_ESG_Score,ESG_Risk_Exposure,ESG_Risk_Management,ESG_Risk_Level,...,ESG_Risk_Label,Sector_encoded,Industry_encoded,Controversy_Level_encoded,AI_ESG_Insight,Description_Sentiment,Description_Sentiment_Label,Top_Controversy_Terms,ESG_AI_Insight,ESG_AI_Insight_spacy
0,EMN,Eastman Chemical Company,Basic Materials,Specialty Chemicals,Eastman Chemical Company operates as a special...,25.3,15.548391,32.795002,42.033837,Medium,...,Medium,0,99,5,The company operates in the Basic Materials se...,0.166667,Positive,"[organic acid-based solutions, polyvinyl butyr...",The company faces moderate ESG risk driven by ...,The company faces moderate ESG risk driven by ...
1,DPZ,Domino's Pizza Inc.,Consumer Cyclical,Restaurants,"Domino's Pizza, Inc., through its subsidiaries...",29.2,23.613286,62.572422,33.762561,Medium,...,Medium,2,90,5,The company operates in the Consumer Cyclical ...,-0.214286,Negative,"[three segments, Ann Arbor, Michigan]",The company faces moderate ESG risk driven by ...,The company faces moderate ESG risk driven by ...
2,DVA,Davita Inc.,Healthcare,Medical Care Facilities,DaVita Inc. provides kidney dialysis services ...,22.6,23.139539,46.293286,39.349583,Medium,...,Medium,8,63,5,The company operates in the Healthcare sector ...,0.023333,Neutral,"[acute inpatient dialysis services, kidney dia...",The company faces moderate ESG risk driven by ...,The company faces moderate ESG risk driven by ...
3,DRI,"Darden Restaurants, Inc.",Consumer Cyclical,Restaurants,"Darden Restaurants, Inc., together with its su...",27.5,18.805051,62.781265,47.253221,Medium,...,Medium,2,90,5,The company operates in the Consumer Cyclical ...,0.0,Neutral,"[Darden Restaurants, Eddie V's Prime Seafood, ...",The company faces moderate ESG risk driven by ...,The company faces moderate ESG risk driven by ...
4,ZTS,Zoetis Inc.,Healthcare,Drug Manufacturers - Specialty & Generic,"Zoetis Inc. engages in the discovery, developm...",18.8,8.03524,66.772389,35.572647,Low,...,Low,8,32,5,The company operates in the Healthcare sector ...,0.055195,Neutral,"[parasiticides, dermatology, New Jersey]",The company shows low ESG risk with minor conc...,The company shows low ESG risk with minor conc...


In [170]:
#!pip install fpdf
