In [1]:
# 04_AI_powered_insight_generator.ipynb

# Step 1: Install necessary packages if not already
!pip install transformers pandas tqdm --quiet



In [3]:
# Step 2: Import libraries
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tqdm import tqdm

In [5]:
# Step 3: Load dataset
df = pd.read_csv("final_esg_dataset_labeled.csv")  # or your cleaned dataset path

In [7]:
# Step 4: Load GPT2-Medium model and tokenizer
model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()

# Add padding token for safe generation
tokenizer.pad_token = tokenizer.eos_token


In [9]:
# Step 5: Define a better prompt template
def generate_prompt(row):
    return f"""Company: {row.get('Name', row.get('company', 'Unknown'))}
Sector: {row['Sector']}
Industry: {row['Industry']}
ESG Risk Score: {row['Total_ESG_Risk_Score']}
Controversy Level: {row['Controversy_Level']}
Risk Management: {row.get('esg_risk_management', 'Unknown')}

Give a short ESG insight:"""


In [11]:
# Step 6: Define the generator function
def generate_gpt_insight(row):
    prompt = generate_prompt(row)
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=200)

    output = model.generate(
        input_ids,
        max_length=60,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
    )

    generated = tokenizer.decode(output[0], skip_special_tokens=True)
    generated = generated.split("Give a short ESG insight:")[-1].strip().split("\n")[0].rstrip('.')
    return generated + "."


In [13]:
# Step 7: Apply to all rows with progress bar
tqdm.pandas()
df['AI_ESG_Insight'] = df.progress_apply(generate_gpt_insight, axis=1)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|▏                                                                               | 2/903 [00:10<1:21:22,  5.42s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|▎                                                                               | 3/903 [00:15<1:17:11,  5.15s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|▎   

In [17]:
# Step 8: Save updated dataset
df.to_csv("final_esg_dataset_labeled_with_ai_insightsgpt2medium.csv", index=False)
print("✅ AI ESG insights added and saved to 'final_esg_dataset_labeled_with_ai_insightsgpt2medium.csv'")


✅ AI ESG insights added and saved to 'final_esg_dataset_labeled_with_ai_insightsgpt2medium.csv'
