In [34]:
# Install fuzzywuzzy if not already installed
%pip install fuzzywuzzy

from fuzzywuzzy import fuzz
import pandas as pd

def map_esg_features_fuzzy(combined_path, esg_excel_path, threshold=85,
                           header_rows=10, max_columns=5):
    """Map columns of the combined dataset to the ESG sheet they best match.

    Every sheet of the ESG workbook is scanned for text labels. Each label is
    compared (case-insensitively, after stripping surrounding whitespace)
    with every column name of the combined CSV using ``fuzz.ratio``. A column
    is assigned to the sheet that produced its highest-scoring label, provided
    that score reaches ``threshold``.

    Parameters
    ----------
    combined_path : str or path-like
        CSV file whose header row holds the column names to categorise.
    esg_excel_path : str or path-like
        Excel workbook; each sheet name is used as a category label.
    threshold : int, default 85
        Minimum ``fuzz.ratio`` score (0-100) for a label/column pair to count
        as a match.
    header_rows : int, default 10
        Number of leading rows of each parsed sheet to skip before reading
        labels (presumably report preamble; confirm against the workbook).
    max_columns : int, default 5
        Only the first ``max_columns`` columns of each sheet are scanned.

    Returns
    -------
    dict
        ``{combined_column_name: sheet_name}`` for every matched column.
        Columns with no label at or above ``threshold`` are omitted.

    Notes
    -----
    When a column clears the threshold in several sheets, the sheet with the
    highest score wins; on an exact tie the sheet visited last wins.
    """
    # Only the header is needed, so do not parse any data rows.
    header = pd.read_csv(combined_path, nrows=0).columns
    # A list of pairs (not a dict) so that columns differing only by case
    # are all kept instead of silently collapsing into one entry.
    combined_columns = [(col.lower(), col) for col in header]

    # sheet_name=None loads every sheet as {sheet_name: DataFrame}.
    esg_sheets = pd.read_excel(esg_excel_path, sheet_name=None)

    # original column name -> (best score so far, sheet that produced it)
    best_match = {}

    for sheet_name, df in esg_sheets.items():
        # Gather each distinct label once, in first-seen order, so repeated
        # labels are not re-scored against every column.
        labels = {}
        for col_index in range(min(max_columns, df.shape[1])):
            cells = df.iloc[header_rows:, col_index].dropna().astype(str)
            for cell in cells:
                labels.setdefault(cell.strip().lower(), None)

        for label in labels:
            for col_lower, original_col in combined_columns:
                score = fuzz.ratio(label, col_lower)
                if score < threshold:
                    continue
                previous = best_match.get(original_col)
                # ">=" keeps the later sheet on an exact tie.
                if previous is None or score >= previous[0]:
                    best_match[original_col] = (score, sheet_name)

    return {col: sheet for col, (_, sheet) in best_match.items()}


Note: you may need to restart the kernel to use updated packages.


In [35]:
# NOTE(review): `fuzz` is not used in this cell; it is already imported in the
# cell that defines map_esg_features_fuzzy, so this import is redundant here.
from fuzzywuzzy import fuzz
# Inputs: the combined dataset (CSV) and one ESG workbook (Excel).
# NOTE(review): these are absolute, machine-specific paths, so the cell will
# not run elsewhere without editing them.
combined_csv_path = "/Users/tobifadeyi/Documents/Thesis code/combined_data_with_unique_years.csv"
esg_excel_path = '/Users/tobifadeyi/Downloads/ESG/ESG TABLES/ESG Table for AAF.L.xlsx'

# Build {combined column name: ESG sheet name} using the function's default
# threshold, then print the raw dict.
feature_mapping = map_esg_features_fuzzy(combined_csv_path, esg_excel_path)
print(feature_mapping)


{'ESG Combined Score': 'Controversies', 'ESG Combined Score\n.1': 'Controversies', 'ESG Combined Score Grade': 'Controversies', 'ESG Controversies Score': 'Controversies', 'Resource Reduction Policy Score': 'Environment', 'Policy Water Efficiency Score': 'Environment', 'Policy Energy Efficiency Score': 'Environment', 'Policy Sustainable Packaging Score': 'Environment', 'Policy Environmental Supply Chain Score': 'Environment', 'Resource Reduction Targets Score': 'Environment', 'Targets Water Efficiency Score': 'Environment', 'Targets Energy Efficiency Score': 'Environment', 'Environment Management Team Score': 'Environment', 'Environmental Materials Sourcing Score': 'Environment', 'Toxic Chemicals Reduction Score': 'Environment', 'Renewable Energy Use Ratio Score': 'Environment', 'Renewable Energy Supply Score': 'Environment', 'Total Renewable Energy To Energy Use in million Score': 'Environment', 'Cement Energy Use Score': 'Environment', 'Environmental Supply Chain Management Score': '

In [36]:
# Echo the mapping as the cell's last expression so the notebook displays it.
feature_mapping

{'ESG Combined Score': 'Controversies',
 'ESG Combined Score\n.1': 'Controversies',
 'ESG Combined Score Grade': 'Controversies',
 'ESG Controversies Score': 'Controversies',
 'Resource Reduction Policy Score': 'Environment',
 'Policy Water Efficiency Score': 'Environment',
 'Policy Energy Efficiency Score': 'Environment',
 'Policy Sustainable Packaging Score': 'Environment',
 'Policy Environmental Supply Chain Score': 'Environment',
 'Resource Reduction Targets Score': 'Environment',
 'Targets Water Efficiency Score': 'Environment',
 'Targets Energy Efficiency Score': 'Environment',
 'Environment Management Team Score': 'Environment',
 'Environmental Materials Sourcing Score': 'Environment',
 'Toxic Chemicals Reduction Score': 'Environment',
 'Renewable Energy Use Ratio Score': 'Environment',
 'Renewable Energy Supply Score': 'Environment',
 'Total Renewable Energy To Energy Use in million Score': 'Environment',
 'Cement Energy Use Score': 'Environment',
 'Environmental Supply Chain M

In [37]:
# Columns of the combined dataset that were mapped to the "Environment" sheet.
environment_keys = [
    column
    for column, category in feature_mapping.items()
    if category == "Environment"
]


In [38]:
# Number of entries in environment_keys.
len(environment_keys)

61

In [39]:
# Columns mapped to the "Controversies" sheet, followed by how many there are.
controversies_keys = [
    column
    for column, category in feature_mapping.items()
    if category == "Controversies"
]
len(controversies_keys)

14

In [40]:
# Display the full list of entries in controversies_keys.
controversies_keys

['ESG Combined Score',
 'ESG Combined Score\n.1',
 'ESG Combined Score Grade',
 'ESG Controversies Score',
 'Environmental Controversies Score',
 'Wages Working Condition Controversies Score',
 'Anti-competition Controversies Score',
 'Bribery, Corruption and Fraud Controversies Score',
 'Consumer Complaints Controversies Score',
 'Product Quality Controversies Score',
 'Responsible Marketing Controversies Score',
 'Executive Compensation Controversies Score',
 'Insider Dealings Controversies Score',
 'Accounting Controversies Score']

In [41]:
# Columns mapped to the "Governance" sheet, followed by how many there are.
governance_keys = [
    column
    for column, category in feature_mapping.items()
    if category == "Governance"
]
len(governance_keys)

57

In [42]:
# Columns mapped to the "Social" sheet, followed by how many there are.
social_keys = [
    column
    for column, category in feature_mapping.items()
    if category == "Social"
]
len(social_keys)

58

In [43]:
# Display the full list of entries in social_keys.
social_keys

['Health & Safety Policy Score',
 'Training and Development Policy Score',
 'Policy Diversity and Opportunity Score',
 'Targets Diversity and Opportunity Score',
 'Employees Health & Safety Team Score',
 'Supply Chain Health & Safety Improvements Score',
 'Employees Health & Safety OHSAS 18001 Score',
 'Employee Satisfaction Score',
 'Net Employment Creation Score',
 'Trade Union Representation Score',
 'Turnover of Employees Score',
 'Announced Layoffs To Total Employees Score',
 'Gender Pay Gap Percentage Score',
 'HRC Corporate Equality Index Score',
 'Flexible Working Hours Score',
 'Day Care Services Score',
 'Employees With Disabilities Score',
 'Employee Health & Safety Training Hours Score',
 'Injuries To Million Hours Score',
 'Occupational Diseases Score',
 'Average Training Hours Score',
 'Training Costs Per Employee Score',
 'Internal Promotion Score',
 'Supplier ESG training Score',
 'Employee Resource Groups Score',
 'Human Rights Policy Score',
 'Policy Freedom of Associ

In [44]:
# Print each per-category key list on its own line, prefixed with its
# variable name.
print('social_keys:', social_keys)
print('controversies_keys:', controversies_keys)
print('governance_keys:', governance_keys)
print('environment_keys:', environment_keys)

social_keys: ['Health & Safety Policy Score', 'Training and Development Policy Score', 'Policy Diversity and Opportunity Score', 'Targets Diversity and Opportunity Score', 'Employees Health & Safety Team Score', 'Supply Chain Health & Safety Improvements Score', 'Employees Health & Safety OHSAS 18001 Score', 'Employee Satisfaction Score', 'Net Employment Creation Score', 'Trade Union Representation Score', 'Turnover of Employees Score', 'Announced Layoffs To Total Employees Score', 'Gender Pay Gap Percentage Score', 'HRC Corporate Equality Index Score', 'Flexible Working Hours Score', 'Day Care Services Score', 'Employees With Disabilities Score', 'Employee Health & Safety Training Hours Score', 'Injuries To Million Hours Score', 'Occupational Diseases Score', 'Average Training Hours Score', 'Training Costs Per Employee Score', 'Internal Promotion Score', 'Supplier ESG training Score', 'Employee Resource Groups Score', 'Human Rights Policy Score', 'Policy Freedom of Association Score',