In [7]:
from google.colab import drive
drive.mount('/content/drive')

!pip install pandas openpyxl

import pandas as pd
import os
import glob


# Company identifiers to keep. extract_metrics_with_scores compares these with
# the part of the workbook file name that follows " for " (minus ".xlsx").
TARGET_COMPANIES = [
    "CARR.PA",
    "WMT",
    "COST.O",
]

# Regular-expression patterns searched for (substring match) in column 1 of
# every sheet; each matching row is treated as one metric to extract.
TARGET_METRICS = [
    r"Toxic Chemicals Reduction",
    r"Total Renewable Energy To Energy Use in million",
    r"Biodiversity Impact Reduction",
    r"Climate Change Commercial Risks Opportunities",
    r"e-Waste Reduction",
]


def _parse_year(cell):
    """Return the calendar year held in a header cell, or None if it has none."""
    if pd.isna(cell):
        return None
    # A bare number such as 2023 must not go through pd.to_datetime: numbers
    # are interpreted there as nanoseconds since the Unix epoch, so every
    # numeric year header would collapse to 1970.
    try:
        number = float(cell)
    except (TypeError, ValueError):
        number = None
    if number is not None and number.is_integer() and 1900 <= number <= 2100:
        return int(number)
    try:
        parsed = pd.to_datetime(cell)
    except (TypeError, ValueError, OverflowError):
        return None
    return None if pd.isna(parsed) else parsed.year


def _latest_score(years, scores):
    """Return (year, score) for the most recent year that has a numeric score, else None."""
    latest = None
    for year, score in zip(years, scores):
        if not year:
            continue
        try:
            num_score = float(score)
        except (TypeError, ValueError):
            # Non-numeric cell (text, None, ...): not a score.
            continue
        if pd.isna(num_score):
            # Empty cells arrive as NaN, which float() accepts; skip them so a
            # blank in the newest year is not reported as the latest score.
            continue
        # Strict ">" keeps the first score seen when a year repeats.
        if latest is None or year > latest[0]:
            latest = (year, num_score)
    return latest


def extract_metrics_with_scores(file_path):
    """Extract the most recent numeric value of each target metric from one workbook.

    The company identifier is taken from the file name: the text after the
    last " for " and before ".xlsx". Files whose identifier is not in
    TARGET_COMPANIES are skipped without being opened.

    Every sheet is read without a header row. Row index 5, columns 2, 4, 6, ...
    are parsed as years. Rows whose column 1 matches a pattern in
    TARGET_METRICS supply the values, which are paired positionally with
    those years.

    Args:
        file_path: Path to an .xlsx workbook.

    Returns:
        pandas.DataFrame with columns 'Metric', 'Company', 'Value', 'Year',
        one row per matching sheet row that has at least one usable
        (year, numeric value) pair. An empty DataFrame (no columns) is
        returned when the company is not targeted or nothing was found.
    """
    company = os.path.basename(file_path).split(" for ")[-1].split(".xlsx")[0]
    if company not in TARGET_COMPANIES:
        return pd.DataFrame()

    all_sheets = pd.read_excel(file_path, sheet_name=None, header=None)
    # Collect plain records and build the frame once; concatenating inside the
    # loop copies the accumulated frame on every append.
    records = []

    for df_sheet in all_sheets.values():
        # Sheets without the year row (index 5) or the label column (index 1)
        # cannot contain the expected table; skip them instead of raising.
        if df_sheet.shape[0] <= 5 or df_sheet.shape[1] < 2:
            continue

        years = [_parse_year(cell) for cell in df_sheet.iloc[5, 2::2].values]
        # Cast to str so the .str accessor also works when the column holds
        # only numbers/NaN (non-text cells simply never match a pattern).
        labels = df_sheet[1].astype(str)

        for metric in TARGET_METRICS:
            target_rows = df_sheet[labels.str.contains(metric, na=False, regex=True)]
            for _, row in target_rows.iterrows():
                if "Total Renewable Energy To Energy Use" in metric:
                    # NOTE(review): years come from columns 2, 4, 6, ... but
                    # these values start at column 4, so the first year is
                    # paired with the cell two columns to its right. Confirm
                    # this offset against the workbook layout.
                    scores = row.iloc[4::2].values
                else:
                    scores = row.iloc[3::2].values

                latest = _latest_score(years, scores)
                if latest is not None:
                    latest_year, latest_value = latest
                    records.append({
                        'Metric': metric,
                        'Company': company,
                        'Value': latest_value,
                        'Year': latest_year,
                    })

    if not records:
        return pd.DataFrame()
    return pd.DataFrame(records, columns=['Metric', 'Company', 'Value', 'Year'])


# Locate the workbooks of the targeted companies on the mounted Drive.
folder_path = '/content/drive/MyDrive/data_clean_ESG_python/The_raw_data/'
file_pattern = os.path.join(folder_path, 'ESG Table for *.xlsx')
# sorted(): glob returns matches in arbitrary order; sorting makes the
# processing order (and the printed log) reproducible between runs.
file_paths = sorted(
    f for f in glob.glob(file_pattern)
    if any(comp in os.path.basename(f) for comp in TARGET_COMPANIES)
)

print("connected files:", file_paths)

# Gather the per-file results and concatenate once, skipping empty frames.
extracted_frames = []
for file_path in file_paths:
    print(f"Processing: {file_path}")
    df = extract_metrics_with_scores(file_path)
    if not df.empty:
        extracted_frames.append(df)

if extracted_frames:
    combined_data = pd.concat(extracted_frames, ignore_index=True)
else:
    # Keep the expected columns so the 'Company' lookup below does not raise
    # KeyError when no file matched or no metric was found.
    combined_data = pd.DataFrame(columns=['Metric', 'Company', 'Value', 'Year'])

print("\n_cleaned_companies:", combined_data['Company'].unique())

if not combined_data.empty:
    # One row per metric, one column per company; 'first' keeps the first
    # value encountered when a metric matched several rows for a company.
    pivot_table = combined_data.pivot_table(
        index='Metric',
        columns='Company',
        values='Value',
        aggfunc='first'
    ).reset_index()
    # Drop the leftover "Company" label on the column axis.
    pivot_table.columns.name = None
    output_path = '/content/drive/MyDrive/Sustainability_Metrics_Scores.xlsx'
    pivot_table.to_excel(output_path, index=False)
    print("finished_saved:", output_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
匹配到的文件列表: ['/content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for COST.O.xlsx', '/content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for CARR.PA.xlsx', '/content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for WMT.xlsx']
Processing: /content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for COST.O.xlsx
Processing: /content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for CARR.PA.xlsx
Processing: /content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for WMT.xlsx

已处理公司列表: ['COST.O' 'CARR.PA' 'WMT']
处理完成！结果已保存至: /content/drive/MyDrive/Sustainability_Metrics_Scores.xlsx
