In [7]:
# Colab-only setup: mount Google Drive at /content/drive so the paths used
# further down in this cell resolve.
from google.colab import drive
drive.mount('/content/drive')

# Install pandas and openpyxl into the runtime.
# NOTE(review): openpyxl is presumably the engine pandas uses for .xlsx files;
# versions are not pinned here - confirm and pin for reproducibility.
!pip install pandas openpyxl

import pandas as pd
import os
import glob

def extract_environmental_scores(file_path, year_row=5):
    """Collect the "Environmental Pillar Score" rows from every sheet of a workbook.

    Each sheet is read without a header. Rows whose second column (label 1)
    matches one of the target score labels are located, and the cells at
    positions 3, 5, 7, ... of such a row are paired with the cells at positions
    2, 4, 6, ... of the year header row.

    Parameters
    ----------
    file_path : str
        Path to an ``.xlsx`` workbook. The company name is taken from the file
        name: the text after the last ``" for "`` and before ``".xlsx"``.
    year_row : int, default 5
        Zero-based position of the row that holds the year labels. The default
        is the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``Sheet``, ``Environmental_Score_Value``,
        ``Weight_Category`` and ``Company``. Rows whose score cannot be parsed
        as a number are dropped. When nothing matches, an empty frame with
        these columns is returned instead of raising.
    """
    output_columns = [
        'Year', 'Sheet', 'Environmental_Score_Value', 'Weight_Category', 'Company'
    ]

    company = os.path.basename(file_path).split(" for ")[-1].split(".xlsx")[0]
    all_sheets = pd.read_excel(file_path, sheet_name=None, header=None)

    # Raw strings: "\(" and "\." are regex escapes, not Python string escapes.
    target_patterns = [
        r"Environmental Pillar Score \(Weight 30\.0%\)",
        r"Environmental Pillar Score \(Weight 23\.6%\)",
    ]
    combined_pattern = "|".join(target_patterns)

    frames = []
    for sheet_name, df_sheet in all_sheets.items():
        # Every sheet of the workbook is read, so tolerate sheets that are
        # empty or have no label column at all.
        if 1 not in df_sheet.columns:
            continue

        # Cast to str so a purely numeric label column does not break `.str`.
        labels = df_sheet[1].astype(str)
        target_rows = df_sheet[labels.str.contains(combined_pattern, na=False, regex=True)]
        if target_rows.empty or len(df_sheet) <= year_row:
            continue

        years = df_sheet.iloc[year_row, 2::2].values
        for _, target_row in target_rows.iterrows():
            values = target_row.iloc[3::2].values

            # With an odd column count the year slice is one longer than the
            # value slice; the trailing year has no score cell, so drop it.
            pair_count = min(len(years), len(values))
            if pair_count == 0:
                continue

            # "... (Weight 30.0%)" -> "Weight 30.0%"
            weight_category = str(target_row.loc[1]).split("(")[-1].split(")")[0].strip()

            frames.append(pd.DataFrame({
                'Year': years[:pair_count],
                'Sheet': sheet_name,
                'Environmental_Score_Value': values[:pair_count],
                'Weight_Category': weight_category,
                'Company': company,
            }))

    if not frames:
        return pd.DataFrame(columns=output_columns)

    # Concatenate once instead of growing a frame inside the loop.
    all_data = pd.concat(frames, ignore_index=True)
    all_data['Environmental_Score_Value'] = pd.to_numeric(
        all_data['Environmental_Score_Value'], errors='coerce'
    )
    return all_data.dropna(subset=['Environmental_Score_Value'])


# Locate every per-company workbook in the raw-data folder on the mounted Drive.
folder_path = '/content/drive/MyDrive/data_clean_ESG_python/The_raw_data/'
file_pattern = os.path.join(folder_path, 'ESG Table for *.xlsx')
# glob returns matches in arbitrary order; sort so the log below is reproducible.
file_paths = sorted(glob.glob(file_pattern))

# Fail early with a clear message: sorting an empty, column-less frame by
# 'Company'/'Year' would otherwise raise an opaque KeyError.
if not file_paths:
    raise FileNotFoundError(f"No workbooks match {file_pattern!r}")

# Extract each workbook, then concatenate once rather than growing a frame
# inside the loop.
per_file_frames = []
for file_path in file_paths:
    print(f"Processing: {file_path}")
    df = extract_environmental_scores(file_path)
    per_file_frames.append(df)

combined_data = pd.concat(per_file_frames, ignore_index=True)
combined_data = combined_data.sort_values(by=['Company', 'Year'])


# Write the combined long-format table back to Drive.
output_path = '/content/drive/MyDrive/Combined_Environmental_Scores.xlsx'
combined_data.to_excel(output_path, index=False)
print(f"Combined environmental scores saved to {output_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processing: /content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for WOW.AX.xlsx
Processing: /content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for AD.AS(Ahold Delhaize).xlsx
Processing: /content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for SBRY(sainsbury).L.xlsx
Processing: /content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for COL.AX(coles).xlsx
Processing: /content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for COST.O.xlsx
Processing: /content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for L.TO(loblaw.xlsx
Processing: /content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for TSCO(Tesco).L.xlsx
Processing: /content/drive/MyDrive/data_clean_ESG_python/The_raw_data/ESG Table for CARR.PA.xlsx
Processing: /content/drive/MyDrive/data_clean_ESG_python/

In [14]:
import pandas as pd
from google.colab import drive


# Mount Google Drive so the files under /content/drive are reachable.
drive.mount('/content/drive')


# NOTE(review): this reads a CSV whose name looks like a spreadsheet export of
# Combined_Environmental_Scores.xlsx; no code visible here creates it, so it is
# presumably a manual step - confirm, or read the .xlsx directly.
file_path = '/content/drive/MyDrive/Combined_Environmental_Scores.xlsx - Sheet1.csv'
df = pd.read_csv(file_path)


print("原始列名:", df.columns)


# Reduce 'Year' to a calendar year. pd.to_datetime interprets bare numbers as
# nanoseconds since the epoch, which would turn a numeric year such as 2020
# into 1970, so a numeric column is used as the year directly.
if pd.api.types.is_numeric_dtype(df['Year']):
    df['Year'] = df['Year'].astype('Int64')
else:
    df['Year'] = pd.to_datetime(df['Year']).dt.year


# Keep the first row per (Year, Company) and shorten the score column name.
# NOTE(review): rows for the same company and year that differ only in Sheet or
# Weight_Category are discarded here - confirm that the first one is the one wanted.
df_cleaned = df.drop_duplicates(subset=['Year', 'Company']).rename(
    columns={'Environmental_Score_Value': 'E_Score_Value'}
)


print("cleaned:", df_cleaned.columns)


# Reshape to wide form: one row per year, one column per company.
pivot_table = df_cleaned.pivot_table(
    index='Year',
    columns='Company',
    values='E_Score_Value',
    aggfunc='first'
)


# Turn 'Year' back into a regular column, order by year, and drop the
# leftover "Company" label from the column axis.
pivot_table = pivot_table.reset_index().sort_values(by='Year')
pivot_table.columns.name = None

output_path = '/content/drive/MyDrive/E_Score_Yearly_Table.xlsx'
pivot_table.to_excel(output_path, index=False)
print(f"saved: {output_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
原始列名: Index(['Year', 'Sheet', 'Environmental_Score_Value', 'Weight_Category',
       'Company'],
      dtype='object')
清理后列名: Index(['Year', 'Sheet', 'E_Score_Value', 'Weight_Category', 'Company'], dtype='object')
结果已保存至: /content/drive/MyDrive/E_Score_Yearly_Table.xlsx
