In [6]:
!pip install --upgrade pyarrow


Defaulting to user installation because normal site-packages is not writeable


You should consider upgrading via the 'C:\Program Files\Python39\python.exe -m pip install --upgrade pip' command.


In [2]:
import sys, pyarrow, pandas
print("Python:", sys.executable)
print("pyarrow:", pyarrow.__version__)
print("pandas:", pandas.__version__)


Python: c:\Program Files\Python39\python.exe
pyarrow: 21.0.0
pandas: 2.2.3


In [3]:
import pandas as pd
import numpy as np
from emm import PandasEntityMatching

# Load datasets
maryland_df = pd.read_excel("G:/CLIMATE RISK & ESG/PROGETTI/Reputation_risk/REPUTATION/CODE/firms_maryland_2014_2025.xlsx")
names_df = pd.read_excel("G:/CLIMATE RISK & ESG/PROGETTI/Reputation_risk/REPUTATION/CODE/firms_for_matching_50k.xlsx")

# Ensure 'Name' columns are strings to avoid any non-string errors
maryland_df['Name'] = maryland_df['Name'].astype(str)
names_df['Name'] = names_df['Name'].astype(str)

# Add an 'Index' column if it does not exist (unique identifier for each row)
maryland_df['Index'] = maryland_df.index
names_df['Index'] = names_df.index

# Define indexers for candidate generation
indexers = [
    {
        'type': 'cosine_similarity',
        'tokenizer': 'characters',
        'ngram': 2,
        'num_candidates': 5,
        'cos_sim_lower_bound': 0.2,
    },
    {'type': 'sni', 'window_length': 3}  # SNI with a window size of 3
]

# Define entity matching parameters
em_params = {
    'name_only': True,
    'entity_id_col': 'Index',    # Unique identifier column
    'name_col': 'Name',          # Column containing names
    'indexers': indexers,
    'supervised_on': False,
    'with_legal_entity_forms_match': True,
}

# Initialize entity matcher
p = PandasEntityMatching(em_params)

# Fit model using names_df as ground truth (reference dataset)
p.fit(names_df)

# Optional: Train classifier for more accurate matching (not mandatory)
p.fit_classifier(names_df, create_negative_sample_fraction=0.5)

# Progress counter setup
total_rows = len(maryland_df)
progress_threshold = 1  # Threshold to update progress only every 1%
next_progress = progress_threshold  # Start with 1%

# Transform maryland_df to generate candidate matches, scoring each name-pair candidate
candidates_scored_pd = p.transform(maryland_df)

# For each name in maryland_df, find the best match in names_df based on score
best_candidates = candidates_scored_pd[candidates_scored_pd.best_match]

# Iterate through maryland_df to simulate progress
for i, row in enumerate(maryland_df.itertuples(), 1):
    # Check progress percentage
    progress = (i / total_rows) * 100
    if progress >= next_progress:
        print(f"{int(progress)}% names checked")
        next_progress += progress_threshold  # Update for next threshold

# Merge matched results with original columns from both dataframes for contextual information
final_results = best_candidates.merge(
    maryland_df, left_on='uid', right_on='Index', suffixes=('_matched', '_noisy')
).merge(
    names_df, left_on='gt_uid', right_on='Index', suffixes=('', '_reference')
)

# Select specific columns for the final output (customize as needed)
output_columns = [
    'uid',
    'name',
    'gt_uid',
    'gt_name',        # Name matched in names_df
    'nm_score',          # Matching score
    'rank_1',        # Original name from maryland_df (noisy)
    'score_0',
    'score_1',    # Original name from names_df (reference)
    'ISIN Code'          # ISIN or other relevant columns from names_df
]
final_results = final_results[output_columns]

# Save final results to Excel
final_results.to_excel("G:/CLIMATE RISK & ESG/PROGETTI/Reputation_risk/REPUTATION/CODE/matched_names_results_emm_num_candidates_2014_2025.xlsx", index=False)

# Optional: Inspect first few results
print(final_results.head())


1% names checked
2% names checked
3% names checked
4% names checked
5% names checked
6% names checked
7% names checked
8% names checked
9% names checked
10% names checked
11% names checked
12% names checked
13% names checked
14% names checked
15% names checked
16% names checked
17% names checked
18% names checked
19% names checked
20% names checked
21% names checked
22% names checked
23% names checked
24% names checked
25% names checked
26% names checked
27% names checked
28% names checked
29% names checked
30% names checked
31% names checked
32% names checked
33% names checked
34% names checked
35% names checked
36% names checked
37% names checked
38% names checked
39% names checked
40% names checked
41% names checked
42% names checked
43% names checked
44% names checked
45% names checked
46% names checked
47% names checked
48% names checked
49% names checked
50% names checked
51% names checked
52% names checked
53% names checked
54% names checked
55% names checked
56% names checked
5