In [2]:
%pip install rapidfuzz
%pip install pandas

import pandas as pd
from rapidfuzz import process, fuzz
import re

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load both inputs in one pass: the company reference table first, then the
# breach articles that already carry an extracted company name.
companies_data, articles_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/companies.csv",
        "../data/articles_about_breaches_with_company_name.csv",
    )
)

## Exact string match

In [2]:
# Case-insensitive exact join between company names and the company named in each article.
#
# The lowercase join keys are built with `assign`, which returns a new frame, so the helper
# columns are never written onto `companies_data` / `articles_data` themselves. Those frames
# are reused by later cells, and helper columns left on them would end up in later outputs.
#
# Rows with a missing key are removed before merging: pandas matches null keys with each
# other, which would pair every unnamed company with every article that mentions no company.
companies_keyed = (
    companies_data
    .assign(name_lower=companies_data['name'].str.lower())
    .dropna(subset=['name_lower'])
)
articles_keyed = (
    articles_data
    .assign(CompanyMentioned_lower=articles_data['CompanyMentioned'].str.lower())
    .dropna(subset=['CompanyMentioned_lower'])
)

# Perform the join on the lowercase columns
result_df = pd.merge(companies_keyed, articles_keyed, left_on='name_lower', right_on='CompanyMentioned_lower', how='inner')

# Drop the helper lowercase columns so only the original data is written out
result_df = result_df.drop(columns=['name_lower', 'CompanyMentioned_lower'])

result_df.to_csv("../data/joined_articles_companies.csv", index=False)

## Fuzzy matching

The problem with this is that it matches articles to multiple companies when they should only be matched to one. My best idea right now is to go through them by hand to determine which company each article should be matched to, but I am hesitant to do that yet because we may rerun the classifier, so I would rather only do the manual work on the data we are going to use for sure.

In [None]:
# Work on independent copies. A plain assignment would only create a second name for the
# same DataFrame, so the helper columns added below would also appear on the original
# `companies_data` / `articles_data` frames.
companies_data_aux = companies_data.copy()
articles_data_aux = articles_data.copy()

def preprocess(text):
    """Normalize a name for fuzzy comparison.

    Removes every character that is neither a word character nor whitespace
    and lower-cases the result. Whitespace is left as it is.

    Args:
        text: The raw cell value. Missing values (None, NaN, pd.NA) are accepted,
            and so are non-string scalars such as numbers.

    Returns:
        The normalized string, or "" when the value is missing.
    """
    if pd.isna(text):
        return ""
    # A CSV column can contain non-string cells (for example an all-digit name parsed
    # as a number); coerce to str so re.sub does not raise TypeError on them.
    return re.sub(r'[^\w\s]', '', str(text)).lower()

# Preprocess your data
companies_data_aux['name_processed'] = companies_data_aux['name'].apply(preprocess)
articles_data_aux['CompanyMentioned_processed'] = articles_data_aux['CompanyMentioned'].apply(preprocess)

# Extract processed names for matching
company_names = companies_data_aux['name_processed'].tolist()
article_names = articles_data_aux['CompanyMentioned_processed'].tolist()

# Threshold for considering a match (adjust as needed)
threshold = 90

# Run the expensive search once per distinct article name instead of once per article row:
# the result depends only on the processed name, so repeated names can share it.
# Blank names (articles with no company mentioned) are skipped entirely; at this threshold
# they could only pair with companies whose processed name is also blank, which is never a
# meaningful match.
# Each result entry is a (choice, score, index) tuple; at most `limit` entries are returned.
matches_by_name = {
    name: process.extract(name, company_names, scorer=fuzz.token_sort_ratio, limit=100, score_cutoff=threshold)
    for name in set(article_names)
    if name.strip()
}

# Build one output row per (article, matching company) pair
matches = []
for article_idx, article_name in enumerate(article_names):
    result = matches_by_name.get(article_name)
    if not result:
        continue
    article_row = articles_data_aux.iloc[article_idx].to_dict()
    for _, score, company_index in result:
        # On a column-name clash the article's value wins (right-hand side of the dict union)
        matched_row = companies_data_aux.iloc[company_index].to_dict() | article_row
        matched_row['MatchScore'] = score
        matches.append(matched_row)

# Convert the list of matched rows to a DataFrame
result_df = pd.DataFrame(matches)

# Drop helper columns to keep only original data. The lowercase keys are listed as well
# because they may be present on the source frames if the exact-match cell was run first.
# errors='ignore' keeps this working when a column is absent (including when nothing matched).
helper_columns = ['name_processed', 'CompanyMentioned_processed', 'name_lower', 'CompanyMentioned_lower']
result_df = result_df.drop(columns=helper_columns, errors='ignore')

# Sort by 'name' (from companies_data); skipped when there were no matches at all,
# because the frame then has no 'name' column to sort on
if 'name' in result_df.columns:
    result_df = result_df.sort_values(by='name').drop_duplicates().reset_index(drop=True)

result_df.to_csv("../data/joined_articles_companies_fuzzy.csv", index=False)

# the original version, which searched once per article row, took 293 minutes to run

As a temporary fix for the duplicate-article problem above, I am going to remove duplicates from the results by article (headline and URL), because each article should only be matched with one company. Note that the company that remains will not necessarily be the correct one.

In [3]:
fuzzy_matched = pd.read_csv("../data/joined_articles_companies_fuzzy.csv")

# Keep exactly one row per article, identified by Headline and URL.
# Rows are ordered by MatchScore (highest first) before de-duplicating so that the row kept
# for each article is its best-scoring company rather than whichever row comes first in the
# file. The stable sort keeps the file order among equal scores, and sort_index() afterwards
# restores the original row order for the output.
fuzzy_matched = (
    fuzzy_matched
    .sort_values(by='MatchScore', ascending=False, kind='stable')
    .drop_duplicates(subset=['Headline', 'URL'])
    .sort_index()
)

fuzzy_matched.to_csv("../data/joined_articles_companies_fuzzy_no_duplicates.csv", index=False)

## Articles not matched

In [3]:
# Inputs: every classified article, and the de-duplicated fuzzy-match results.
classified_articles = pd.read_csv("../data/articles_about_breaches_with_company_name.csv")
fuzzy_matched = pd.read_csv("../data/joined_articles_companies_fuzzy_no_duplicates.csv")

# Company mentions that appear in the fuzzy-match results
matched_company_names = fuzzy_matched['CompanyMentioned'].tolist()

# Flag each article whose "CompanyMentioned" value appears among the matched mentions,
# then keep the complement: the articles whose mention was not matched to any company
# from the company dataset during fuzzy matching.
mention_was_matched = classified_articles['CompanyMentioned'].isin(matched_company_names)
not_matched = classified_articles.loc[~mention_was_matched]

not_matched.to_csv("../data/classified_articles_not_matched_to_companies_dataset.csv", index=False)