We break down error further, by industry and size

We expect that not every industry will actually have hits within the test set. This limits how far the error analysis can go, but we will still be able to see the distribution of errors across industries.

In [13]:
import pandas as pd
import seaborn as sns
import numpy as np

from matplotlib import pyplot as plt
from rapidfuzz import process, fuzz

import re

In [31]:
# Load the model output and the human-coded labels for the same test set
ml_output = pd.read_csv("../../classify/irr/test_set_gpt/decilm.csv")
true_data = pd.read_csv("../../classify/irr/test_set_gpt/caitlyn.csv")

# Inner-join on the article key; label columns shared by both files get the
# `_ml` / `_true` suffixes. The key plus URL then becomes the row index.
merged_data = pd.merge(
    ml_output,
    true_data,
    on=['Date', 'Publication', 'Headline'],
    how="inner",
    suffixes=('_ml', '_true'),
).set_index(['Date', 'Publication', 'Headline', 'URL'])

# Every one of the 200 test articles must appear exactly once on both sides
assert {len(merged_data), len(ml_output), len(true_data)} == {200}

# Calculate where the ML model was correct
def to_bool(string:str) -> bool:
    """Interpret a label cell as a boolean.

    The value is stringified, stripped and case-folded. 'true'/'yes'/'1' map to
    True; 'nan'/'false'/'no'/'0' map to False. Anything else is True when it is
    a non-empty string (e.g. a company name) and False when it is empty.
    """
    token = str(string).strip().casefold()

    if token in {'true', 'yes', '1'}:
        return True
    if token in {'nan', 'false', 'no', '0'}:
        return False

    # Free text (e.g. a company name) counts as True; an empty cell as False
    return token != ''

# Flag, per article, whether the model and the human coder agree.
# Agreement is on truthiness only: to_bool maps any non-empty free text to True,
# so two different company names (e.g. "Government" vs "DNC") still count as correct.
for label in ('Breach', 'Company'):
    ml_flags = merged_data[f'{label}Mentioned_ml'].apply(to_bool)
    true_flags = merged_data[f'{label}Mentioned_true'].apply(to_bool)
    merged_data[f'correct_{label.lower()}'] = ml_flags == true_flags

merged_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,BreachMentioned_ml,CompanyMentioned_ml,BreachMentioned_true,CompanyMentioned_true,correct_breach,correct_company
Date,Publication,Headline,URL,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20210405,The Guardian,"India National record 103,558 new Covid cases in 24 hours",https://www.theguardian.com/world/2021/apr/05/india-reports-national-record-100000-new-covid-cases-in-24-hours,False,False,,,True,True
20130601,Washington Post,What the media forget about leaks,http://www.washingtonpost.com/opinions/journalists-trawling-for-leaks-should-be-willing-to-share-the-risks/2013/05/31/af8fb48a-c965-11e2-8da7-d274bc611a47_story.html,False,False,,,True,True
20080826,New York Times,Bits: AT&T Launches New Data Plans for iPhone Users Roaming Abroad,http://bits.blogs.nytimes.com/2008/08/26/att-launches-new-data-plans-for-iphone-users-roaming-abroad/index.html,False,AT&T,,AT&T,True,True
20140316,New York Times,Pilot Spoke to Air Controllers After Shutdown of Data System,http://www.nytimes.com/2014/03/17/world/asia/malaysia-airlines-flight.html?hp,False,False,,,True,True
20160330,CNBC,Illinois' epic budget deadlock sets dubious record,http://www.cnbc.com/2016/03/30/illinois-epic-budget-fail-sets-a-dubious-record.html,False,False,,,True,True
...,...,...,...,...,...,...,...,...,...
20160802,New York Times,3 Top D.N.C. Officials Leave in Wake of Email Breach,http://www.nytimes.com/2016/08/03/us/politics/dnc-email-hack-hillary-clinton-bernie-sanders.html,True,Government,yes,DNC,True,True
20180407,FOX,"7-year-old girl hikes Mount Kilimanjaro in honor of late dad, breaks record",http://www.foxnews.com/great-outdoors/2018/04/07/seven-year-old-girl-hikes-mount-kilimanjaro-in-honor-late-dad-breaks-record.html,False,False,,,True,True
20160817,Washington Post,AT&T’s new data plans trade one overage penalty for another,https://www.washingtonpost.com/news/the-switch/wp/2016/08/17/att-is-doing-away-with-overage-fees-in-these-new-data-plans/,False,AT&T,,AT&T,True,True
20140531,Daily Mail,Jaques Costeau grandson aims to break his famed grandfather's record with 31-day undersea voyage,http://www.dailymail.co.uk/news/article-2644778/Jaques-Costeau-grandson-aims-break-famed-grandfathers-record-31-day-undersea-voyage.html,False,False,,,True,True


In [12]:
# Match to companies, exactly as in process_result/companies.ipynb

companies_data = pd.read_csv("../../data/companies_sorted.csv")

# Keep only rows that have a name, order them case-insensitively by name,
# and renumber the rows from 0.
companies_data = (
    companies_data[companies_data['name'].notna()]
    .sort_values(by='name', key=lambda names: names.str.lower())
    .reset_index(drop=True)
)
companies_data

Unnamed: 0.1,Unnamed: 0,name,domain,year founded,industry,size range,locality,country,linkedin url,current employee estimate,total employee estimate
0,841950,! ausdrucksstark -,,,photography,5001 - 10000,,,linkedin.com/company/-ausdrucksstark--_2,3067,3229
1,3896268,! boost-your-sales !,boost-your-sales.eu,2014.0,education management,1 - 10,"dublin, dublin, ireland",ireland,linkedin.com/company/-boost-your-sales-,5,5
2,6115557,! cb repossessions !,cbrepossessions.com,2008.0,real estate,5001 - 10000,"torrevieja, valencia, spain",spain,linkedin.com/company/-cb-repossessions-,1879,2040
3,962323,! design e comunicação,exclamacaodesign.com,2008.0,marketing and advertising,1 - 10,"lisbon, lisbon, portugal",portugal,linkedin.com/company/-design-e-comunicação,3,3
4,2502661,! en ! en ! trainers,enentrainers.nl,,alternative dispute resolution,501 - 1000,,,linkedin.com/company/-en-en-trainers,222,239
...,...,...,...,...,...,...,...,...,...,...,...
7173418,2232733,💡 @1871chicago,1871.com,2012.0,internet,201 - 500,"chicago, illinois, united states",united states,linkedin.com/company/1871-com,150,308
7173419,1542285,💡 myhappyidea.com,myhappyidea.com,2015.0,internet,11 - 50,,,linkedin.com/company/myhappyidea-com,4,4
7173420,2601979,📲 takcam social media/digital marketing | info...,takcam.com,2011.0,marketing and advertising,1 - 10,"calgary, alberta, canada",canada,linkedin.com/company/takcam,1,1
7173421,4296555,📷 cm2b photography & design,cm2b.com,2008.0,photography,1 - 10,"salem, massachusetts, united states",united states,linkedin.com/company/cm2b-photography-&-design,0,1


In [None]:
# Merge company data to test data EXACTLY as we do for the real data
# Warning: this is very slow (about 40 minutes) -- apologies to whoever has to run this on 15k rows.
    
def preprocess(text):
    """Normalise a name for fuzzy matching.

    Removes every character that is neither a word character (letters, digits,
    underscore) nor whitespace, then lowercases the result. Whitespace is kept
    as-is (not collapsed or stripped).

    Args:
        text: The raw name. Missing values (None / NaN / pd.NA) are accepted.
            Non-string scalars (e.g. an int or bool cell) are converted with
            str() first instead of raising TypeError inside re.sub.

    Returns:
        The normalised string, or "" for a missing value.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'[^\w\s]', '', text).lower()

# Minimum token_sort_ratio score (0-100) for an article/company pair to count as a match
threshold = 90

# Article column whose value is looked up in the company list.
# NOTE(review): nothing visible in this notebook creates `name_processed`, so the
# source column is an assumption (the model-extracted company) -- confirm against
# process_result/companies.ipynb.
# NOTE(review): if that column holds the literal "False" for "no company", it is
# normalised to "false" and matched like any other name -- verify this is intended.
ARTICLE_NAME_COLUMN = 'CompanyMentioned_ml'


def _matching_names(frame, source_column):
    """Return the normalised names of `frame` as a list.

    An existing `name_processed` column is reused as-is; otherwise the names are
    derived by applying `preprocess` to `source_column`, so the cell also works
    on a fresh kernel.
    """
    if 'name_processed' in frame.columns:
        return frame['name_processed'].tolist()
    return frame[source_column].map(preprocess).tolist()


company_names = _matching_names(companies_data, 'name')
article_names = _matching_names(merged_data, ARTICLE_NAME_COLUMN)

# Score every article name against every company name in one batch.
# Scores below `threshold` come back as 0 because of score_cutoff.
# workers=-1 uses all CPU cores; the scores themselves are unchanged.
match_scores = process.cdist(
    article_names,
    company_names,
    scorer=fuzz.token_sort_ratio,
    score_cutoff=threshold,
    workers=-1,
)

# Collect one output row per (article, company) pair at or above the threshold.
# Hits are located with a vectorised scan per article row rather than a Python
# loop over every cell of the score matrix; the resulting row order
# (article-major, then company index) is the same.
matches = []
for article_idx, article_scores in enumerate(match_scores):
    article_row = merged_data.iloc[article_idx].to_dict()
    for company_idx in np.flatnonzero(article_scores >= threshold):
        matched_row = (
            companies_data.iloc[company_idx].to_dict()
            # Record the article-side name that produced the match
            | {"name_processed": article_names[article_idx]}
            | article_row
            | {"MatchScore": article_scores[company_idx]}
        )
        matches.append(matched_row)

result_df = pd.DataFrame(matches)
result_df

Unnamed: 0.1,Unnamed: 0,name,domain,year founded,industry,size range,locality,country,linkedin url,current employee estimate,total employee estimate,name_processed,BreachMentioned_ml,CompanyMentioned_ml,BreachMentioned_true,CompanyMentioned_true,correct_breach,correct_company,MatchScore
0,6503210,a.t.t,aryatandisteb.com,2005.0,medical devices,11 - 50,,,linkedin.com/company/a.t.t,16,18,att,False,AT&T,,AT&T,True,True,100.0
1,3300741,at&t,att.com,1876.0,telecommunications,10001+,"dallas, texas, united states",united states,linkedin.com/company/at&t,115188,269659,att,False,AT&T,,AT&T,True,True,100.0
2,4147727,at+t,ariumconsulting.com,,telecommunications,51 - 200,,,linkedin.com/company/at-t,30,65,att,False,AT&T,,AT&T,True,True,100.0
3,4927581,att,attcomputer.nl,2009.0,information technology and services,1 - 10,"werkendam, noord-brabant, netherlands",netherlands,linkedin.com/company/attcomputer,1,1,att,False,AT&T,,AT&T,True,True,100.0
4,2139015,att,att.eu,2002.0,import and export,10001+,"inowrocław, kujawsko-pomorskie, poland",poland,linkedin.com/company/att,3464,4532,att,False,AT&T,,AT&T,True,True,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,6503210,a.t.t,aryatandisteb.com,2005.0,medical devices,11 - 50,,,linkedin.com/company/a.t.t,16,18,att,False,AT&T,,AT&T,True,True,100.0
220,3300741,at&t,att.com,1876.0,telecommunications,10001+,"dallas, texas, united states",united states,linkedin.com/company/at&t,115188,269659,att,False,AT&T,,AT&T,True,True,100.0
221,4147727,at+t,ariumconsulting.com,,telecommunications,51 - 200,,,linkedin.com/company/at-t,30,65,att,False,AT&T,,AT&T,True,True,100.0
222,4927581,att,attcomputer.nl,2009.0,information technology and services,1 - 10,"werkendam, noord-brabant, netherlands",netherlands,linkedin.com/company/attcomputer,1,1,att,False,AT&T,,AT&T,True,True,100.0


In [77]:
# Calculate F1 and IRR per industry and per size bucket.
# IRR is reported as raw agreement (accuracy_score) between the human label and the model.
# Rendered table: one row per industry, one column per size bucket.
from sklearn.metrics import f1_score, accuracy_score
import warnings  # not needed by this cell any more; kept because later cells use it
from IPython.display import display, Markdown

# Bucket the matched rows into terciles of 'total employee estimate'
result_df['size_range'] = pd.qcut(result_df['total employee estimate'], q=3, labels=['small', 'medium', 'large'])

industries = result_df['industry'].unique()
sizes = ['small', 'medium', 'large']


def format_metrics_cell(sub_df):
    """Render one markdown table cell (including the trailing '|') for `sub_df`.

    Returns "N/A|" for an empty subset without calling the metric functions.
    Otherwise reports F1 with accuracy in parentheses for the company and the
    breach labels, followed by the number of rows.
    """
    if len(sub_df) == 0:
        return "N/A|"

    co_true = sub_df['CompanyMentioned_true'].apply(to_bool)
    co_pred = sub_df['CompanyMentioned_ml'].apply(to_bool)
    br_true = sub_df['BreachMentioned_true'].apply(to_bool)
    br_pred = sub_df['BreachMentioned_ml'].apply(to_bool)

    # zero_division=0 keeps the 0.00 that was previously obtained by silencing
    # all warnings: when a subset has no positives at all, F1 is undefined and
    # is shown as 0.00 even if the accuracy is 1.00.
    f1_co = f1_score(co_true, co_pred, zero_division=0)
    irr_co = accuracy_score(co_true, co_pred)
    f1_br = f1_score(br_true, br_pred, zero_division=0)
    irr_br = accuracy_score(br_true, br_pred)

    return f"Comp:  {f1_co:.2f} ({irr_co:.2f})<br>Breach: {f1_br:.2f} ({irr_br:.2f}) <br>N: {len(sub_df)}|"


markdown = "|Industry|" + "|".join(map(str.title, sizes)) + "|\n"
markdown += "|---|" + "|".join(["---"] * len(sizes)) + "|\n"
for ind in industries:
    # Rows without an industry cannot be selected by equality, so skip them
    if pd.isna(ind):
        continue

    markdown += f"|{ind}|"
    for size in sizes:
        sub_df = result_df[(result_df['industry'] == ind) & (result_df['size_range'] == size)]
        markdown += format_metrics_cell(sub_df)
    markdown += "\n"

# Render the table as jupyter markdown
display(Markdown(markdown))

|Industry|Small|Medium|Large|
|---|---|---|---|
|medical devices|N/A|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 3|N/A|
|telecommunications|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 6|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 7|
|information technology and services|Comp:  0.92 (0.86)<br>Breach: 1.00 (1.00) <br>N: 7|Comp:  0.93 (0.88)<br>Breach: 1.00 (1.00) <br>N: 8|Comp:  0.67 (0.50)<br>Breach: 0.00 (0.83) <br>N: 6|
|import and export|Comp:  0.00 (0.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 2|
|civil engineering|N/A|Comp:  0.40 (0.25)<br>Breach: 0.00 (0.75) <br>N: 4|N/A|
|security and investigations|N/A|N/A|Comp:  0.40 (0.25)<br>Breach: 0.00 (0.75) <br>N: 4|
|defense & space|Comp:  0.40 (0.25)<br>Breach: 0.00 (0.75) <br>N: 4|N/A|N/A|
|computer software|Comp:  0.00 (0.00)<br>Breach: 0.00 (1.00) <br>N: 1|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 9|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 3|
|management consulting|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 5|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|
|insurance|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|
|venture capital & private equity|N/A|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|
|electrical/electronic manufacturing|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|
|staffing and recruiting|N/A|N/A|Comp:  0.67 (0.50)<br>Breach: 0.00 (1.00) <br>N: 2|
|research|Comp:  0.67 (0.50)<br>Breach: 0.00 (1.00) <br>N: 2|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|
|market research|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 1|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|
|renewables & environment|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 2|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|
|computer & network security|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|N/A|
|chemicals|N/A|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (0.20) <br>N: 5|
|internet|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 2|Comp:  0.67 (0.50)<br>Breach: 0.67 (0.50) <br>N: 2|Comp:  1.00 (1.00)<br>Breach: 0.44 (0.50) <br>N: 10|
|cosmetics|Comp:  1.00 (1.00)<br>Breach: 0.00 (0.20) <br>N: 5|N/A|N/A|
|design|N/A|N/A|Comp:  0.00 (0.00)<br>Breach: 0.00 (1.00) <br>N: 1|
|non-profit organization management|Comp:  0.00 (0.00)<br>Breach: 0.00 (0.00) <br>N: 1|Comp:  0.80 (0.67)<br>Breach: 0.00 (1.00) <br>N: 3|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 2|
|airlines/aviation|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 2|N/A|N/A|
|computer games|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 2|N/A|
|industrial automation|Comp:  0.67 (0.50)<br>Breach: 0.00 (1.00) <br>N: 2|N/A|N/A|
|consumer electronics|N/A|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 2|
|marketing and advertising|Comp:  0.93 (0.88)<br>Breach: 1.00 (1.00) <br>N: 8|Comp:  0.80 (0.67)<br>Breach: 0.86 (0.83) <br>N: 6|Comp:  0.80 (0.67)<br>Breach: 1.00 (1.00) <br>N: 3|
|consumer goods|N/A|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 2|N/A|
|business supplies and equipment|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 2|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|
|sports|N/A|N/A|Comp:  0.80 (0.67)<br>Breach: 0.00 (1.00) <br>N: 3|
|writing and editing|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 1|Comp:  1.00 (1.00)<br>Breach: 0.00 (0.00) <br>N: 1|N/A|
|food & beverages|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 2|N/A|N/A|
|government administration|N/A|N/A|Comp:  0.50 (0.33)<br>Breach: 1.00 (1.00) <br>N: 3|
|civic & social organization|Comp:  0.00 (0.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|Comp:  0.67 (0.50)<br>Breach: 1.00 (1.00) <br>N: 2|
|alternative medicine|Comp:  0.67 (0.50)<br>Breach: 1.00 (1.00) <br>N: 2|N/A|N/A|
|human resources|Comp:  0.67 (0.50)<br>Breach: 1.00 (1.00) <br>N: 2|N/A|N/A|
|health, wellness and fitness|Comp:  0.50 (0.33)<br>Breach: 1.00 (1.00) <br>N: 3|N/A|N/A|
|law enforcement|N/A|Comp:  0.67 (0.50)<br>Breach: 0.00 (0.00) <br>N: 2|Comp:  0.67 (0.50)<br>Breach: 0.00 (0.00) <br>N: 2|
|accounting|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 1|
|consumer services|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|N/A|
|professional training & coaching|N/A|Comp:  0.67 (0.50)<br>Breach: 0.00 (1.00) <br>N: 2|N/A|
|apparel & fashion|Comp:  1.00 (1.00)<br>Breach: 0.67 (0.50) <br>N: 2|N/A|N/A|
|restaurants|N/A|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 2|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 3|
|retail|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 3|N/A|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 1|
|financial services|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 2|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|
|newspapers|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (0.00) <br>N: 1|N/A|
|automotive|N/A|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 2|
|arts and crafts|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|N/A|
|investment management|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|N/A|
|oil & energy|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|N/A|
|railroad manufacture|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|N/A|
|construction|Comp:  1.00 (1.00)<br>Breach: 0.00 (0.00) <br>N: 1|N/A|N/A|
|leisure, travel & tourism|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|
|government relations|N/A|Comp:  0.00 (0.00)<br>Breach: 0.00 (0.00) <br>N: 1|N/A|
|warehousing|N/A|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|
|wholesale|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|
|transportation/trucking/railroad|Comp:  0.00 (0.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|N/A|
|computer networking|N/A|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (0.00) <br>N: 1|
|entertainment|N/A|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 1|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 1|
|aviation & aerospace|N/A|N/A|Comp:  0.00 (0.00)<br>Breach: 0.00 (1.00) <br>N: 1|
|executive office|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 1|N/A|N/A|
|higher education|N/A|N/A|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 1|
|printing|N/A|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 1|N/A|
|religious institutions|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|N/A|
|architecture & planning|N/A|Comp:  0.67 (0.50)<br>Breach: 0.67 (0.50) <br>N: 2|N/A|


In [None]:
# Collapse the long tail of industries into a fixed shortlist.
# The shortlist is hardcoded based on the OTHER visualizations.
final_industry_list = [
    'internet', 'consumer electronics', 'retail',
    'marketing and advertising', 'automotive', 'financial services',
    'entertainment', 'computer software', 'information technology and services',
    'restaurants', 'airlines/aviation', 'telecommunications',
    'oil & energy', 'aviation & aerospace', 'broadcast media',
]

def apply_final_industry(ind:str) -> str:
    """Return `ind` unchanged if it is in `final_industry_list`, else "other"."""
    return ind if ind in final_industry_list else "other"

# Label every matched row with its shortlist industry. Anything not in the
# shortlist -- including rows with a missing industry -- becomes "other".
result_df['final_industry_list'] = result_df['industry'].apply(apply_final_industry)

markdown = "|Industry|" + "|".join(map(str.title, sizes)) + "|\n"
markdown += "|---|" + "|".join(["---"] * len(sizes)) + "|\n"
for ind in final_industry_list + ['other']:
    markdown += f"|{ind}|"
    for size in sizes:
        sub_df = result_df[(result_df['final_industry_list'] == ind) & (result_df['size_range'] == size)]

        # Nothing to score for an empty industry/size combination
        if len(sub_df) == 0:
            markdown += "N/A|"
            continue

        co_true = sub_df['CompanyMentioned_true'].apply(to_bool)
        co_pred = sub_df['CompanyMentioned_ml'].apply(to_bool)
        br_true = sub_df['BreachMentioned_true'].apply(to_bool)
        br_pred = sub_df['BreachMentioned_ml'].apply(to_bool)

        # zero_division=0 keeps the 0.00 previously obtained by silencing all
        # warnings: F1 is undefined when a subset contains no positives.
        f1_co = f1_score(co_true, co_pred, zero_division=0)
        irr_co = accuracy_score(co_true, co_pred)

        f1_br = f1_score(br_true, br_pred, zero_division=0)
        irr_br = accuracy_score(br_true, br_pred)

        markdown += f"Comp:  {f1_co:.2f} ({irr_co:.2f})<br>Breach: {f1_br:.2f} ({irr_br:.2f}) <br>N: {len(sub_df)}|"
    markdown += "\n"

# jupyter markdown
from IPython.display import display, Markdown
display(Markdown(markdown))

|Industry|Small|Medium|Large|
|---|---|---|---|
|internet|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 2|Comp:  0.67 (0.50)<br>Breach: 0.67 (0.50) <br>N: 2|Comp:  1.00 (1.00)<br>Breach: 0.44 (0.50) <br>N: 10|
|consumer electronics|N/A|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 2|
|retail|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 3|N/A|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 1|
|marketing and advertising|Comp:  0.93 (0.88)<br>Breach: 1.00 (1.00) <br>N: 8|Comp:  0.80 (0.67)<br>Breach: 0.86 (0.83) <br>N: 6|Comp:  0.80 (0.67)<br>Breach: 1.00 (1.00) <br>N: 3|
|automotive|N/A|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 2|
|financial services|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 2|N/A|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|
|entertainment|N/A|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 1|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 1|
|computer software|Comp:  0.00 (0.00)<br>Breach: 0.00 (1.00) <br>N: 1|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 9|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 3|
|information technology and services|Comp:  0.92 (0.86)<br>Breach: 1.00 (1.00) <br>N: 7|Comp:  0.93 (0.88)<br>Breach: 1.00 (1.00) <br>N: 8|Comp:  0.67 (0.50)<br>Breach: 0.00 (0.83) <br>N: 6|
|restaurants|N/A|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 2|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 3|
|airlines/aviation|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 2|N/A|N/A|
|telecommunications|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 6|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 7|
|oil & energy|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|N/A|N/A|
|aviation & aerospace|N/A|N/A|Comp:  0.00 (0.00)<br>Breach: 0.00 (1.00) <br>N: 1|
|broadcast media|N/A|N/A|N/A|
|other|Comp:  0.85 (0.74)<br>Breach: 0.78 (0.84) <br>N: 57|Comp:  0.86 (0.75)<br>Breach: 0.63 (0.78) <br>N: 32|Comp:  0.83 (0.71)<br>Breach: 0.64 (0.76) <br>N: 34|


In [80]:
# Instead of a hardcoded shortlist, take the N most frequent industries
# FROM AMONG THE TESTSET (frequency is counted over matched rows in result_df).
N = 5
industry_counts = result_df['industry'].value_counts()
topn_industries = industry_counts.head(N).index.tolist()

def apply_topn_industry(ind:str) -> str:
    """Return `ind` unchanged if it is one of `topn_industries`, else "other"."""
    return ind if ind in topn_industries else "other"

# Label every matched row with its top-N industry. Anything outside the top N
# -- including rows with a missing industry -- becomes "other".
result_df['topn_industry_list'] = result_df['industry'].apply(apply_topn_industry)

markdown = "|Industry|" + "|".join(map(str.title, sizes)) + "|\n"
markdown += "|---|" + "|".join(["---"] * len(sizes)) + "|\n"
# value_counts() drops missing values, so topn_industries never contains NaN
for ind in topn_industries + ['other']:
    markdown += f"|{ind}|"
    for size in sizes:
        sub_df = result_df[(result_df['topn_industry_list'] == ind) & (result_df['size_range'] == size)]

        # Nothing to score for an empty industry/size combination
        if len(sub_df) == 0:
            markdown += "N/A|"
            continue

        co_true = sub_df['CompanyMentioned_true'].apply(to_bool)
        co_pred = sub_df['CompanyMentioned_ml'].apply(to_bool)
        br_true = sub_df['BreachMentioned_true'].apply(to_bool)
        br_pred = sub_df['BreachMentioned_ml'].apply(to_bool)

        # zero_division=0 keeps the 0.00 previously obtained by silencing all
        # warnings: F1 is undefined when a subset contains no positives.
        f1_co = f1_score(co_true, co_pred, zero_division=0)
        irr_co = accuracy_score(co_true, co_pred)

        f1_br = f1_score(br_true, br_pred, zero_division=0)
        irr_br = accuracy_score(br_true, br_pred)

        markdown += f"Comp:  {f1_co:.2f} ({irr_co:.2f})<br>Breach: {f1_br:.2f} ({irr_br:.2f}) <br>N: {len(sub_df)}|"
    markdown += "\n"

# jupyter markdown
from IPython.display import display, Markdown
display(Markdown(markdown))

|Industry|Small|Medium|Large|
|---|---|---|---|
|information technology and services|Comp:  0.92 (0.86)<br>Breach: 1.00 (1.00) <br>N: 7|Comp:  0.93 (0.88)<br>Breach: 1.00 (1.00) <br>N: 8|Comp:  0.67 (0.50)<br>Breach: 0.00 (0.83) <br>N: 6|
|marketing and advertising|Comp:  0.93 (0.88)<br>Breach: 1.00 (1.00) <br>N: 8|Comp:  0.80 (0.67)<br>Breach: 0.86 (0.83) <br>N: 6|Comp:  0.80 (0.67)<br>Breach: 1.00 (1.00) <br>N: 3|
|internet|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 2|Comp:  0.67 (0.50)<br>Breach: 0.67 (0.50) <br>N: 2|Comp:  1.00 (1.00)<br>Breach: 0.44 (0.50) <br>N: 10|
|telecommunications|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 1|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 6|Comp:  1.00 (1.00)<br>Breach: 0.00 (1.00) <br>N: 7|
|computer software|Comp:  0.00 (0.00)<br>Breach: 0.00 (1.00) <br>N: 1|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 9|Comp:  1.00 (1.00)<br>Breach: 1.00 (1.00) <br>N: 3|
|other|Comp:  0.87 (0.77)<br>Breach: 0.83 (0.86) <br>N: 65|Comp:  0.87 (0.77)<br>Breach: 0.72 (0.80) <br>N: 35|Comp:  0.86 (0.76)<br>Breach: 0.71 (0.82) <br>N: 45|
