In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

# add the src directory to the path
import sys
sys.path.append('../src')

from data_loader import load_data
from config import settings

# Load the dataset named by settings.FILE_NAME via the project-local loader.
# Later cell outputs show the resulting frame indexed by "Specter - ID".
df = load_data(settings.FILE_NAME)

In [5]:
# Free-text / label columns inspected below as candidate taxonomy signals.
taxonomy_info = [
    "Description",
    "Industry",
    "Category Groups",
    "Tags",
]

In [7]:
# Summary (count / unique / top / freq) of the taxonomy-related columns.
df.loc[:, taxonomy_info].describe()

Unnamed: 0,Description,Industry,Category Groups,Tags
count,5000,4999,4775,3292
unique,4994,183,2060,3051
top,Airbase serves as an all-in-one platform for s...,Software Development,Health Care,saas
freq,2,764,116,178


In [72]:
# Peek at the raw "Category Groups" values (pandas truncates the display).
df.loc[:, "Category Groups"]

Specter - ID
5e3a8f19040ca7b0c6f03071                                                  NaN
5e3a8f19040ca7b0c6f02f79    Content and Publishing, Information Technology...
5e3a8f47040ca7b0c6f10460    Financial Services, Lending and Investments, S...
62c2c65f0405ea8a58c3718a    Content and Publishing, Hardware, Manufacturin...
5fc74452bf7e53924df86816                        Health Care, Software, Sports
                                                  ...                        
61d5d3fcdb415a1b7f41bc81    Clothing and Apparel, Commerce and Shopping, C...
61ad0b56d0efcf20c6ecf0fc            Education, Information Technology, Sports
6164492cc3c073075bd6634d                                             Software
61a6fddfd0efcf20c62a9dee                     Information Technology, Software
5e3a7f2b0aa7a3270a55ee2d    Artificial Intelligence, Data and Analytics, H...
Name: Category Groups, Length: 5001, dtype: object

### Industry column

In [35]:
# Do the existing "Industry" labels already cover the sectors of interest?

# keywords that would indicate labelled data
keywords = ['finance', 'financial', 'health']

# frequency table of Industry values, as a two-column frame
industry_counts = (
    df["Industry"]
    .value_counts()
    .rename_axis("Industry")
    .reset_index(name="Count")
)

# rows whose lower-cased Industry label contains any of the keywords
industry_lower = industry_counts["Industry"].str.lower()
mesh = industry_lower.str.contains('|'.join(keywords))
industry_counts[mesh]

Unnamed: 0,Industry,Count
3,Financial Services,402
11,Hospitals and Health Care,86
28,Mental Health Care,34
159,Health and Human Services,1
167,Retail Health and Personal Care Products,1
175,Public Health,1


We already have industry labels for finance and health, although these labels are sub-categories of the broader sectors in the new taxonomy of interest.

- `Financial Services` is a sub-industry of **Finance**
- `Hospitals and Health Care`, `Mental Health Care` etc. are sub-industries of **Health**


### Category Groups column

In [75]:
# Do the existing "Category Groups" values already cover the sectors of interest?
# Each distinct comma-separated combination of groups is counted as one value.

# keywords that would indicate labelled data
keywords = ['finance', 'financial', 'health']

# frequency table of Category Groups values, as a two-column frame
category_counts = (
    df["Category Groups"]
    .value_counts()
    .rename_axis("Category")
    .reset_index(name="Count")
)

# rows whose lower-cased Category string contains any of the keywords
category_lower = category_counts["Category"].str.lower()
mesh = category_lower.str.contains('|'.join(keywords))
category_counts[mesh]

Unnamed: 0,Category,Count
0,Health Care,116
1,Financial Services,99
2,"Financial Services, Lending and Investments",76
8,"Financial Services, Payments, Software",35
10,"Financial Services, Other, Payments, Software",33
...,...,...
2036,"Clothing and Apparel, Commerce and Shopping, D...",1
2045,"Commerce and Shopping, Financial Services, Inf...",1
2047,"Biotechnology, Health Care",1
2049,"Financial Services, Information Technology, Ot...",1


In [82]:
# Category-group combinations that mention BOTH sectors of interest.
# The terms get their own name so the notebook-level `keywords` list is not
# rebound here; cells further down look keywords up by name and must see the
# same list on a top-to-bottom run as they did interactively.
required_terms = ['financial', 'health']

# a row qualifies only if every term appears in its lower-cased Category string
category_lower = category_counts["Category"].str.lower()
mesh = pd.Series(True, index=category_counts.index)
for term in required_terms:
    mesh &= category_lower.str.contains(term)

both_sectors = category_counts[mesh]

# category_counts has one row per distinct combination, so len() counts
# combinations; the number of underlying df rows is the sum of "Count".
print(
    f"We have: {len(both_sectors)} distinct 'Category Groups' combinations containing "
    f"both financial & health, covering {both_sectors['Count'].sum()} rows"
)

both_sectors


We have: 18 examples of financial & health in the 'Category Groups' column


Unnamed: 0,Category,Count
75,"Financial Services, Health Care",8
203,"Administrative Services, Financial Services, H...",4
572,"Financial Services, Health Care, Software",2
603,"Financial Services, Health Care, Sports",1
801,"Financial Services, Health Care, Lending and I...",1
837,"Financial Services, Health Care, Internet Serv...",1
859,"Financial Services, Health Care, Professional ...",1
1064,"Financial Services, Health Care, Sales and Mar...",1
1139,"Administrative Services, Financial Services, H...",1
1295,"Administrative Services, Financial Services, H...",1


### Tags column

In [71]:
# Keywords searched for in the tags. Defined locally so this cell does not
# depend on whatever `keywords` was last bound to by earlier cells: the
# mapping["finance"] lookup at the end needs 'finance' to be present.
tag_keywords = ['finance', 'financial', 'health']

# One row per individual tag, keeping the df index; missing tags are dropped by
# stack(). Tags are stripped because splitting on "," leaves the space that
# follows each comma attached to the next tag.
tags = (
    df["Tags"]
    .str.split(",", expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .str.strip()
    .to_frame("Tags")
)

# keyword -> unique index ids having at least one tag that contains the keyword
tags_lower = tags["Tags"].str.lower()
mapping = {}
for keyword in tag_keywords:
    matches = tags_lower.str.contains(keyword, na=False)
    mapping[keyword] = tags[matches].index.unique()

# number of distinct ids (not individual tags) matched by each keyword
for keyword in tag_keywords:
    print(f"{keyword}: {len(mapping[keyword])}")

# example: first few rows matched by the 'finance' keyword
df.loc[mapping["finance"], taxonomy_info].head()

finance: 110
financial: 107
health: 178


Unnamed: 0_level_0,Description,Industry,Category Groups,Tags
Specter - ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5e3a8f19040ca7b0c6f02e40,5Paisa know that your money is precious and ha...,Financial Services,"Financial Services, Privacy and Security","investing, insurance, mutual fund, discount br..."
60422d084ddab8d8bf2ad251,ADAN is a professional body for digital assets...,Civic and Social Organizations,,"blockchain, cryptocurrencies, actifs numérique..."
60226d1d4ddab8d8bf9a632c,AgriBazaar is a one-stop agritech platform tha...,Internet Publishing,"Agriculture and Farming, Apps, Commerce and Sh...","agriculutre, online marketplace, auctions, mar..."
5e3a8f49040ca7b0c6f12efe,AIIB is a multilateral development bank that a...,International Trade and Development,"Financial Services, Lending and Investments","infrastructure, sustainable infrastructure, cr..."
605884964ddab8d8bf804f4c,Airrange makes Excel easy. Turn your Excel 365...,IT Services and IT Consulting,Software,"microsoft 365, excel, data management, collabo..."
