In [1]:
import pandas as pd
import numpy as np

COMPANIES_LIST_FOLDER = "data/companies_lists/"

In [51]:
# load Microsite into a dataframe called microsite_df
microsite_df = pd.read_csv(COMPANIES_LIST_FOLDER + "NFX-Microsite-manual-merge.csv")

MICROSITE_COL_MAPPING = {
    "Company Name": "Company",
    "Active Investors": "Notable Investors",
}

# For each key in MICROSITE_COL_MAPPING, rename the column in microsite_df
for key in MICROSITE_COL_MAPPING:
    microsite_df.rename(columns={key: MICROSITE_COL_MAPPING[key]}, inplace=True)

# Remove the $ from the value in "Funding", convert the "Funding" column to float 
# and multiply the number in "Funding" by 1,000,000
microsite_df["Funding (est $)"] = microsite_df["Funding (est $)"].str.replace("$", "")
microsite_df["Funding (est $)"] = microsite_df["Funding (est $)"].str.replace(",", "")
microsite_df["Funding (est $)"] = microsite_df["Funding (est $)"].astype(float)
microsite_df["Funding (est $)"] = microsite_df["Funding (est $)"] * 1000000

# Change all the non-finite values in "Founded" to NaN
microsite_df["Founded"] = microsite_df["Founded"].replace([np.nan, np.inf, -np.inf], np.nan)
# Convert the column to integers
microsite_df["Founded"] = microsite_df["Founded"].astype(float).astype(pd.Int32Dtype(), errors='ignore')

microsite_df.head()

Unnamed: 0,Company,Funding (est $),Notable Investors,Headcount,URL,Description,Category,Focus,Value Chain Layer,Modality,...,Active?,Founded,HQ,Logo,Founders,Last Round,Valuation,Business Model,Open Source?,Unnamed: 21
0,10Web,4000000000000.0,"Sierra Ventures, AI Fund",,https://10web.io/,,Code,Website Generation,,,...,,,Other US,,,,,,,
1,Abridge,27500000000000.0,"Bessemer Venture Partners, Union Square Ventur...",,https://www.abridge.com/,,Summarization,Healthcare conversation documentation,,,...,,,Other US,,,,,,,
2,ABtesting.ai,,,,https://abtesting.ai/,,Text,Marketing & A/B Testing,,,...,,,Europe,,,,,,,
3,Accomplice,520000000000.0,TinySeed,,https://accomplice.ai,,Image,AI-generated stock photos,,,...,,,Other US,,,,,,,
4,Ada,190620600000000.0,"Creative Destruction Lab (CDL), Tiger Global M...",,https://www.ada.cx/,,Chatbot/Conversational AI,Automated Virtual Agents,,,...,,,North America (excl. US),,,,,,,


In [52]:
# print the total number of companies in each dataframe
print(f"{len(microsite_df)} Companies")

# Check for duplicate company names in microsite_df
print(f"{len(microsite_df[microsite_df['Company'].duplicated()])} duplicate companies")

800 Companies
76 duplicate companies


In [53]:
# Check for duplicate company names
duplicate_companies = microsite_df[microsite_df.duplicated(subset="Company", keep=False)]["Company"].unique()

# Remove the rows with an empty "Description" column for each duplicate company
for company in duplicate_companies:
    company_rows = microsite_df[microsite_df["Company"] == company]
    empty_description_rows = company_rows[company_rows['Description'].isnull() | (company_rows['Description'] == '')]
    if not empty_description_rows.empty:
        microsite_df = microsite_df.drop(empty_description_rows.index)

len(microsite_df)

723

In [42]:
# Save combined_df in a csv file
microsite_df.to_csv(COMPANIES_LIST_FOLDER + "combined.csv", index=False)