Last Update: Feb 6, 2024

In [333]:
import pandas as pd
import tldextract as domain_extract
import sys
from datetime import datetime

Update `filename` (remember to include the `.csv` extension)

In [334]:
# Name of the input file to check. It is loaded with pd.read_csv, so it must be a CSV file.
filename = "Test.csv"

Update `company_list` and `industry_list` (remember to include the `.csv` extension)

In [335]:
# CSV file listing the companies to exclude (its 'Company Domain' column is read)
company_list = "Company to Exclude.csv"

# CSV file listing the industry keywords to exclude (its 'Industry Keyword' column is read)
industry_list = "Industry ICP Keywords.csv"

#### Read all relevant csv files and subset the required columns

In [336]:
# Main input data
df = pd.read_csv(filename)

# Exclusion lists as plain Python lists of unique values.
# Blank cells are dropped so the lists contain only strings: industry_check()
# calls .lower() on every industry keyword and would fail on a NaN entry.
comp_to_exclude = pd.read_csv(company_list)['Company Domain'].dropna().unique().tolist()

industry_to_exclude = pd.read_csv(industry_list)['Industry Keyword'].dropna().unique().tolist()

In [337]:
# Keep only the columns that the ICP check needs
required_cols = [
    'Company Name',
    'Website',
    'Revenue (in 000s USD)',
    'Revenue Range (in USD)',
    'Primary Industry',
    'Primary Sub-Industry',
    'All Industries',
    'All Sub-Industries',
    'LinkedIn Company Profile URL',
    'Company Country',
]
df = df[required_cols]

#### Preprocessing (Remove/Add necessary columns for Company ICP Check)

In [338]:
# Column to track source: "<filename>-<today's date as YYYYMMDD>"
checked_on = datetime.today().strftime('%Y%m%d')
df['Source-Checked On (YYYYMMDD)'] = f"{filename}-{checked_on}"

In [339]:
# Get domain
def _company_domain(url):
    """Return "<domain>.<suffix>" for a website URL, or "" when the website is missing."""
    # A blank Website cell is read as NaN (a float), not a URL string, so skip it.
    if not isinstance(url, str):
        return ""
    parts = domain_extract.extract(url)  # parse once and reuse both fields
    return parts.domain + "." + parts.suffix


df['Company Domain'] = df['Website'].apply(_company_domain)

In [340]:
# Add blank columns for industry standardized, lead segment HS & Industry Re-Segmentation
for blank_col in ('Industry (Standardized)', 'Lead Segment HS', 'Industry Re-Segmentation'):
    df[blank_col] = ""

In [341]:
# Add blank columns for validation/check results
for blank_col in ('Valid/Invalid', 'Remark', 'Industry_ICP_Check_List'):
    df[blank_col] = ""

#### Company ICP Check

For the Company ICP check, we need to verify the following:
1. Revenue > $1B
2. The company is located in US/CA
3. The company is not one that the client has asked us to exclude
4. The industry is standardized based on the client's ICP

##### Industry ICP Check (Compile Results into List for Overall ICP Check)

In [342]:
# Compile all four industry columns into one list per row

industry_col = ['Primary Industry', 'Primary Sub-Industry', 'All Industries', 'All Sub-Industries']

# One Series of ";"-split lists for each industry column
split_industries = [df[col].astype(str).str.split(";") for col in industry_col]

# Concatenate the per-column lists row by row, keeping the column order above
df['ZI Industry List'] = [
    [industry for parts in row_parts for industry in parts]
    for row_parts in zip(*split_industries)
]

In [343]:
# Services & Healthcare ICP Segments and Industry Re-Segmentation Mapping

# Services keywords that count as ICP industries (matched case-insensitively by industry_check)
services_ICP_segment = ['Consumer Services', 'Retail', 'Hospitality', 
                        'Finance', 'Manufacturing', 'Insurance', 'Media & Internet']

# Healthcare keywords that count as ICP industries (matched case-insensitively by industry_check)
# NOTE(review): 'Medical Laboratories & Imaging Center' is singular - confirm it matches
# the spelling used in the source data, otherwise it will never match.
healthcare_ICP_segment = ['Ambulance Services', 'Blood & Organ Banks', 'Elderly Care Services', 'Medical Laboratories & Imaging Center',
                          'Dental Offices', 'Medical & Surgical Hospitals', 'Medical Specialists', 'Physicians Clinics', 
                          'Hospitals & Physicians Clinics'] 
# Take note that "Hospitals & Physicians Clinics" is not an ICP segment itself; it is a keyword
# for "Physicians Clinics" (company_icp_check reports it as "Physicians Clinics").

# Re-segmentation name -> list of standardized industries that belong to it
# NOTE(review): 'Media & Internet' maps to ['General'], but the standardized industry is always
# taken from the two lists above, so 'General' can never match and 'Media & Internet' rows get
# no re-segmentation. Confirm whether this entry should list 'Media & Internet' instead.
ind_resegment = {'Retail + CPG' : ['Consumer Services', 'Retail'],
                 'Hospitality': ['Hospitality'],
                 'Finance & Insurance': ['Finance', 'Insurance'],
                 'Manufacturing': ['Manufacturing'], 
                 'Media & Internet': ['General'], 
                 'Healthcare': healthcare_ICP_segment}

In [344]:
# Create a function to check industry ICP

"""
Logic for Industry ICP Check: *** When doing conditional checks, remember to use the .lower() function ***
1. Split the 4 industry columns and combine them into one list
2. If a healthcare keyword is found, append it to ind_result
3. If a services keyword is found, append it to ind_result
4. Determine which keywords are conditional
    - For Retail, Building Materials is acceptable
    - For Retail, Telecommunication Equipment is acceptable
5. Determine which keywords are the most important ones to remove
6. Label those that do not have keywords in the services_ICP_segment
"""

def industry_check(row, healthcare_ICP, services_ICP, industry_to_exclude):
    """Classify one company's industries and return the list of check markers.

    Every industry is compared case-insensitively, ignoring surrounding
    whitespace (splitting "A; B" on ";" leaves a leading space on " B").
    Only the first matching rule below applies to each industry:

    1. Healthcare keyword: append the lowercased keyword, then "healthcare".
    2. Services keyword: append the lowercased keyword.
    3. "building materials" / "telecommunication equipment": append
       "retail check" (acceptable only for Retail companies).
    4. Contains "banking": append "banking check".
    5. Keyword in industry_to_exclude: append "invalid".

    If nothing matched at all, the result is ["manual check"].

    Args:
        row: iterable of industry name strings for one company.
        healthcare_ICP: healthcare ICP keywords.
        services_ICP: services ICP keywords.
        industry_to_exclude: industry keywords that are not ICP.

    Returns:
        List of lowercase keywords and markers, in the order encountered.
    """
    # Lowercase each keyword list once instead of once per industry.
    healthcare_keywords = {name.lower() for name in healthcare_ICP}
    services_keywords = {name.lower() for name in services_ICP}
    excluded_keywords = {name.lower() for name in industry_to_exclude}
    retail_conditional = {'building materials', 'telecommunication equipment'}

    ind_result = []

    for industry in row:
        ind_lower = industry.strip().lower()

        if ind_lower in healthcare_keywords:
            ind_result.append(ind_lower)
            ind_result.append("healthcare")
        elif ind_lower in services_keywords:
            ind_result.append(ind_lower)
        elif ind_lower in retail_conditional:
            ind_result.append("retail check")
        elif "banking" in ind_lower:
            ind_result.append("banking check")
        elif ind_lower in excluded_keywords:
            ind_result.append("invalid")

    # No rule matched any industry, so a person has to look at this company.
    if not ind_result:
        ind_result.append("manual check")

    return ind_result


In [345]:
# Run industry_check on every row's industry list and store the resulting markers
df['Industry_ICP_Check_List'] = df['ZI Industry List'].apply(
    industry_check,
    args=(healthcare_ICP_segment, services_ICP_segment, industry_to_exclude),
)

##### Overall ICP Check

In [346]:
# Overall ICP Check
def company_icp_check(dataset, healthcare_ICP, services_ICP, company_list, resegmentation_dict):
    """Run the overall company ICP check and write the results into `dataset` in place.

    Each row is checked in this order; the first failing check sets 'Remark':

    1. Revenue below $1B                    -> "Invalid Revenue Range"
    2. Country not United States / Canada   -> "Company Not In US/CA"
    3. Domain in the client exclusion list  -> "Company In Client List"
    4. Industry markers in 'Industry_ICP_Check_List':
       - "healthcare" present                      -> passes
       - "manual check" present                    -> "Manual Check for Industry"
       - "retail check" without "retail"           -> "Invalid Sub-Industry"
       - "banking check" and revenue above $15B    -> "Invalid Sub-Industry (Banking with Revenue > $15B)"
       - "invalid" present                         -> "Invalid Sub-Industry"
    5. A row that passes everything but contains no healthcare/services
       keyword cannot be standardized, so it is sent to manual check.
       (Previously this raised IndexError and aborted the whole run.)

    Columns written: 'Remark', 'Industry (Standardized)',
    'Industry Re-Segmentation', 'Lead Segment HS', 'Valid/Invalid'.

    Args:
        dataset: DataFrame with 'Revenue (in 000s USD)', 'Company Country',
            'Company Domain' and 'Industry_ICP_Check_List' columns.
        healthcare_ICP: healthcare ICP keywords (checked before services).
        services_ICP: services ICP keywords.
        company_list: company domains to exclude.
        resegmentation_dict: re-segmentation name -> list of standardized industries.

    Returns:
        None. `dataset` is modified in place.

    NOTE(review): a missing revenue (NaN) is not "below $1B", so it passes the
    revenue check exactly as before - confirm this is intended.
    """
    manual_check = "Manual Check for Industry"
    non_icp = "Non ICP"
    min_revenue = 1000000        # $1B, expressed in thousands of USD
    max_banking_revenue = 15000000  # $15B, expressed in thousands of USD
    country_ICP = ("United States", "Canada")

    # Build the lookups once rather than once per row.
    excluded_domains = set(company_list)
    healthcare_names = {name.lower(): name for name in healthcare_ICP}
    services_names = {name.lower(): name for name in services_ICP}

    def find_standard_industry(check_list):
        """Return the first healthcare keyword, else the first services keyword, else None."""
        for keyword in check_list:
            name = healthcare_names.get(keyword.lower())
            if name is not None:
                # "Hospitals & Physicians Clinics" is only a keyword for "Physicians Clinics".
                if name.lower() == "hospitals & physicians clinics":
                    return "Physicians Clinics"
                return name
        # Services are considered only when no healthcare keyword is present.
        for keyword in check_list:
            name = services_names.get(keyword.lower())
            if name is not None:
                return name
        return None

    def find_remark(row):
        """Return the remark of the first failing check, or "" when every check passes."""
        revenue = row['Revenue (in 000s USD)']
        if revenue < min_revenue:
            return "Invalid Revenue Range"
        if row['Company Country'] not in country_ICP:
            return "Company Not In US/CA"
        if row['Company Domain'] in excluded_domains:
            return "Company In Client List"

        checks = row['Industry_ICP_Check_List']
        if "healthcare" in checks:
            return ""
        if "manual check" in checks:
            return manual_check
        if "retail check" in checks and "retail" not in checks:
            return "Invalid Sub-Industry"
        if "banking check" in checks and revenue > max_banking_revenue:
            return "Invalid Sub-Industry (Banking with Revenue > $15B)"
        if "invalid" in checks:
            return "Invalid Sub-Industry"
        return ""

    remarks = []
    industries = []
    resegments = []
    lead_segments = []
    validity = []

    for _, row in dataset.iterrows():
        remark = find_remark(row)
        industry = None

        if remark == "":
            industry = find_standard_industry(row['Industry_ICP_Check_List'])
            if industry is None:
                # Passed every check but has no ICP keyword to standardize on.
                remark = manual_check

        if remark == manual_check:
            industry, segment, lead, valid = manual_check, "", "", ""
        elif remark != "":
            industry, segment, lead, valid = non_icp, non_icp, non_icp, "Invalid"
        else:
            # Stays None when no re-segmentation lists this industry.
            segment = next(
                (seg for seg, members in resegmentation_dict.items() if industry in members),
                None,
            )
            lead = "Healthcare" if segment == "Healthcare" else "Services"
            valid = "Valid"

        remarks.append(remark)
        industries.append(industry)
        resegments.append(segment)
        lead_segments.append(lead)
        validity.append(valid)

    dataset['Remark'] = remarks
    dataset['Industry (Standardized)'] = industries
    dataset['Industry Re-Segmentation'] = resegments
    dataset['Lead Segment HS'] = lead_segments
    dataset['Valid/Invalid'] = validity

    return

In [347]:
# Run the overall ICP check; df is updated in place
company_icp_check(
    dataset=df,
    healthcare_ICP=healthcare_ICP_segment,
    services_ICP=services_ICP_segment,
    company_list=comp_to_exclude,
    resegmentation_dict=ind_resegment,
)

#### Get Relevant Columns and Export to CSV

In [348]:
# Columns to export, in output order
new_col = [
    # Company identity and revenue
    'Company Name',
    'Website',
    'Company Domain',
    'Revenue (in 000s USD)',
    'Revenue Range (in USD)',
    # Industry columns from the input
    'Primary Industry',
    'Primary Sub-Industry',
    'All Industries',
    'All Sub-Industries',
    # Industry columns produced by the ICP check
    'Industry (Standardized)',
    'Lead Segment HS',
    'Industry Re-Segmentation',
    # Remaining input columns
    'LinkedIn Company Profile URL',
    'Company Country',
    # Check outcome and source tracking
    'Valid/Invalid',
    'Remark',
    'Source-Checked On (YYYYMMDD)',
]

In [349]:
# Keep only the export columns, in the order given by new_col
df = df.loc[:, new_col]

In [350]:
# Output name: the part of the input name before ".csv", followed by "_Output V2.csv"
base_name = filename.split(".csv")[0]
output_filename = f"{base_name}_Output V2.csv"

# Write the result without the DataFrame index
df.to_csv(output_filename, index=False)