In [1]:
# import packages to use
import pandas as pd
import requests       # HTTP protocol request package

In [2]:
def check_website_exists(url):
    """
    Check whether a website answers an HTTP GET request with status 200.

    Parameters
    ----------
    url : str
        URL to request, including the scheme (e.g. "http://example.com").

    Returns
    -------
    int
        1 when the response status code is exactly 200, otherwise 0.
        0 is also returned when the request itself fails.
    """
    try:
        response = requests.get(url, timeout=5)  # give up after 5 seconds
        return int(response.status_code == 200)
    except requests.RequestException:
        # RequestException is the base class of ConnectionError, Timeout and
        # TooManyRedirects, and also covers malformed URLs / unsupported
        # schemes, so one bad domain cannot abort a long batch run.
        return 0


In [3]:
def company_filtering(dataset):
    """
    Split a contact dataset into companies whose website exists and
    companies whose website does not.

    Rows with a missing 'Domain' are discarded. Each remaining domain is
    probed as "http://<Domain>" with check_website_exists; a result of 0
    puts the row into the non-existing group, any other result keeps it in
    the existing group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'Name', 'Email', 'Company' and 'Domain'.
        The input frame is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        (existing, non_existing), each restricted to the four columns above
        and re-indexed from 0. Nothing is written to disk; exporting to CSV
        is left to the caller.
    """
    columns = ['Name', 'Email', 'Company', 'Domain']
    company = dataset[columns].dropna(subset=['Domain'])

    # The HTTP request dominates the run time, so probe each distinct domain
    # only once even when several contacts share it. str()/strip() keeps a
    # numeric or whitespace-padded cell from breaking the URL construction.
    status_by_domain = {
        domain: check_website_exists("http://" + str(domain).strip())
        for domain in company['Domain'].unique()
    }

    # Positional boolean mask (NumPy array): unlike dropping rows by index
    # label, this stays correct when the input has duplicate index labels.
    reachable = company['Domain'].map(status_by_domain).ne(0).to_numpy()

    existing_co = company.loc[reachable].reset_index(drop=True)
    non_existing_co = company.loc[~reachable].reset_index(drop=True)

    return existing_co, non_existing_co


# Data Filtering

Data filtering for each revenue tier of the companies:
- Revenue 100 ~ 500 [✓]
- Revenue 501 ~ 1k [✓]
- Revenue 1k ~ 5k 
- Revenue 5k ~ 10k
- Revenue 10k ~ 25k
- Revenue 50k ~ 100k
- Revenue 100k ~ 200k
- Revenue 200k plus

---
### Revenue 100k ~ 200k

In [7]:

# Read the raw contact sheet for this revenue tier and preview the first rows.
revenue_csv_path = '/home/lettuce/WorkCode/SalesNavigator/linkedin_from_excel/dataset/revenue_100k_200k.csv'
rev_basic = pd.read_csv(revenue_csv_path)
rev_basic.head()

Unnamed: 0,Email,Name,Keyword,F4,Domain,Sales Revenue USD,Company,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,michelle@chipchak.com,Michelle,community,2022-12-09 00:37:27,chipchak.com,199694.0,chipchak,,,,...,,,,,,,,,,
1,doe_jane@aurabora.com,Jane Doe,giveback-keyword,2022-11-30 19:36:15,aurabora.com,199107.0,aurabora,,,,...,,,,,,,,,,
2,andrea@pakaapparel.com,Andrea,sustainable-keyword,2022-11-21 03:18:57,pakaapparel.com,199038.0,pakaapparel,,,,...,,,,,,,,,,
3,Christine.Sterling@melitta.com,Christine Sterling,sustainability-keyword,2022-12-09 00:52:11,melitta.com,198827.0,melitta,,,,...,,,,,,,,,,
4,22@celestialbodiez.com,22 (Celestialbodiez),community,2022-12-09 00:37:21,celestialbodiez.com,198751.0,celestialbodiez,,,,...,,,,,,,,,,


In [8]:
# Split the loaded sheet into contacts whose company website responds
# (`existing`) and contacts whose website does not (`nonexisting`).
existing, nonexisting = company_filtering(dataset=rev_basic)

In [9]:
# Write both result frames to CSV, without the DataFrame index column.
for result_frame, destination in (
    (existing, "/home/lettuce/WorkCode/SalesNavigator/linkedin_from_excel/company_result/company_100k_200k.csv"),
    (nonexisting, "/home/lettuce/WorkCode/SalesNavigator/linkedin_from_excel/company_result/company_100k_200k_nonexist.csv"),
):
    result_frame.to_csv(destination, index=False)