In [1]:
# import packages to use
import pandas as pd
import requests       # HTTP protocol request package

In [2]:
# Check whether a website exists (i.e. it answers with HTTP status 200)
# Returns 1 if the website exists, 0 otherwise
def check_website_exists(url, timeout=10):
    """Check whether a website answers an HTTP GET request with status 200.

    Args:
        url: Full URL to request, including the scheme
            (e.g. "http://example.com").
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the caller
            indefinitely.

    Returns:
        1 if the response status code is 200, otherwise 0. Any request
        failure (connection error, timeout, invalid URL, too many
        redirects, ...) also yields 0.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # RequestException is the base class of every error requests raises,
        # including ConnectionError and Timeout; a site that cannot be
        # fetched is treated as non-existing rather than aborting the caller.
        return 0
    return int(response.status_code == 200)

# Data Filtering

Data filtering for each company revenue range
- Revenue 100 ~ 500 [✓]
- Revenue 1k ~ 5k 
- Revenue 5k ~ 10k
- Revenue 10k ~ 25k
- Revenue 25k ~ 50k
- Revenue 50k ~ 100k
- Revenue 100k ~ 200k
- Revenue 200k plus

---
### Revenue 100 ~ 500

In [7]:
# Load the revenue 100 ~ 500 dataset and preview its first rows
rev_basic = pd.read_csv(filepath_or_buffer='dataset/revenue_100_500.csv')
rev_basic.head()

Unnamed: 0,Email,Name,Keyword,F4,Domain,Sales Revenue USD,Company
0,cposner@lfsurf.com,C. Posner,foundation,11/21/22 3:21,lfsurf.com,498,lfsurf
1,claire@operationmasks.org,Claire,org,12/7/22 4:59,operationmasks.org,495,operationmasks
2,nasserally@nassfitness.com,Erally Nass,community,12/9/22 0:37,nassfitness.com,485,nassfitness
3,lino.miani@combatdiver.org,Lino Miani,org,12/7/22 4:59,combatdiver.org,483,combatdiver
4,annakarin@formerabeauty.com,Ann Akarin,foundation,11/21/22 3:21,formerabeauty.com,478,formerabeauty


In [27]:
# Keep only the contact columns, then discard rows that have no domain to check
domain_basic = rev_basic[[
    'Name',
    'Email',
    'Company',
    'Domain',
]]
domain_basic = domain_basic.dropna(subset=['Domain'])

In [34]:
# Define company domain list (revenue 100 ~ 500)
## One HTTP request is made per domain, so this is slow for long company lists
domain_basic = rev_basic[['Name','Email','Company','Domain']].dropna(subset=['Domain'])

# Check every domain once and keep only the rows whose website exists.
# A boolean mask filters the frame in a single pass instead of rebuilding the
# whole DataFrame with .drop() for each unreachable domain.
website_exists = [
    check_website_exists("http://" + domain) != 0
    for domain in domain_basic['Domain']
]
domain_basic = domain_basic.loc[website_exists]

In [36]:
# Write the filtered companies to a CSV file, leaving the DataFrame index out
domain_basic.to_csv(path_or_buf="company_result/company_100_500.csv", index=False)

---
### Revenue 25k ~ 50k

In [44]:
# Load the revenue 25k ~ 50k dataset and preview its first rows
rev_25to50 = pd.read_csv(filepath_or_buffer='dataset/revenue_25k_50k.csv')
rev_25to50.head()

Unnamed: 0,Email,Name,Keyword,F4,Domain,Sales Revenue USD,Company,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,Rob@nootkas.com,Rob,sustainable-keyword,2022-11-21 03:19:09,nootkas.com,49994.0,nootkas,,,,...,,,,,,,,,,
1,lauren@elmsstore.com.au,Lauren,carbon-click-software,2022-12-09 02:05:32,elmsstore.com.au,49959.0,elmsstore,,,,...,,,,,,,,,,
2,bobby@evaswild.com,Bobby,community,2022-12-09 00:37:26,evaswild.com,49956.0,evaswild,,,,...,,,,,,,,,,
3,danielle.francis@bloommoda.com,Danielle Francis,sustainable-keyword,2022-11-21 03:18:54,bloommoda.com,49910.0,bloommoda,,,,...,,,,,,,,,,
4,equinn@grantblvd.com,E. Quinn,sustainable-keyword,2022-11-21 03:18:57,grantblvd.com,49910.0,grantblvd,,,,...,,,,,,,,,,


In [48]:
# Build the company domain list for the revenue 25k ~ 50k bucket
## One HTTP request is made per domain later on, so only a slice is processed here
domain_25to50 = rev_25to50[['Name', 'Email', 'Company', 'Domain']]
domain_25to50 = domain_25to50.dropna(subset=['Domain'])
# Row slice 100:300 of the rows that have a domain
domain_25to50 = domain_25to50[100:300]
domain_25to50

Unnamed: 0,Name,Email,Company,Domain
100,Danica Sey,danicasey@foambrewers.com,foambrewers,foambrewers.com
101,T. Dhar,dhart@saigucosmetics.com,saigucosmetics,saigucosmetics.com
102,Courtney,courtney@liveoriginal.com,liveoriginal,liveoriginal.com
103,Sarah,Sarah@sarzastore.com,sarzastore,sarzastore.com
104,Ank Nath,nathank@forbiddenbike.com,forbiddenbike,forbiddenbike.com
...,...,...,...,...
295,Details (Infiniteflight),details@infiniteflight.com,infiniteflight,infiniteflight.com
296,Laura,laura@thenordicglow.com,thenordicglow,thenordicglow.com
297,Atreyi (Myverduracare),atreyi@myverduracare.com,myverduracare,myverduracare.com
298,Amy,Amy@lmhome.com.au,lmhome,lmhome.com.au


In [49]:
# Check every domain once and keep only the rows whose website exists.
# A boolean mask filters the frame in a single pass instead of rebuilding the
# whole DataFrame with .drop() for each unreachable domain.
website_exists = [
    check_website_exists("http://" + domain) != 0
    for domain in domain_25to50['Domain']
]
domain_25to50 = domain_25to50.loc[website_exists]

In [50]:
# Write the filtered companies to a CSV file, leaving the DataFrame index out
domain_25to50.to_csv(path_or_buf="company_result/company_25k_50k.csv", index=False)

---

---
## Old Version Code
### Revenue 100 ~ 500

In [23]:
# Define company domain list (revenue 100 ~ 500)
## One HTTP request is made per domain, so this is slow for long company lists
# Select the single 'Domain' column: .to_list() exists on a Series, not on a
# multi-column DataFrame, and the checks below need plain domain strings.
domain_basic = rev_basic['Domain'].dropna().to_list()

# Build the result columns: each domain and a flag that is 1 if the website
# exists and 0 otherwise
dict_basic = {
    "website": domain_basic,
    "exist": [check_website_exists("http://" + domain) for domain in domain_basic]
}

# Convert to DataFrame to export to CSV
df_basic = pd.DataFrame(dict_basic)

In [24]:
# Write the website check results to a CSV file, leaving the DataFrame index out
df_basic.to_csv(path_or_buf="website_result/result_100_500.csv", index=False)

---
### Revenue 1k ~ 5k

In [3]:
# Load the revenue 1k ~ 5k dataset
rev_1to5 = pd.read_csv(filepath_or_buffer='dataset/revenue_1k_5k.csv')

In [4]:
# Build the company domain list for the revenue 1k ~ 5k bucket
## One HTTP request is made per domain, so this is slow for long company lists
domain_series_1to5 = rev_1to5['Domain']
domain_1to5 = domain_series_1to5.dropna().to_list()

# Check each domain in order: 1 if the website exists, 0 otherwise
exist_1to5 = list(
    map(lambda domain: check_website_exists("http://" + domain), domain_1to5)
)

# Pair every domain with its flag
dict_1to5 = {"website": domain_1to5, "exist": exist_1to5}

# Turn the columns into a DataFrame so they can be exported to CSV
df_1to5 = pd.DataFrame(data=dict_1to5)

In [5]:
# Write the website check results to a CSV file, leaving the DataFrame index out
df_1to5.to_csv(path_or_buf="website_result/result_1k_5k.csv", index=False)