# Webscraping 2.0

Joseph Nelson, DC

In today's codealong, I'll walk through how to build a scraper using urllib and BeautifulSoup. We'll discover the problems with that approach that we discussed in the lesson readme, and we'll remedy them using Selenium, a browser-automation tool that drives a real (optionally headless) browser.

In [90]:
# import
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
import urllib

## Data_Analyst_DC_60_entry

In [91]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$60,000&l=Washington,+DC&radius=50&explvl=entry_level&limit=100"
salary = 60000
experience = "Entry_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_60_entry = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_60_entry

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,GEICO,"Chevy Chase, MD",60000.0,Data_Analyst,Entry_level
1,Evolent Health,"Arlington, VA 22203",60000.0,Data_Analyst,Entry_level
2,Datalogic Solutions Inc,"Chantilly, VA",60000.0,Data_Analyst,Entry_level
3,Cipher3,"Washington, DC",60000.0,Data_Analyst,Entry_level
4,"Interfolio, Inc","Washington, DC 20036 (Downtown area)",60000.0,Data_Analyst,Entry_level
5,JLL,"Washington, DC",60000.0,Data_Analyst,Entry_level
6,Goodwill Excel Center - DC,"Washington, DC",60000.0,Data_Analyst,Entry_level
7,Booz Allen Hamilton,"Washington, DC",60000.0,Data_Analyst,Entry_level
8,Grant Thornton,"Washington, DC 20036 (Downtown area)",60000.0,Data_Analyst,Entry_level
9,FAAZ Consulting LLC,"Washington, DC",60000.0,Data_Analyst,Entry_level


## Data_Analyst_DC_60_mid

In [92]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
# NOTE(review): this query carries a trailing %2B ("+") after the salary, unlike
# most other cells in this notebook -- confirm which form is intended.
url = "https://www.indeed.com/jobs?q=data+analyst+$60,000%2B&l=Washington,+DC&radius=50&explvl=mid_level&limit=100"
salary = 60000
experience = "Mid_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_60_mid = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_60_mid

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,ICF International,"Rockville, MD",60000.0,Data_Analyst,Mid_level
1,Truth Initiative,"Washington, DC 20001",60000.0,Data_Analyst,Mid_level
2,Net ESolutions Corporation (NETE),"Rockville, MD",60000.0,Data_Analyst,Mid_level
3,New Beginnings LLC,"Washington, DC 20002 (Capitol Hill area)",60000.0,Data_Analyst,Mid_level
4,Dixon Hughes Goodman LLP,"Washington, DC 20016 (AU-Tenleytown area)",60000.0,Data_Analyst,Mid_level
5,The World Bank,"Washington, DC",60000.0,Data_Analyst,Mid_level
6,Compumatics Group,"New Carrollton, MD",60000.0,Data_Analyst,Mid_level
7,DIT Inc,"Washington, DC",60000.0,Data_Analyst,Mid_level
8,Inforeem,"Washington, DC",60000.0,Data_Analyst,Mid_level
9,BCT Partners,"Washington, DC",60000.0,Data_Analyst,Mid_level


## Data_Analyst_DC_60_senior

In [93]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
# NOTE(review): this query carries a trailing %2B ("+") after the salary, unlike
# most other cells in this notebook -- confirm which form is intended.
url = "https://www.indeed.com/jobs?q=data+analyst+$60,000%2B&l=Washington,+DC&radius=50&explvl=senior_level&limit=100"
salary = 60000
experience = "Senior_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_60_senior = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_60_senior

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Deloitte,"Arlington, VA 22209",60000.0,Data_Analyst,Senior_level
1,The BOSS Group,"Washington, DC",60000.0,Data_Analyst,Senior_level
2,General Dynamics Information Technology,"Washington, DC",60000.0,Data_Analyst,Senior_level
3,Booz Allen Hamilton,"Washington, DC",60000.0,Data_Analyst,Senior_level
4,DataNet Systems Corporation,"Washington, DC",60000.0,Data_Analyst,Senior_level
5,ANSER,"Arlington, VA 22206 (Fairlington-Shirlington a...",60000.0,Data_Analyst,Senior_level
6,"Group Benefit Services, Inc.","Cockeysville, MD 21030",60000.0,Data_Analyst,Senior_level
7,Indev,"Washington, DC",60000.0,Data_Analyst,Senior_level
8,General Dynamics Information Technology,"Washington, DC",60000.0,Data_Analyst,Senior_level
9,General Dynamics Information Technology,"Towson, MD",60000.0,Data_Analyst,Senior_level


## Data_Analyst_DC_70_entry

In [94]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$70,000&l=Washington,+DC&radius=50&explvl=entry_level&limit=100"
salary = 70000
experience = "Entry_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_70_entry = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_70_entry

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,GEICO,"Chevy Chase, MD",70000.0,Data_Analyst,Entry_level
1,"The Upside Travel Company, LLC","Washington, DC",70000.0,Data_Analyst,Entry_level
2,Predictive Science,United States,70000.0,Data_Analyst,Entry_level
3,Cipher3,"Washington, DC",70000.0,Data_Analyst,Entry_level
4,"Interfolio, Inc","Washington, DC 20036 (Downtown area)",70000.0,Data_Analyst,Entry_level
5,JLL,"Washington, DC",70000.0,Data_Analyst,Entry_level
6,Booz Allen Hamilton,"Washington, DC",70000.0,Data_Analyst,Entry_level
7,Grant Thornton,"Washington, DC 20036 (Downtown area)",70000.0,Data_Analyst,Entry_level
8,FAAZ Consulting LLC,"Washington, DC",70000.0,Data_Analyst,Entry_level
9,Smithsonian Institution,"Washington, DC",70000.0,Data_Analyst,Entry_level


## Data_Analyst_DC_70_mid

In [95]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$70,000&l=Washington,+DC&radius=50&explvl=mid_level&limit=100"
salary = 70000
experience = "Mid_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_70_mid = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_70_mid

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Bay State,"Washington, DC",70000.0,Data_Analyst,Mid_level
1,OmniSolve Inc,"Washington, DC",70000.0,Data_Analyst,Mid_level
2,Salient CRGT,"Washington, DC",70000.0,Data_Analyst,Mid_level
3,Compumatics Group,"New Carrollton, MD",70000.0,Data_Analyst,Mid_level
4,The World Bank,"Washington, DC",70000.0,Data_Analyst,Mid_level
5,Dixon Hughes Goodman LLP,"Washington, DC 20016 (AU-Tenleytown area)",70000.0,Data_Analyst,Mid_level
6,DIT Inc,"Washington, DC",70000.0,Data_Analyst,Mid_level
7,Inforeem,"Washington, DC",70000.0,Data_Analyst,Mid_level
8,BCT Partners,"Washington, DC",70000.0,Data_Analyst,Mid_level
9,CAVA,"Washington, DC 20009 (U Street Corridor area)",70000.0,Data_Analyst,Mid_level


## Data_Analyst_DC_70_senior

In [96]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$70,000&l=Washington,+DC&radius=50&explvl=senior_level&limit=100"
salary = 70000
experience = "Senior_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_70_senior = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_70_senior

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Deloitte,"Arlington, VA 22209",70000.0,Data_Analyst,Senior_level
1,Salient CRGT,"Washington, DC",70000.0,Data_Analyst,Senior_level
2,General Dynamics Information Technology,"Springfield, VA",70000.0,Data_Analyst,Senior_level
3,Booz Allen Hamilton,"Washington, DC",70000.0,Data_Analyst,Senior_level
4,DataNet Systems Corporation,"Washington, DC",70000.0,Data_Analyst,Senior_level
5,"Group Benefit Services, Inc.","Cockeysville, MD 21030",70000.0,Data_Analyst,Senior_level
6,ANSER,"Arlington, VA 22206 (Fairlington-Shirlington a...",70000.0,Data_Analyst,Senior_level
7,General Dynamics Information Technology,"Towson, MD",70000.0,Data_Analyst,Senior_level
8,Indev,"Washington, DC",70000.0,Data_Analyst,Senior_level
9,PricewaterhouseCoopers LLC,"Washington, DC 20001 (Shaw area)",70000.0,Data_Analyst,Senior_level


## extra

In [97]:
# Toy frame for the string-splitting demo: each `location` value packs a
# country and a city into one underscore-separated token.
df = pd.DataFrame(
    [
        ('somedate', 'united_kingdom_london', 5),
        ('somedate', 'united_state_newyork', 5),
    ],
    columns=['Date', 'location', 'occurence'],
)

In [98]:
# Display the toy frame built in the previous cell.
df

Unnamed: 0,Date,location,occurence
0,somedate,united_kingdom_london,5
1,somedate,united_state_newyork,5


In [99]:
# Split each location on its LAST underscore: column 0 holds everything
# before it (the country, still underscore-separated) and column 2 holds
# what follows (the city).
_pieces = df['location'].str.rpartition('_')
df['Country'] = _pieces[0].str.replace("_", " ")
df['City'] = _pieces[2]
# Show the frame with the original packed column left out.
df[['Date','Country', 'City', 'occurence']]


Unnamed: 0,Date,Country,City,occurence
0,somedate,united kingdom,london,5
1,somedate,united state,newyork,5


## Data_Analyst_DC_75_entry

In [100]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$75,000&l=Washington,+DC&radius=50&explvl=entry_level&limit=100"
salary = 75000
experience = "Entry_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_75_entry = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_75_entry

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,GEICO,"Chevy Chase, MD",75000.0,Data_Analyst,Entry_level
1,"The Upside Travel Company, LLC","Washington, DC",75000.0,Data_Analyst,Entry_level
2,Datalogic Solutions Inc,"Chantilly, VA",75000.0,Data_Analyst,Entry_level
3,Cipher3,"Washington, DC",75000.0,Data_Analyst,Entry_level
4,Booz Allen Hamilton,"Washington, DC",75000.0,Data_Analyst,Entry_level
5,FAAZ Consulting LLC,"Washington, DC",75000.0,Data_Analyst,Entry_level
6,Booz Allen Hamilton,"Alexandria, VA",75000.0,Data_Analyst,Entry_level
7,Accenture,"Rosslyn, VA",75000.0,Data_Analyst,Entry_level
8,Booz Allen Hamilton,"McLean, VA",75000.0,Data_Analyst,Entry_level
9,Advanced Software Systems,"Arlington, VA",75000.0,Data_Analyst,Entry_level


## Data_Analyst_DC_75_mid

In [101]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$75,000&l=Washington,+DC&radius=50&explvl=mid_level&limit=100"
salary = 75000
experience = "Mid_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_75_mid = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_75_mid

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Unisys,"Reston, VA",75000.0,Data_Analyst,Mid_level
1,Salient CRGT,"Washington, DC",75000.0,Data_Analyst,Mid_level
2,Aderas,"Reston, VA 20191",75000.0,Data_Analyst,Mid_level
3,The World Bank,"Washington, DC",75000.0,Data_Analyst,Mid_level
4,Compumatics Group,"New Carrollton, MD",75000.0,Data_Analyst,Mid_level
5,DIT Inc,"Washington, DC",75000.0,Data_Analyst,Mid_level
6,Inforeem,"Washington, DC",75000.0,Data_Analyst,Mid_level
7,BCT Partners,"Washington, DC",75000.0,Data_Analyst,Mid_level
8,ICF International,"Fairfax, VA",75000.0,Data_Analyst,Mid_level
9,CAVA,"Washington, DC 20009 (U Street Corridor area)",75000.0,Data_Analyst,Mid_level


## Data_Analyst_DC_75_senior

In [102]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$75,000&l=Washington,+DC&radius=50&explvl=senior_level&limit=100"
salary = 75000
experience = "Senior_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_75_senior = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_75_senior

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Deloitte,"Arlington, VA 22209",75000.0,Data_Analyst,Senior_level
1,Salient CRGT,"Washington, DC",75000.0,Data_Analyst,Senior_level
2,General Dynamics Information Technology,"Springfield, VA",75000.0,Data_Analyst,Senior_level
3,Booz Allen Hamilton,"Washington, DC",75000.0,Data_Analyst,Senior_level
4,DataNet Systems Corporation,"Washington, DC",75000.0,Data_Analyst,Senior_level
5,"Group Benefit Services, Inc.","Cockeysville, MD 21030",75000.0,Data_Analyst,Senior_level
6,IMTAS,"Washington, DC 20535 (Downtown area)",75000.0,Data_Analyst,Senior_level
7,ANSER,"Arlington, VA 22206 (Fairlington-Shirlington a...",75000.0,Data_Analyst,Senior_level
8,General Dynamics Information Technology,"Towson, MD",75000.0,Data_Analyst,Senior_level
9,Indev,"Washington, DC",75000.0,Data_Analyst,Senior_level


## Data_Analyst_DC_85_entry

In [103]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$85,000&l=Washington,+DC&radius=50&explvl=entry_level&limit=100"
salary = 85000
experience = "Entry_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_85_entry = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_85_entry

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Evolent Health,"Arlington, VA 22203",85000.0,Data_Analyst,Entry_level
1,Veris Group,"Washington, DC",85000.0,Data_Analyst,Entry_level
2,Datalogic Solutions Inc,"Chantilly, VA",85000.0,Data_Analyst,Entry_level
3,Booz Allen Hamilton,"Washington, DC",85000.0,Data_Analyst,Entry_level
4,FAAZ Consulting LLC,"Washington, DC",85000.0,Data_Analyst,Entry_level
5,Booz Allen Hamilton,"Alexandria, VA",85000.0,Data_Analyst,Entry_level
6,Booz Allen Hamilton,"Washington, DC",85000.0,Data_Analyst,Entry_level
7,Booz Allen Hamilton,"Washington, DC",85000.0,Data_Analyst,Entry_level
8,DISYS,"McLean, VA",85000.0,Data_Analyst,Entry_level
9,Boecore,"Arlington, VA 22201 (Lyon Village area)",85000.0,Data_Analyst,Entry_level


## Data_Analyst_DC_85_mid 

In [104]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$85,000&l=Washington,+DC&radius=50&explvl=mid_level&limit=100"
salary = 85000
experience = "Mid_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_85_mid = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_85_mid

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,OmniSolve Inc,"Washington, DC",85000.0,Data_Analyst,Mid_level
1,Aderas,"Reston, VA 20191",85000.0,Data_Analyst,Mid_level
2,"FTI Consulting, Inc.","McLean, VA",85000.0,Data_Analyst,Mid_level
3,The World Bank,"Washington, DC",85000.0,Data_Analyst,Mid_level
4,Compumatics Group,"New Carrollton, MD",85000.0,Data_Analyst,Mid_level
5,Booz Allen Hamilton,"Quantico, VA",85000.0,Data_Analyst,Mid_level
6,Inforeem,"Washington, DC",85000.0,Data_Analyst,Mid_level
7,Booz Allen Hamilton,"Washington, DC",85000.0,Data_Analyst,Mid_level
8,Legal Services Corporation,"Washington, DC",85000.0,Data_Analyst,Mid_level
9,Excella Consulting,"Washington, DC 20006 (Foggy Bottom area)",85000.0,Data_Analyst,Mid_level


## Data_Analyst_DC_85_senior

In [105]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$85,000&l=Washington,+DC&radius=50&explvl=senior_level&limit=100"
salary = 85000
experience = "Senior_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_85_senior = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_85_senior

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Wyle Laboratories,"Anacostia, DC",85000.0,Data_Analyst,Senior_level
1,General Dynamics Information Technology,"Washington, DC",85000.0,Data_Analyst,Senior_level
2,General Dynamics Information Technology,"Suitland, MD",85000.0,Data_Analyst,Senior_level
3,Booz Allen Hamilton,"Washington, DC",85000.0,Data_Analyst,Senior_level
4,DataNet Systems Corporation,"Washington, DC",85000.0,Data_Analyst,Senior_level
5,"Universal Strategy Group, Inc (USGI)","Washington, DC 20535 (Downtown area)",85000.0,Data_Analyst,Senior_level
6,SC3,"Washington, DC",85000.0,Data_Analyst,Senior_level
7,ANSER,"Arlington, VA 22206 (Fairlington-Shirlington a...",85000.0,Data_Analyst,Senior_level
8,Indev,"Washington, DC",85000.0,Data_Analyst,Senior_level
9,IMTAS,"Washington, DC 20535 (Downtown area)",85000.0,Data_Analyst,Senior_level


## Data_Analyst_DC_100_entry

In [106]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$100,000&l=Washington,+DC&radius=50&explvl=entry_level&limit=100"
salary = 100000
experience = "Entry_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_100_entry = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_100_entry

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Datalogic Solutions Inc,"Chantilly, VA",100000.0,Data_Analyst,Entry_level
1,Engility Corporation,"Washington, DC 20376",100000.0,Data_Analyst,Entry_level
2,Deloitte,"Arlington, VA 22209",100000.0,Data_Analyst,Entry_level
3,Booz Allen Hamilton,"Washington, DC",100000.0,Data_Analyst,Entry_level
4,FAAZ Consulting LLC,"Washington, DC",100000.0,Data_Analyst,Entry_level
5,Booz Allen Hamilton,"Alexandria, VA",100000.0,Data_Analyst,Entry_level
6,DISYS,"McLean, VA",100000.0,Data_Analyst,Entry_level
7,Insequence Inc.,"Herndon, VA 20171",100000.0,Data_Analyst,Entry_level
8,Booz Allen Hamilton,"Washington, DC",100000.0,Data_Analyst,Entry_level
9,Vox Media,"Washington, DC",100000.0,Data_Analyst,Entry_level


## Data_Analyst_DC_100_mid

In [107]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$100,000&l=Washington,+DC&radius=50&explvl=mid_level&limit=100"
salary = 100000
experience = "Mid_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_100_mid = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_100_mid

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,OmniSolve Inc,"Washington, DC",100000.0,Data_Analyst,Mid_level
1,RCM Solutions,"Washington, DC",100000.0,Data_Analyst,Mid_level
2,All Things Considered Staffing (ATC-S),"Washington, DC",100000.0,Data_Analyst,Mid_level
3,The World Bank,"Washington, DC",100000.0,Data_Analyst,Mid_level
4,Compumatics Group,"New Carrollton, MD",100000.0,Data_Analyst,Mid_level
5,Booz Allen Hamilton,"Washington, DC",100000.0,Data_Analyst,Mid_level
6,Excella Consulting,"Washington, DC 20006 (Foggy Bottom area)",100000.0,Data_Analyst,Mid_level
7,Booz Allen Hamilton,"College Park, MD 20740",100000.0,Data_Analyst,Mid_level
8,Engility Corporation,"Washington, DC 20376 (Brentwood area)",100000.0,Data_Analyst,Mid_level
9,The World Bank,"Washington, DC",100000.0,Data_Analyst,Mid_level


## Data_Analyst_DC_100_senior

In [108]:
import pandas as pd

# Search parameters for this cell; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$100,000&l=Washington,+DC&radius=50&explvl=senior_level&limit=100"
salary = 100000
experience = "Senior_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_DC_100_senior = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_DC_100_senior

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,General Dynamics Information Technology,"Springfield, VA",100000.0,Data_Analyst,Senior_level
1,WSSC,"Laurel, MD 20707",100000.0,Data_Analyst,Senior_level
2,Engility Corporation,"Washington, DC",100000.0,Data_Analyst,Senior_level
3,Booz Allen Hamilton,"Washington, DC",100000.0,Data_Analyst,Senior_level
4,DataNet Systems Corporation,"Washington, DC",100000.0,Data_Analyst,Senior_level
5,Indev,"Washington, DC",100000.0,Data_Analyst,Senior_level
6,Vencore,"McLean, VA",100000.0,Data_Analyst,Senior_level
7,Garud Technology Services Inc.,"Washington, DC",100000.0,Data_Analyst,Senior_level
8,Booz Allen Hamilton,"Washington, DC",100000.0,Data_Analyst,Senior_level
9,IMTAS,"Washington, DC 20001 (Shaw area)",100000.0,Data_Analyst,Senior_level


## New York City

In [109]:
## Data_Analyst_NY_55_entry

import pandas as pd

# Search parameters for this section; the three constants are written into every row.
url = "https://www.indeed.com/jobs?q=data+analyst+$55,000&l=New+York,+NY&radius=50&explvl=entry_level&limit=100"
salary = 55000
experience = "Entry_level"
title = "Data_Analyst"

# Fetch the rendered page. quit() inside finally releases the browser session
# even if the page load raises; close() only closes the current window.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait one second before reading the page
    sleep(1)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the text of every company / location span with newlines removed.
# Building plain lists first avoids growing a DataFrame one row at a time.
companies = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'company'})]
locations = [tag.text.replace('\n', '') for tag in html.find_all('span', {'class': 'location'})]

dcarea_job1 = pd.DataFrame({"Company_name": companies}, columns=["Company_name"])
dcarea_job2 = pd.DataFrame(
    {"Location": locations, "Salary": salary, "Title": title, "Experience": experience},
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are scraped independently and paired
# purely by row position; if a posting lacks one of the two spans the rows
# will be misaligned. Confirm against the page markup.
result_NY_55_entry = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_NY_55_entry

## Data_Analyst_NY_55_mid

driver = webdriver.Chrome()
url = "https://www.indeed.com/jobs?q=data+analyst+$55,000&l=New+York,+NY&radius=50&explvl=mid_level&limit=100"
driver.get(url)
# wait one second
sleep(1)
#grab the page source
html = driver.page_source
# BeautifulSoup it!
html = BeautifulSoup(html)
driver.close()

# I'm going to create my empty df first
import pandas as pd
dcarea_job1 = pd.DataFrame(columns=["Company_name"])
dcarea_job2 = pd.DataFrame(columns=["Location","Salary", "Title","Experience"])
# loop through each entry
for company in html.find_all('span', {'class':'company'}):
    name_comp = company.text
    dcarea_job1.loc[len(dcarea_job1)]=[name_comp]
    
for location in html.find_all('span', {'class':'location'}):
    loc_comp = location.text
    salary = 55000
    experience = "Mid_level"
    # try to find the number of bookings
    title = "Data_Analyst"
    dcarea_job2.loc[len(dcarea_job2)]=[loc_comp,salary,title,experience]

dcarea_job1['Company_name'] = dcarea_job1['Company_name'].str.replace('\n', '')
dcarea_job2['Location'] = dcarea_job2['Location'].str.replace('\n', '')

result_NY_55_mid = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_NY_55_mid

## Data_Analyst_NY_55_senior

driver = webdriver.Chrome()
url = "https://www.indeed.com/jobs?q=data+analyst+$55,000&l=New+York,+NY&radius=50&explvl=senior_level&limit=100"
driver.get(url)
# wait one second
sleep(1)
#grab the page source
html = driver.page_source
# BeautifulSoup it!
html = BeautifulSoup(html)
driver.close()

# I'm going to create my empty df first
import pandas as pd
dcarea_job1 = pd.DataFrame(columns=["Company_name"])
dcarea_job2 = pd.DataFrame(columns=["Location","Salary", "Title","Experience"])
# loop through each entry
for company in html.find_all('span', {'class':'company'}):
    name_comp = company.text
    dcarea_job1.loc[len(dcarea_job1)]=[name_comp]
    
for location in html.find_all('span', {'class':'location'}):
    loc_comp = location.text
    salary = 55000
    experience = "Senior_level"
    # try to find the number of bookings
    title = "Data_Analyst"
    dcarea_job2.loc[len(dcarea_job2)]=[loc_comp,salary,title,experience]

dcarea_job1['Company_name'] = dcarea_job1['Company_name'].str.replace('\n', '')
dcarea_job2['Location'] = dcarea_job2['Location'].str.replace('\n', '')

result_NY_55_senior = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_NY_55_senior

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,TD Ameritrade,"Jersey City, NJ",55000.0,Data_Analyst,Senior_level
1,Accenture,"New York, NY 10011",55000.0,Data_Analyst,Senior_level
2,Crystal Run Healthcare,"Middletown, NY 10941",55000.0,Data_Analyst,Senior_level
3,Veterans Sourcing Group,"Brooklyn, NY 11217 (Boerum Hill area)",55000.0,Data_Analyst,Senior_level
4,Grubhub,"New York, NY",55000.0,Data_Analyst,Senior_level
5,MassMutual Financial Group,"New York, NY",55000.0,Data_Analyst,Senior_level
6,American Express,"New York, NY",55000.0,Data_Analyst,Senior_level
7,MAYORS OFFICE OF CONTRACT SVCS,"Manhattan, NY",55000.0,Data_Analyst,Senior_level
8,FAIR Health,"New York, NY 10017 (Midtown area)",55000.0,Data_Analyst,Senior_level
9,Dow Jones,"New York, NY 10176 (Murray Hill area)",55000.0,Data_Analyst,Senior_level


In [110]:
## Data_Analyst_NY_70_entry / _mid / _senior

import pandas as pd


def _scrape_indeed(url, salary, experience, title="Data_Analyst"):
    """Scrape one Indeed results page into a DataFrame.

    Opens ``url`` in Chrome, parses the rendered page source and collects the
    text of every ``<span class="company">`` and ``<span class="location">``.

    Parameters
    ----------
    url : str
        Indeed search URL to load.
    salary : number
        Salary of the search; stored as a float in the ``Salary`` column.
    experience : str
        Label stored in the ``Experience`` column.
    title : str
        Label stored in the ``Title`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Company_name``, ``Location``, ``Salary``, ``Title`` and
        ``Experience``; one row per scraped span, paired by position.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # wait one second
        sleep(1)
        # name the parser so bs4 does not silently pick whichever is installed
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # quit() also ends the chromedriver process; close() only closed the
        # window and leaked one process per scrape
        driver.quit()

    companies = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'company'})]
    locations = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'location'})]

    company_df = pd.DataFrame({"Company_name": companies})
    detail_df = pd.DataFrame(
        [[place, float(salary), title, experience] for place in locations],
        columns=["Location", "Salary", "Title", "Experience"],
    )
    # NOTE(review): companies and locations are collected independently and
    # paired by position, so a posting missing either span shifts later rows.
    return pd.concat([company_df, detail_df], axis=1)


_ny_70_url = ("https://www.indeed.com/jobs?q=data+analyst+$70,000"
              "&l=New+York,+NY&radius=50&explvl={level}&limit=100")

result_NY_70_entry = _scrape_indeed(
    _ny_70_url.format(level="entry_level"), 70000, "Entry_level")
result_NY_70_mid = _scrape_indeed(
    _ny_70_url.format(level="mid_level"), 70000, "Mid_level")
result_NY_70_senior = _scrape_indeed(
    _ny_70_url.format(level="senior_level"), 70000, "Senior_level")

result_NY_70_senior


Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,TD Ameritrade,"Jersey City, NJ",70000.0,Data_Analyst,Senior_level
1,Prudential,"Newark, NJ 07101",70000.0,Data_Analyst,Senior_level
2,TD Bank,"Mount Laurel, NJ",70000.0,Data_Analyst,Senior_level
3,Veterans Sourcing Group,"Brooklyn, NY 11217 (Boerum Hill area)",70000.0,Data_Analyst,Senior_level
4,Grubhub,"New York, NY",70000.0,Data_Analyst,Senior_level
5,MassMutual Financial Group,"New York, NY",70000.0,Data_Analyst,Senior_level
6,American Express,"New York, NY",70000.0,Data_Analyst,Senior_level
7,MAYORS OFFICE OF CONTRACT SVCS,"Manhattan, NY",70000.0,Data_Analyst,Senior_level
8,FAIR Health,"New York, NY 10017 (Midtown area)",70000.0,Data_Analyst,Senior_level
9,Razorfish,"New York, NY",70000.0,Data_Analyst,Senior_level


In [111]:
## Data_Analyst_NY_85_entry / _mid / _senior

import pandas as pd


def _scrape_indeed(url, salary, experience, title="Data_Analyst"):
    """Scrape one Indeed results page into a DataFrame.

    Opens ``url`` in Chrome, parses the rendered page source and collects the
    text of every ``<span class="company">`` and ``<span class="location">``.

    Parameters
    ----------
    url : str
        Indeed search URL to load.
    salary : number
        Salary of the search; stored as a float in the ``Salary`` column.
    experience : str
        Label stored in the ``Experience`` column.
    title : str
        Label stored in the ``Title`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Company_name``, ``Location``, ``Salary``, ``Title`` and
        ``Experience``; one row per scraped span, paired by position.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # wait one second
        sleep(1)
        # name the parser so bs4 does not silently pick whichever is installed
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # quit() also ends the chromedriver process; close() only closed the
        # window and leaked one process per scrape
        driver.quit()

    companies = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'company'})]
    locations = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'location'})]

    company_df = pd.DataFrame({"Company_name": companies})
    detail_df = pd.DataFrame(
        [[place, float(salary), title, experience] for place in locations],
        columns=["Location", "Salary", "Title", "Experience"],
    )
    # NOTE(review): companies and locations are collected independently and
    # paired by position, so a posting missing either span shifts later rows.
    return pd.concat([company_df, detail_df], axis=1)


_ny_85_url = ("https://www.indeed.com/jobs?q=data+analyst+$85,000"
              "&l=New+York,+NY&radius=50&explvl={level}&limit=100")

result_NY_85_entry = _scrape_indeed(
    _ny_85_url.format(level="entry_level"), 85000, "Entry_level")
result_NY_85_mid = _scrape_indeed(
    _ny_85_url.format(level="mid_level"), 85000, "Mid_level")
result_NY_85_senior = _scrape_indeed(
    _ny_85_url.format(level="senior_level"), 85000, "Senior_level")

result_NY_85_senior


Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Crystal Run Healthcare,"Middletown, NY 10941",85000.0,Data_Analyst,Senior_level
1,Accenture,"New York, NY 10011",85000.0,Data_Analyst,Senior_level
2,Weill Cornell Medical College,"New York, NY",85000.0,Data_Analyst,Senior_level
3,Grubhub,"New York, NY",85000.0,Data_Analyst,Senior_level
4,American Express,"New York, NY",85000.0,Data_Analyst,Senior_level
5,Veterans Sourcing Group,"Berkeley Heights, NJ 07922",85000.0,Data_Analyst,Senior_level
6,MassMutual Financial Group,"New York, NY",85000.0,Data_Analyst,Senior_level
7,FAIR Health,"New York, NY 10017 (Midtown area)",85000.0,Data_Analyst,Senior_level
8,Sapient Global Markets,"New York, NY",85000.0,Data_Analyst,Senior_level
9,Undertone,"New York, NY 10173 (Midtown area)",85000.0,Data_Analyst,Senior_level


In [112]:
## Data_Analyst_NY_95_entry / _mid / _senior
# (the mid-level section was previously mislabelled "NY_125_mid"; it searches
# $95,000 like the other two)

import pandas as pd


def _scrape_indeed(url, salary, experience, title="Data_Analyst"):
    """Scrape one Indeed results page into a DataFrame.

    Opens ``url`` in Chrome, parses the rendered page source and collects the
    text of every ``<span class="company">`` and ``<span class="location">``.

    Parameters
    ----------
    url : str
        Indeed search URL to load.
    salary : number
        Salary of the search; stored as a float in the ``Salary`` column.
    experience : str
        Label stored in the ``Experience`` column.
    title : str
        Label stored in the ``Title`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Company_name``, ``Location``, ``Salary``, ``Title`` and
        ``Experience``; one row per scraped span, paired by position.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # wait one second
        sleep(1)
        # name the parser so bs4 does not silently pick whichever is installed
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # quit() also ends the chromedriver process; close() only closed the
        # window and leaked one process per scrape
        driver.quit()

    companies = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'company'})]
    locations = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'location'})]

    company_df = pd.DataFrame({"Company_name": companies})
    detail_df = pd.DataFrame(
        [[place, float(salary), title, experience] for place in locations],
        columns=["Location", "Salary", "Title", "Experience"],
    )
    # NOTE(review): companies and locations are collected independently and
    # paired by position, so a posting missing either span shifts later rows.
    return pd.concat([company_df, detail_df], axis=1)


_ny_95_url = ("https://www.indeed.com/jobs?q=data+analyst+$95,000"
              "&l=New+York,+NY&radius=50&explvl={level}&limit=100")

result_NY_95_entry = _scrape_indeed(
    _ny_95_url.format(level="entry_level"), 95000, "Entry_level")
result_NY_95_mid = _scrape_indeed(
    _ny_95_url.format(level="mid_level"), 95000, "Mid_level")
result_NY_95_senior = _scrape_indeed(
    _ny_95_url.format(level="senior_level"), 95000, "Senior_level")

result_NY_95_senior



Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Prudential,"Newark, NJ 07102",95000.0,Data_Analyst,Senior_level
1,Crystal Run Healthcare,"Middletown, NY 10941",95000.0,Data_Analyst,Senior_level
2,Citi,"Long Island City, NY",95000.0,Data_Analyst,Senior_level
3,MassMutual Financial Group,"New York, NY",95000.0,Data_Analyst,Senior_level
4,FAIR Health,"New York, NY 10017 (Midtown area)",95000.0,Data_Analyst,Senior_level
5,Veterans Sourcing Group,"Berkeley Heights, NJ 07922",95000.0,Data_Analyst,Senior_level
6,Grubhub,"New York, NY",95000.0,Data_Analyst,Senior_level
7,Major League Baseball Advanced Media,"New York, NY",95000.0,Data_Analyst,Senior_level
8,Sapient Global Markets,"New York, NY",95000.0,Data_Analyst,Senior_level
9,WeWork,"New York, NY 10005 (Financial District area)",95000.0,Data_Analyst,Senior_level


In [113]:
## Data_Analyst_NY_115_entry / _mid / _senior

import pandas as pd


def _scrape_indeed(url, salary, experience, title="Data_Analyst"):
    """Scrape one Indeed results page into a DataFrame.

    Opens ``url`` in Chrome, parses the rendered page source and collects the
    text of every ``<span class="company">`` and ``<span class="location">``.

    Parameters
    ----------
    url : str
        Indeed search URL to load.
    salary : number
        Salary of the search; stored as a float in the ``Salary`` column.
    experience : str
        Label stored in the ``Experience`` column.
    title : str
        Label stored in the ``Title`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Company_name``, ``Location``, ``Salary``, ``Title`` and
        ``Experience``; one row per scraped span, paired by position.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # wait one second
        sleep(1)
        # name the parser so bs4 does not silently pick whichever is installed
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # quit() also ends the chromedriver process; close() only closed the
        # window and leaked one process per scrape
        driver.quit()

    companies = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'company'})]
    locations = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'location'})]

    company_df = pd.DataFrame({"Company_name": companies})
    detail_df = pd.DataFrame(
        [[place, float(salary), title, experience] for place in locations],
        columns=["Location", "Salary", "Title", "Experience"],
    )
    # NOTE(review): companies and locations are collected independently and
    # paired by position, so a posting missing either span shifts later rows.
    return pd.concat([company_df, detail_df], axis=1)


_ny_115_url = ("https://www.indeed.com/jobs?q=data+analyst+$115,000"
               "&l=New+York,+NY&radius=50&explvl={level}&limit=100")

result_NY_115_entry = _scrape_indeed(
    _ny_115_url.format(level="entry_level"), 115000, "Entry_level")
result_NY_115_mid = _scrape_indeed(
    _ny_115_url.format(level="mid_level"), 115000, "Mid_level")
result_NY_115_senior = _scrape_indeed(
    _ny_115_url.format(level="senior_level"), 115000, "Senior_level")

result_NY_115_senior



Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Prudential,"Newark, NJ 07102",115000.0,Data_Analyst,Senior_level
1,Citi,"Long Island City, NY",115000.0,Data_Analyst,Senior_level
2,Princeton Consulting,"New York, NY 10018",115000.0,Data_Analyst,Senior_level
3,Grubhub,"New York, NY",115000.0,Data_Analyst,Senior_level
4,Sapient Global Markets,"New York, NY",115000.0,Data_Analyst,Senior_level
5,Major League Baseball Advanced Media,"New York, NY",115000.0,Data_Analyst,Senior_level
6,Capital One,"New York, NY",115000.0,Data_Analyst,Senior_level
7,Paperless Post,"New York, NY",115000.0,Data_Analyst,Senior_level
8,JPMorgan Chase,"New York, NY",115000.0,Data_Analyst,Senior_level
9,WeWork,"New York, NY 10005 (Financial District area)",115000.0,Data_Analyst,Senior_level


## San Francisco

In [114]:
## Data_Analyst_SF_60_entry / _mid / _senior

import pandas as pd


def _scrape_indeed(url, salary, experience, title="Data_Analyst"):
    """Scrape one Indeed results page into a DataFrame.

    Opens ``url`` in Chrome, parses the rendered page source and collects the
    text of every ``<span class="company">`` and ``<span class="location">``.

    Parameters
    ----------
    url : str
        Indeed search URL to load.
    salary : number
        Salary of the search; stored as a float in the ``Salary`` column.
    experience : str
        Label stored in the ``Experience`` column.
    title : str
        Label stored in the ``Title`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Company_name``, ``Location``, ``Salary``, ``Title`` and
        ``Experience``; one row per scraped span, paired by position.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # wait one second
        sleep(1)
        # name the parser so bs4 does not silently pick whichever is installed
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # quit() also ends the chromedriver process; close() only closed the
        # window and leaked one process per scrape
        driver.quit()

    companies = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'company'})]
    locations = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'location'})]

    company_df = pd.DataFrame({"Company_name": companies})
    detail_df = pd.DataFrame(
        [[place, float(salary), title, experience] for place in locations],
        columns=["Location", "Salary", "Title", "Experience"],
    )
    # NOTE(review): companies and locations are collected independently and
    # paired by position, so a posting missing either span shifts later rows.
    return pd.concat([company_df, detail_df], axis=1)


_sf_60_url = ("https://www.indeed.com/jobs?q=data+analyst+$60,000"
              "&l=San+Francisco,+CA&radius=50&explvl={level}&limit=100")

result_SF_60_entry = _scrape_indeed(
    _sf_60_url.format(level="entry_level"), 60000, "Entry_level")
result_SF_60_mid = _scrape_indeed(
    _sf_60_url.format(level="mid_level"), 60000, "Mid_level")
result_SF_60_senior = _scrape_indeed(
    _sf_60_url.format(level="senior_level"), 60000, "Senior_level")

result_SF_60_senior

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Freedom Financial Network,"San Mateo, CA 94402",60000.0,Data_Analyst,Senior_level
1,Stanford Health Care,"Palo Alto, CA 94305",60000.0,Data_Analyst,Senior_level
2,iRhythm,"San Francisco, CA",60000.0,Data_Analyst,Senior_level
3,Genentech,"South San Francisco, CA",60000.0,Data_Analyst,Senior_level
4,iSoft Solutions,"San Francisco, CA",60000.0,Data_Analyst,Senior_level
5,Athleta,"San Francisco, CA",60000.0,Data_Analyst,Senior_level
6,"Cisco Systems, Inc.","San Jose, CA",60000.0,Data_Analyst,Senior_level
7,Intuit,"Mountain View, CA 94039",60000.0,Data_Analyst,Senior_level
8,Nuna,"San Francisco, CA",60000.0,Data_Analyst,Senior_level
9,Google,"Mountain View, CA",60000.0,Data_Analyst,Senior_level


In [115]:
## Data_Analyst_SF_75_entry / _mid / _senior

import pandas as pd


def _scrape_indeed(url, salary, experience, title="Data_Analyst"):
    """Scrape one Indeed results page into a DataFrame.

    Opens ``url`` in Chrome, parses the rendered page source and collects the
    text of every ``<span class="company">`` and ``<span class="location">``.

    Parameters
    ----------
    url : str
        Indeed search URL to load.
    salary : number
        Salary of the search; stored as a float in the ``Salary`` column.
    experience : str
        Label stored in the ``Experience`` column.
    title : str
        Label stored in the ``Title`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Company_name``, ``Location``, ``Salary``, ``Title`` and
        ``Experience``; one row per scraped span, paired by position.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # wait one second
        sleep(1)
        # name the parser so bs4 does not silently pick whichever is installed
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # quit() also ends the chromedriver process; close() only closed the
        # window and leaked one process per scrape
        driver.quit()

    companies = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'company'})]
    locations = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'location'})]

    company_df = pd.DataFrame({"Company_name": companies})
    detail_df = pd.DataFrame(
        [[place, float(salary), title, experience] for place in locations],
        columns=["Location", "Salary", "Title", "Experience"],
    )
    # NOTE(review): companies and locations are collected independently and
    # paired by position, so a posting missing either span shifts later rows.
    return pd.concat([company_df, detail_df], axis=1)


_sf_75_url = ("https://www.indeed.com/jobs?q=data+analyst+$75,000"
              "&l=San+Francisco,+CA&radius=50&explvl={level}&limit=100")

result_SF_75_entry = _scrape_indeed(
    _sf_75_url.format(level="entry_level"), 75000, "Entry_level")
result_SF_75_mid = _scrape_indeed(
    _sf_75_url.format(level="mid_level"), 75000, "Mid_level")
result_SF_75_senior = _scrape_indeed(
    _sf_75_url.format(level="senior_level"), 75000, "Senior_level")

result_SF_75_senior

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Freedom Financial Network,"San Mateo, CA 94402",75000.0,Data_Analyst,Senior_level
1,Stanford Health Care,"Palo Alto, CA 94305",75000.0,Data_Analyst,Senior_level
2,Genentech,"South San Francisco, CA",75000.0,Data_Analyst,Senior_level
3,iSoft Solutions,"San Francisco, CA",75000.0,Data_Analyst,Senior_level
4,Genentech,"South San Francisco, CA",75000.0,Data_Analyst,Senior_level
5,"Cisco Systems, Inc.","San Jose, CA",75000.0,Data_Analyst,Senior_level
6,Athleta,"San Francisco, CA",75000.0,Data_Analyst,Senior_level
7,Intuit,"Mountain View, CA 94039",75000.0,Data_Analyst,Senior_level
8,Nuna,"San Francisco, CA",75000.0,Data_Analyst,Senior_level
9,Google,"Mountain View, CA",75000.0,Data_Analyst,Senior_level


In [116]:
## Data_Analyst_SF_85_entry / _mid

import pandas as pd


def _scrape_indeed(url, salary, experience, title="Data_Analyst"):
    """Scrape one Indeed results page into a DataFrame.

    Opens ``url`` in Chrome, parses the rendered page source and collects the
    text of every ``<span class="company">`` and ``<span class="location">``.

    Parameters
    ----------
    url : str
        Indeed search URL to load.
    salary : number
        Salary of the search; stored as a float in the ``Salary`` column.
    experience : str
        Label stored in the ``Experience`` column.
    title : str
        Label stored in the ``Title`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Company_name``, ``Location``, ``Salary``, ``Title`` and
        ``Experience``; one row per scraped span, paired by position.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # wait one second
        sleep(1)
        # name the parser so bs4 does not silently pick whichever is installed
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # quit() also ends the chromedriver process; close() only closed the
        # window and leaked one process per scrape
        driver.quit()

    companies = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'company'})]
    locations = [span.text.replace('\n', '')
                 for span in soup.find_all('span', {'class': 'location'})]

    company_df = pd.DataFrame({"Company_name": companies})
    detail_df = pd.DataFrame(
        [[place, float(salary), title, experience] for place in locations],
        columns=["Location", "Salary", "Title", "Experience"],
    )
    # NOTE(review): companies and locations are collected independently and
    # paired by position, so a posting missing either span shifts later rows.
    return pd.concat([company_df, detail_df], axis=1)


_sf_85_url = ("https://www.indeed.com/jobs?q=data+analyst+$85,000"
              "&l=San+Francisco,+CA&radius=50&explvl={level}&limit=100")

result_SF_85_entry = _scrape_indeed(
    _sf_85_url.format(level="entry_level"), 85000, "Entry_level")
result_SF_85_mid = _scrape_indeed(
    _sf_85_url.format(level="mid_level"), 85000, "Mid_level")

result_SF_85_mid

## Data_Analyst_SF_85_senior

# Scrape one Indeed results page (query "data analyst $85,000", San Francisco,
# explvl=senior_level) and tabulate the company and location of each posting.
driver = webdriver.Chrome()
url = "https://www.indeed.com/jobs?q=data+analyst+$85,000&l=San+Francisco,+CA&radius=50&explvl=senior_level&limit=100"
try:
    driver.get(url)
    # wait one second before reading the page source
    sleep(1)
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # quit() ends the whole WebDriver session, and the finally block makes
    # sure the browser is shut down even when loading the page fails.
    driver.quit()

import pandas as pd

# Labels shared by every row from this query.
salary = 85000
experience = "Senior_level"
title = "Data_Analyst"

# Text of each matching span with embedded newlines removed.
company_names = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'company'})
]
locations = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'location'})
]

# Build each frame in one step instead of growing it row by row.
dcarea_job1 = pd.DataFrame(company_names, columns=["Company_name"])
# Salary is stored as a float, as in the notebook's displayed result tables.
dcarea_job2 = pd.DataFrame(
    [(place, float(salary), title, experience) for place in locations],
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are collected separately and joined
# by row position only; if the page yields a different number of each, rows
# will be mismatched or padded with NaN -- confirm against the page markup.
result_SF_85_senior = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_SF_85_senior

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Freedom Financial Network,"San Mateo, CA 94402",85000.0,Data_Analyst,Senior_level
1,Genentech,"South San Francisco, CA",85000.0,Data_Analyst,Senior_level
2,intuit,"Mountain View, CA 94043",85000.0,Data_Analyst,Senior_level
3,iSoft Solutions,"San Francisco, CA",85000.0,Data_Analyst,Senior_level
4,Genentech,"South San Francisco, CA",85000.0,Data_Analyst,Senior_level
5,"Cisco Systems, Inc.","San Jose, CA",85000.0,Data_Analyst,Senior_level
6,Athleta,"San Francisco, CA",85000.0,Data_Analyst,Senior_level
7,Intuit,"Mountain View, CA 94039",85000.0,Data_Analyst,Senior_level
8,Nuna,"San Francisco, CA",85000.0,Data_Analyst,Senior_level
9,Google,"Mountain View, CA",85000.0,Data_Analyst,Senior_level


In [117]:
## Data_Analyst_SF_100_entry

# Scrape one Indeed results page (query "data analyst $100,000", San Francisco,
# explvl=entry_level) and tabulate the company and location of each posting.
driver = webdriver.Chrome()
url = "https://www.indeed.com/jobs?q=data+analyst+$100,000&l=San+Francisco,+CA&radius=50&explvl=entry_level&limit=100"
try:
    driver.get(url)
    # wait one second before reading the page source
    sleep(1)
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # quit() ends the whole WebDriver session, and the finally block makes
    # sure the browser is shut down even when loading the page fails.
    driver.quit()

import pandas as pd

# Labels shared by every row from this query.
salary = 100000
experience = "Entry_level"
title = "Data_Analyst"

# Text of each matching span with embedded newlines removed.
company_names = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'company'})
]
locations = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'location'})
]

# Build each frame in one step instead of growing it row by row.
dcarea_job1 = pd.DataFrame(company_names, columns=["Company_name"])
# Salary is stored as a float, as in the notebook's displayed result tables.
dcarea_job2 = pd.DataFrame(
    [(place, float(salary), title, experience) for place in locations],
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are collected separately and joined
# by row position only; if the page yields a different number of each, rows
# will be mismatched or padded with NaN -- confirm against the page markup.
result_SF_100_entry = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_SF_100_entry

## Data_Analyst_SF_100_mid

# Scrape one Indeed results page (query "data analyst $100,000", San Francisco,
# explvl=mid_level) and tabulate the company and location of each posting.
driver = webdriver.Chrome()
url = "https://www.indeed.com/jobs?q=data+analyst+$100,000&l=San+Francisco,+CA&radius=50&explvl=mid_level&limit=100"
try:
    driver.get(url)
    # wait one second before reading the page source
    sleep(1)
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # quit() ends the whole WebDriver session, and the finally block makes
    # sure the browser is shut down even when loading the page fails.
    driver.quit()

import pandas as pd

# Labels shared by every row from this query.
salary = 100000
experience = "Mid_level"
title = "Data_Analyst"

# Text of each matching span with embedded newlines removed.
company_names = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'company'})
]
locations = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'location'})
]

# Build each frame in one step instead of growing it row by row.
dcarea_job1 = pd.DataFrame(company_names, columns=["Company_name"])
# Salary is stored as a float, as in the notebook's displayed result tables.
dcarea_job2 = pd.DataFrame(
    [(place, float(salary), title, experience) for place in locations],
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are collected separately and joined
# by row position only; if the page yields a different number of each, rows
# will be mismatched or padded with NaN -- confirm against the page markup.
result_SF_100_mid = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_SF_100_mid

## Data_Analyst_SF_100_senior

# Scrape one Indeed results page (query "data analyst $100,000", San Francisco,
# explvl=senior_level) and tabulate the company and location of each posting.
driver = webdriver.Chrome()
url = "https://www.indeed.com/jobs?q=data+analyst+$100,000&l=San+Francisco,+CA&radius=50&explvl=senior_level&limit=100"
try:
    driver.get(url)
    # wait one second before reading the page source
    sleep(1)
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # quit() ends the whole WebDriver session, and the finally block makes
    # sure the browser is shut down even when loading the page fails.
    driver.quit()

import pandas as pd

# Labels shared by every row from this query.
salary = 100000
experience = "Senior_level"
title = "Data_Analyst"

# Text of each matching span with embedded newlines removed.
company_names = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'company'})
]
locations = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'location'})
]

# Build each frame in one step instead of growing it row by row.
dcarea_job1 = pd.DataFrame(company_names, columns=["Company_name"])
# Salary is stored as a float, as in the notebook's displayed result tables.
dcarea_job2 = pd.DataFrame(
    [(place, float(salary), title, experience) for place in locations],
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are collected separately and joined
# by row position only; if the page yields a different number of each, rows
# will be mismatched or padded with NaN -- confirm against the page markup.
result_SF_100_senior = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_SF_100_senior

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Freedom Financial Network,"San Mateo, CA 94402",100000.0,Data_Analyst,Senior_level
1,Genentech,"South San Francisco, CA",100000.0,Data_Analyst,Senior_level
2,ServiceNow,"San Francisco, CA",100000.0,Data_Analyst,Senior_level
3,iSoft Solutions,"San Francisco, CA",100000.0,Data_Analyst,Senior_level
4,"Cisco Systems, Inc.","San Jose, CA",100000.0,Data_Analyst,Senior_level
5,Symantec,"Mountain View, CA",100000.0,Data_Analyst,Senior_level
6,Intuit,"Mountain View, CA 94039",100000.0,Data_Analyst,Senior_level
7,Nuna,"San Francisco, CA",100000.0,Data_Analyst,Senior_level
8,Fitbit,"San Francisco, CA 94105 (Financial District area)",100000.0,Data_Analyst,Senior_level
9,Google,"Mountain View, CA",100000.0,Data_Analyst,Senior_level


In [118]:
## Data_Analyst_SF_115_entry

# Scrape one Indeed results page (query "data analyst $115,000", San Francisco,
# explvl=entry_level) and tabulate the company and location of each posting.
driver = webdriver.Chrome()
url = "https://www.indeed.com/jobs?q=data+analyst+$115,000&l=San+Francisco,+CA&radius=50&explvl=entry_level&limit=100"
try:
    driver.get(url)
    # wait one second before reading the page source
    sleep(1)
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # quit() ends the whole WebDriver session, and the finally block makes
    # sure the browser is shut down even when loading the page fails.
    driver.quit()

import pandas as pd

# Labels shared by every row from this query.
salary = 115000
experience = "Entry_level"
title = "Data_Analyst"

# Text of each matching span with embedded newlines removed.
company_names = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'company'})
]
locations = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'location'})
]

# Build each frame in one step instead of growing it row by row.
dcarea_job1 = pd.DataFrame(company_names, columns=["Company_name"])
# Salary is stored as a float, as in the notebook's displayed result tables.
dcarea_job2 = pd.DataFrame(
    [(place, float(salary), title, experience) for place in locations],
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are collected separately and joined
# by row position only; if the page yields a different number of each, rows
# will be mismatched or padded with NaN -- confirm against the page markup.
result_SF_115_entry = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_SF_115_entry

## Data_Analyst_SF_115_mid

# Scrape one Indeed results page (query "data analyst $115,000", San Francisco,
# explvl=mid_level) and tabulate the company and location of each posting.
driver = webdriver.Chrome()
url = "https://www.indeed.com/jobs?q=data+analyst+$115,000&l=San+Francisco,+CA&radius=50&explvl=mid_level&limit=100"
try:
    driver.get(url)
    # wait one second before reading the page source
    sleep(1)
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # quit() ends the whole WebDriver session, and the finally block makes
    # sure the browser is shut down even when loading the page fails.
    driver.quit()

import pandas as pd

# Labels shared by every row from this query.
salary = 115000
experience = "Mid_level"
title = "Data_Analyst"

# Text of each matching span with embedded newlines removed.
company_names = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'company'})
]
locations = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'location'})
]

# Build each frame in one step instead of growing it row by row.
dcarea_job1 = pd.DataFrame(company_names, columns=["Company_name"])
# Salary is stored as a float, as in the notebook's displayed result tables.
dcarea_job2 = pd.DataFrame(
    [(place, float(salary), title, experience) for place in locations],
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are collected separately and joined
# by row position only; if the page yields a different number of each, rows
# will be mismatched or padded with NaN -- confirm against the page markup.
result_SF_115_mid = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_SF_115_mid

## Data_Analyst_SF_115_senior

# Scrape one Indeed results page (query "data analyst $115,000", San Francisco,
# explvl=senior_level) and tabulate the company and location of each posting.
driver = webdriver.Chrome()
url = "https://www.indeed.com/jobs?q=data+analyst+$115,000&l=San+Francisco,+CA&radius=50&explvl=senior_level&limit=100"
try:
    driver.get(url)
    # wait one second before reading the page source
    sleep(1)
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # quit() ends the whole WebDriver session, and the finally block makes
    # sure the browser is shut down even when loading the page fails.
    driver.quit()

import pandas as pd

# Labels shared by every row from this query.
salary = 115000
experience = "Senior_level"
title = "Data_Analyst"

# Text of each matching span with embedded newlines removed.
company_names = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'company'})
]
locations = [
    span.text.replace('\n', '')
    for span in html.find_all('span', {'class': 'location'})
]

# Build each frame in one step instead of growing it row by row.
dcarea_job1 = pd.DataFrame(company_names, columns=["Company_name"])
# Salary is stored as a float, as in the notebook's displayed result tables.
dcarea_job2 = pd.DataFrame(
    [(place, float(salary), title, experience) for place in locations],
    columns=["Location", "Salary", "Title", "Experience"],
)

# NOTE(review): companies and locations are collected separately and joined
# by row position only; if the page yields a different number of each, rows
# will be mismatched or padded with NaN -- confirm against the page markup.
result_SF_115_senior = pd.concat([dcarea_job1, dcarea_job2], axis=1)

result_SF_115_senior

Unnamed: 0,Company_name,Location,Salary,Title,Experience
0,Freedom Financial Network,"San Mateo, CA 94402",115000.0,Data_Analyst,Senior_level
1,BayOne Solutions,"San Francisco, CA 94107",115000.0,Data_Analyst,Senior_level
2,"Controller's Group, Inc.","Mountain View, CA",115000.0,Data_Analyst,Senior_level
3,iSoft Solutions,"San Francisco, CA",115000.0,Data_Analyst,Senior_level
4,"Cisco Systems, Inc.","San Jose, CA",115000.0,Data_Analyst,Senior_level
5,Google,"Mountain View, CA",115000.0,Data_Analyst,Senior_level
6,Hired,"San Francisco, CA",115000.0,Data_Analyst,Senior_level
7,Capital One,"San Francisco, CA",115000.0,Data_Analyst,Senior_level
8,GE Careers,"San Ramon, CA 94583",115000.0,Data_Analyst,Senior_level
9,MachineZone,"Palo Alto, CA",115000.0,Data_Analyst,Senior_level


## Data Analyst dataset

In [121]:
# Stack every DC salary/experience result frame into one table.
# ignore_index=True renumbers the rows 0..n-1; without it each input keeps
# its own 0-based index and the combined frame carries duplicate row labels.
result_DC_Data_Analyst = pd.concat(
    [
        result_DC_60_entry, result_DC_60_mid, result_DC_60_senior,
        result_DC_70_entry, result_DC_70_mid, result_DC_70_senior,
        result_DC_75_entry, result_DC_75_mid, result_DC_75_senior,
        result_DC_85_entry, result_DC_85_mid, result_DC_85_senior,
        result_DC_100_entry, result_DC_100_mid, result_DC_100_senior,
    ],
    ignore_index=True,
)

In [122]:
# Sanity check: (rows, columns) of the combined DC table.
result_DC_Data_Analyst.shape

(1575, 5)

In [123]:
# Stack every NY salary/experience result frame into one table.
# ignore_index=True renumbers the rows 0..n-1; without it each input keeps
# its own index and the combined frame can carry duplicate row labels.
result_NY_Data_Analyst = pd.concat(
    [
        result_NY_55_entry, result_NY_55_mid, result_NY_55_senior,
        result_NY_70_entry, result_NY_70_mid, result_NY_70_senior,
        result_NY_85_entry, result_NY_85_mid, result_NY_85_senior,
        result_NY_95_entry, result_NY_95_mid, result_NY_95_senior,
        result_NY_115_entry, result_NY_115_mid, result_NY_115_senior,
    ],
    ignore_index=True,
)

In [126]:
# Sanity check: (rows, columns) of the combined NY table.
result_NY_Data_Analyst.shape

(1575, 5)

In [127]:
# Stack every SF salary/experience result frame into one table.
# ignore_index=True renumbers the rows 0..n-1; without it each input keeps
# its own 0-based index and the combined frame carries duplicate row labels.
result_SF_Data_Analyst = pd.concat(
    [
        result_SF_60_entry, result_SF_60_mid, result_SF_60_senior,
        result_SF_75_entry, result_SF_75_mid, result_SF_75_senior,
        result_SF_85_entry, result_SF_85_mid, result_SF_85_senior,
        result_SF_100_entry, result_SF_100_mid, result_SF_100_senior,
        result_SF_115_entry, result_SF_115_mid, result_SF_115_senior,
    ],
    ignore_index=True,
)

In [128]:
# Sanity check: (rows, columns) of the combined SF table.
result_SF_Data_Analyst.shape

(1575, 5)

In [130]:
# Save the combined SF table as a UTF-8 CSV (relative path); the DataFrame
# index is written too (to_csv default), as the first, unnamed column.
result_SF_Data_Analyst.to_csv('result_SF_Data_Analyst.csv', encoding='utf-8')


In [131]:
# Save the combined NY table as a UTF-8 CSV (relative path); the DataFrame
# index is written too (to_csv default), as the first, unnamed column.
result_NY_Data_Analyst.to_csv('result_NY_Data_Analyst.csv', encoding='utf-8')


In [132]:
# Save the combined DC table as a UTF-8 CSV (relative path); the DataFrame
# index is written too (to_csv default), as the first, unnamed column.
result_DC_Data_Analyst.to_csv('result_DC_Data_Analyst.csv', encoding='utf-8')


In [133]:
# Stack the DC, NY and SF tables into the full Data Analyst dataset.
# ignore_index=True renumbers the rows 0..n-1 so the per-city row labels
# are not repeated in the combined frame.
result_Data_Analyst = pd.concat(
    [result_DC_Data_Analyst, result_NY_Data_Analyst, result_SF_Data_Analyst],
    ignore_index=True,
)

In [134]:
# Sanity check: (rows, columns) of the full three-city table.
result_Data_Analyst.shape

(4725, 5)

In [135]:
# Save the full three-city table as a UTF-8 CSV (relative path); the DataFrame
# index is written too (to_csv default), as the first, unnamed column.
result_Data_Analyst.to_csv('result_Data_Analyst.csv', encoding='utf-8')