# Stack Overflow Webscraping Project

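This notebook scrapes the company listing pages on Stack Overflow (`https://stackoverflow.com/jobs/companies`), follows the link to each company's profile page, and collects the company name, industry, size, year founded, status, follower count, tech stack, and benefits into a pandas DataFrame. The result is saved to `companies.csv` for data cleaning and feature engineering.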

In [13]:
# Import libraries and modules
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Scraping each company profile page

In [14]:
# Create an empty DataFrame with the desired columns.
# The names are passed as a flat list: a nested list ([[...]]) makes pandas build
# a MultiIndex for the columns instead of plain string labels.
df = pd.DataFrame(
    columns=[
        "company",
        "industry",
        "size",
        "year_founded",
        "status",
        "follower_count",
        "tech_stack",
        "benefits",
    ]
)

# Render the frame (no rows yet) to confirm the column layout
display(df)

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits


In [15]:
# 
def add_data_to_df(soupy):
    """Extract one company's profile fields and append them as a row of the global ``df``.

    Parameters
    ----------
    soupy : BeautifulSoup
        Parsed HTML of a single company profile page.

    Returns
    -------
    None
        The row ``[company, industry, size, year_founded, status, followers,
        tech_stack, benefits]`` is written to ``df`` at label ``len(df.index)``.

    Notes
    -----
    Scalar fields that cannot be found are stored as the string ``"Null"``.
    ``tech_stack`` and ``benefits`` are stored as (possibly empty) lists of strings.
    """
    # Company name. A page without an <h1> gets the same "Null" placeholder as the
    # other missing fields instead of raising AttributeError and aborting the scrape.
    heading = soupy.find("h1")
    company_name = heading.text.strip() if heading is not None else "Null"

    about_company = soupy.find_all("span", class_="d-block")

    # Positions 12-16 of the "d-block" spans are read as industry, size,
    # year founded, status, and followers, in that order.
    # NOTE(review): the lookup is purely positional. Some scraped rows show e.g.
    # "Private" under year_founded, which suggests the positions shift when a
    # profile omits a field -- confirm against the page markup.
    indices = [12, 13, 14, 15, 16]
    values = []
    for index in indices:
        try:
            values.append(about_company[index].text.strip())
        except IndexError:
            values.append("Null")

    industry, size, year_founded, status, followers = values

    # The company's tech stack: text of every tag link
    tech_stack = [
        tag.text.strip()
        for tag in soupy.find_all("a", class_="flex--item s-tag no-tag-menu")
    ]

    # The benefits the company lists on its profile
    benefits = [
        perk.text.strip()
        for perk in soupy.find_all(
            "div", class_="flex--item pl8 pt2 fw-normal fs-body2 fc-black-700"
        )
    ]

    # One row, in the same order as the DataFrame's columns
    new_row = [
        company_name,
        industry,
        size,
        year_founded,
        status,
        followers,
        tech_stack,
        benefits,
    ]

    df.loc[len(df.index)] = new_row  # Append the row at the next integer label

In [16]:
# Accessing each link 

def scrape_page(page_link, max_attempts=3, retry_delay=1.0):
    """Visit every company profile linked from one listing page and record it.

    Parameters
    ----------
    page_link : BeautifulSoup
        Parsed HTML of a company listing page.
    max_attempts : int, optional
        Maximum number of requests made per company profile (default 3).
    retry_delay : float, optional
        Seconds to wait before retrying after a 429 response (default 1.0).

    Returns
    -------
    None
        Each successfully fetched profile is passed to ``add_data_to_df``.
    """
    # Anchors whose href points at a company profile
    company_anchors = page_link.find_all(
        "a", class_="s-link", href=lambda href: href and "/jobs/companies" in href
    )
    company_links = [
        "https://stackoverflow.com" + anchor["href"] for anchor in company_anchors
    ]

    for company_url in company_links:
        for attempt in range(1, max_attempts + 1):
            # The request is issued on every attempt; a retry that re-inspects the
            # previous response would see the same status code again.
            response = requests.get(company_url)

            if response.status_code == 200:
                print("Successful connection")
                soupy = BeautifulSoup(response.text, "html")
                add_data_to_df(soupy)
                break

            elif response.status_code == 429:
                print("Response 429, reattempting...")
                # Back off before the next attempt; no wait after the last one
                if attempt < max_attempts and retry_delay > 0:
                    time.sleep(retry_delay)

            elif response.status_code == 403:
                print("Access Denied")
                break

            else:
                print(response.status_code)
                break
        else:
            # Reached only when every attempt ended in a 429 (no break occurred)
            print("Max number of attempts exceeded")

In [17]:
# Scraping Multiple Pages

connection_attempts = 3  # Retry budget for EACH listing page
max_pages = 7
retry_delay_seconds = 1  # Pause between retries after a 429 response

for page in range(1, max_pages + 1):
    url = f"https://stackoverflow.com/jobs/companies?pg={page}"

    # Reset the budget per page. A single shared counter would leave every
    # later page unvisited once one page had used up all the attempts.
    attempts_left = connection_attempts

    while attempts_left > 0:
        response = requests.get(url)

        if response.status_code == 200:
            print(f"Successfully connected to page:{page}")
            soup = BeautifulSoup(response.text, "html")
            scrape_page(soup)
            break

        elif response.status_code == 429:
            print("Response [429]: Reattempting to connect")
            attempts_left -= 1
            # Back off before asking again; no wait once the budget is spent
            if attempts_left > 0:
                time.sleep(retry_delay_seconds)

        elif response.status_code == 403:
            print("Response [403]: Access denied")
            break

        else:
            print("Invalid link")
            break

    if attempts_left == 0:
        print("Max number of attempts excceded")

Successfully connected to page:1
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successfully connected to page:2
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successfully connected to page:3
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successfully connected to page:4
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful

In [18]:
# Show the DataFrame after the scraping loop has run
display(df)

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
0,Paylocity,"Human Resources, Payroll, Software Development...",5k-10k employees,1997,Public,143,"[aws, api-gateway, .net, c#, sql-server, react...",[100% Remote Opportunities in Product & Tech T...
1,GFT,"Banking, Cloud-Based Solutions, Financial Tech...",1k-5k employees,1987,Private,160,"[java, angular, .net, react, sql, python, rest...",[Worksmile - a cafeteria system. Monthly accou...
2,Zoi TechCon GmbH,"Cloud Computing, Cloud Services, IT Consulting",201-500 employees,2017,Private,91,"[aws, azure, google-cloud-platform, react, .ne...",[Germany job ticket (payment of the total amou...
3,Intuit,"Computer Software, Financial Technology",10k+ employees,1983,Public,642,"[java, kotlin, scala, play, spark, react, aws,...","[Well-being for Life Reimbursement Program, Em..."
4,Discover Financial Svcs,"Banking, Financial Services",10k+ employees,1986,Public,151,"[javascript, tableau-api, powerbi, qlikview, t...","[Annual Leave - 5 to 7 weeks, Tuition Reimburs..."
...,...,...,...,...,...,...,...,...
59,UBS,"Financial Services, Financial Technology",10k+ employees,Private,102,Null,"[javascript, java, kotlin, kubernetes, cicd, p...",[Working with cutting edge tech and agile prac...
60,ZEISS Group,"Machine Learning, Medical Devices, Virtual Rea...",10k+ employees,1846,Private,218,"[angular, swift, c#, azure, javascript, python...","[Flexible working time models, Open, dialogue-..."
61,Citi,"Finance, Financial Technology, Risk Management",10k+ employees,Public,192,Null,"[java, python, api, management, c#, ai, cloud-...","[Medical, Dental and Vision coverage, Paid Tim..."
62,KfW,5k-10k employees,1948,Public,52,Null,"[java, pl1, javascript, python, sap, c#, c++, ...","[Marktgerechte Vergütung, Betriebliche Altersv..."


## Data Cleaning and Feature Engineering

In [37]:
# Write the scraped DataFrame to companies.csv; index=False leaves the row index out of the file
df.to_csv("companies.csv", index = False)

In [38]:
# Reload the saved CSV; this rebinds `df` to the frame read from disk.
# NOTE(review): in the recorded output, tech_stack and benefits appear as quoted text
# such as "['aws', ...]", i.e. they come back as strings rather than lists --
# presumably they need parsing before list operations; confirm in later cells.
# NOTE(review): the extra "Unnamed: 0.x" columns in the recorded output suggest the file
# that was read still held index columns saved by an earlier run -- TODO confirm and re-export.
df = pd.read_csv("companies.csv")
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
0,0,0,Paylocity,"Human Resources, Payroll, Software Development...",5k-10k employees,1997,Public,143,"['aws', 'api-gateway', '.net', 'c#', 'sql-serv...",['100% Remote Opportunities in Product & Tech ...
1,1,1,GFT,"Banking, Cloud-Based Solutions, Financial Tech...",1k-5k employees,1987,Private,160,"['java', 'angular', '.net', 'react', 'sql', 'p...",['Worksmile - a cafeteria system. Monthly acco...
2,2,2,Zoi TechCon GmbH,"Cloud Computing, Cloud Services, IT Consulting",201-500 employees,2017,Private,91,"['aws', 'azure', 'google-cloud-platform', 'rea...",['Germany job ticket (payment of the total amo...
3,3,3,Intuit,"Computer Software, Financial Technology",10k+ employees,1983,Public,642,"['java', 'kotlin', 'scala', 'play', 'spark', '...","['Well-being for Life Reimbursement Program', ..."
4,4,4,Discover Financial Svcs,"Banking, Financial Services",10k+ employees,1986,Public,151,"['javascript', 'tableau-api', 'powerbi', 'qlik...","['Annual Leave - 5 to 7 weeks', 'Tuition Reimb..."
...,...,...,...,...,...,...,...,...,...,...
59,59,59,UBS,"Financial Services, Financial Technology",10k+ employees,Private,102,Null,"['javascript', 'java', 'kotlin', 'kubernetes',...",['Working with cutting edge tech and agile pra...
60,60,60,ZEISS Group,"Machine Learning, Medical Devices, Virtual Rea...",10k+ employees,1846,Private,218,"['angular', 'swift', 'c#', 'azure', 'javascrip...","['Flexible working time models', 'Open, dialog..."
61,61,61,Citi,"Finance, Financial Technology, Risk Management",10k+ employees,Public,192,Null,"['java', 'python', 'api', 'management', 'c#', ...","['Medical, Dental and Vision coverage', 'Paid ..."
62,62,62,KfW,5k-10k employees,1948,Public,52,Null,"['java', 'pl1', 'javascript', 'python', 'sap',...","['Marktgerechte Vergütung', 'Betriebliche Alte..."
