# Stack Overflow Webscraping Project

Description here

In [1]:
# Import libraries and modules
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup

## Webscraping Stack Overflow

In [2]:
# Created an empty dataframe with the desired columns
df = pd.DataFrame(
    columns = [["company", "industry", "size", "year_founded", "status", "follower_count", "tech_stack", "benefits"]]
)

display(df)

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits


In [3]:
# Created a function that collects spcific data points from stack overflow
def add_data_to_df(soupy):
    company_name = soupy.find("h1").text.strip() # Extract --> Company Name

    
    about_company = soupy.find_all("span", class_ = "d-block")


    indices = [12,13,14,15,16] # The indicies 12,13,14,15,and 16 represents industy, size, year founded, status, and followers
    values = []

    for index in indices: # This loop ignores IndexErrors when running the function for certain companies that are missing data
        try:
            values.append(about_company[index].text.strip())
        except IndexError:
            values.append(None) 

    industry = values[0]
    size = values[1]
    year_founded = values[2]
    status = values[3]
    followers = values[4]

    tech_skills = soupy.find_all("a", class_ = "flex--item s-tag no-tag-menu") # Searched for the company's tech stack
    tech_stack = [] # Created a list containing all relevant tech skills 

    for item in range(len(tech_skills)):
        tech_stack.append(tech_skills[item].text.strip())


    benefits_list = soupy.find_all("div", class_="flex--item pl8 pt2 fw-normal fs-body2 fc-black-700") # Searched for company benefits
    benefits = [] # Created a list containing all listed company benefits

    for item in range(len(benefits_list)):
        benefits.append(benefits_list[item].text.strip())
    
    
    # Appended all webscrapped elements into a list "new_row"

    new_row = [] 

    new_row.append(company_name)
    new_row.append(industry)
    new_row.append(size)
    new_row.append(year_founded)
    new_row.append(status)
    new_row.append(followers)
    new_row.append(tech_stack)
    new_row.append(benefits)

    df.loc[len(df.index)] = new_row # Added the list as a new row in the dataframe

In [4]:
# Accessing each link 

def scrape_page(page_link):

    company_list = page_link.find_all("a", class_="s-link", href=lambda href: href and "/jobs/companies" in href) # Accessed each <a> tag that includes href and "/jobs/companies"

    company_links = []

    for names in range(len(company_list)):
        company_links.append("https://stackoverflow.com" + company_list[names]["href"]) # The extracted links combines with "https://stackoverflow.com" generates the full link for each company's page on stack overflow


    # Created a for loop that loops through each company page at least three times
    for link in range(len(company_links)):  
        response = requests.get(company_links[link])
        max_attempts = 3 

        while max_attempts > 0:

            if response.status_code == 200:
                print("Successful connection")
                soupy = BeautifulSoup(response.text, "html")
                add_data_to_df(soupy)
                break

            elif response.status_code == 429:
                print("Response 429, reattempting...")
                time.sleep(5) # If the server is handling too many request --> wait five seconds and reattempt connection
                max_attempts -=1

            elif response.status_code == 403:
                print("Access Denied")
                break
            
            else:
                print(response.status_code)
                break

In [35]:
# Created a for loop that checks the connection of each page and scrapes data from each company's profile page through accessing their individual links

connection_attempts = 3
max_pages = 7

for page in range(1, max_pages + 1):
    url = f"https://stackoverflow.com/jobs/companies?pg={page}"
     
    while connection_attempts > 0:
        response = requests.get(url)

        if response.status_code == 200: # used a similar for loop structure to the "scrape_page" function created above
            print(f"Successfully connected to page:{page}")
            soup = BeautifulSoup(response.text, "html")
            scrape_page(soup)
            break

        elif response.status_code == 429:
            print("Response [429]: Reattempting to connect")
            time.sleep(5)
            connection_attempts -= 1

        elif response.status_code == 403:
                print("Response [403]: Access denied") 
                break
        else:
            print("Invalid link")
            break
    if connection_attempts == 0:
        print("Max number of attempts excceded")

Successfully connected to page:1
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successfully connected to page:2
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successfully connected to page:3
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successfully connected to page:4
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful

In [36]:
# Show the extracted data
display(df)

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
0,Pluralsight,SaaS,1k-5k employees,2004,VC Funded,394,"[.net, c#, c++, golang, java, javascript, kotl...","[Unlimited paid time off, Summer Fridays, Comp..."
1,Amperity,Customer Data Platform,201-500 employees,2015,Private,88,"[clojure, clojurescript, scala, node.js, apach...","[Paid Time Off, Mentorship Program, Parental L..."
2,"Jack Henry & Associates, Inc.®","Banking, Financial Technology, Software Develo...",5k-10k employees,1976,Public,270,"[scala, go, ecmascript-harmony, docker, c#, ob...","[Flexible Work Hours, Remote Work Opportunitie..."
3,DISH,"Cloud Services, Information Technology, Wirele...",10k+ employees,1980,Public,156,"[amazon-web-services, azure, devops, f5, linux...","[Diversity employee resource groups, Tuition r..."
4,Belden Inc.,Industrial Automation,5k-10k employees,1902,Public,85,"[opc-ua, node-red, network-security, c, 5g, li...","[Employee stock purchase plan, Bravo recogniti..."
...,...,...,...,...,...,...,...,...
60,ZEISS Group,"Machine Learning, Medical Devices, Virtual Rea...",10k+ employees,1846,Private,218,"[angular, swift, c#, azure, javascript, python...","[Flexible working time models, Open, dialogue-..."
61,Citi,"Finance, Financial Technology, Risk Management",10k+ employees,Public,192,,"[java, python, api, management, c#, ai, cloud-...","[Medical, Dental and Vision coverage, Paid Tim..."
62,Capital One – Mexico City,"Financial Services, Financial Technology",10k+ employees,1994,Public,,"[javascript, scala, python, java, cassandra, h...","[Health Insurance + Wellness, Vacation + Paid ..."
63,KfW,5k-10k employees,1948,Public,52,,"[java, pl1, javascript, python, sap, c#, c++, ...","[Marktgerechte Vergütung, Betriebliche Altersv..."


## Data Cleaning and Feature Engineering

In [33]:
# Reset df
"""
df = pd.DataFrame(
    columns = [["company", "industry", "size", "year_founded", "status", "follower_count", "tech_stack", "benefits"]]
)
"""

In [37]:
# Created a csv file based on the extracted data
df.to_csv("companies.csv", index = False)
df = pd.read_csv("companies.csv")
display(df)

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
0,Pluralsight,SaaS,1k-5k employees,2004,VC Funded,394,"['.net', 'c#', 'c++', 'golang', 'java', 'javas...","['Unlimited paid time off', 'Summer Fridays', ..."
1,Amperity,Customer Data Platform,201-500 employees,2015,Private,88,"['clojure', 'clojurescript', 'scala', 'node.js...","['Paid Time Off', 'Mentorship Program', 'Paren..."
2,"Jack Henry & Associates, Inc.®","Banking, Financial Technology, Software Develo...",5k-10k employees,1976,Public,270,"['scala', 'go', 'ecmascript-harmony', 'docker'...","['Flexible Work Hours', 'Remote Work Opportuni..."
3,DISH,"Cloud Services, Information Technology, Wirele...",10k+ employees,1980,Public,156,"['amazon-web-services', 'azure', 'devops', 'f5...","['Diversity employee resource groups', 'Tuitio..."
4,Belden Inc.,Industrial Automation,5k-10k employees,1902,Public,85,"['opc-ua', 'node-red', 'network-security', 'c'...","['Employee stock purchase plan', 'Bravo recogn..."
...,...,...,...,...,...,...,...,...
60,ZEISS Group,"Machine Learning, Medical Devices, Virtual Rea...",10k+ employees,1846,Private,218,"['angular', 'swift', 'c#', 'azure', 'javascrip...","['Flexible working time models', 'Open, dialog..."
61,Citi,"Finance, Financial Technology, Risk Management",10k+ employees,Public,192,,"['java', 'python', 'api', 'management', 'c#', ...","['Medical, Dental and Vision coverage', 'Paid ..."
62,Capital One – Mexico City,"Financial Services, Financial Technology",10k+ employees,1994,Public,,"['javascript', 'scala', 'python', 'java', 'cas...","['Health Insurance + Wellness', 'Vacation + Pa..."
63,KfW,5k-10k employees,1948,Public,52,,"['java', 'pl1', 'javascript', 'python', 'sap',...","['Marktgerechte Vergütung', 'Betriebliche Alte..."


Note: Certain companies did not include certain data such as company size or year founded, thus some rows of the dataset are shifted one value to the right 

In [41]:
# Identified the companies that have misplaced values
industry_list = df["industry"].values

for item in industry_list:
    if str("employees") in str(item): 
        display(df[df["industry"] == item])


In [40]:
# Identified the companies that have misplaced values
industry_list = df["industry"].values

for item in industry_list:
    if "employees" in item: 

        index = df[df["industry"] == item].index
        
        size = df.loc[index, "industry"]
        year_founded = df.loc[index, "size"]
        status = df.loc[index, "year_founded"]
        follower_count = df.loc[index, "status"]

        df.loc[index, "industry"] = None
        df.loc[index, "size"] = size
        df.loc[index, "year_founded"] = year_founded
        df.loc[index, "status"] = status
        df.loc[index, "follower_count"] = follower_count
        
        display(df.iloc[index])
    else: 
        pass


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
42,Outfit7,,201-500 employees,2009,Private,139,"['unity-game-engine', 'c#', 'c++', 'rendering'...",['24 days of paid vacation to start & sabbatic...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
43,Synopsys Inc.,,10k+ employees,1986,Public,154,"['c', 'c++', 'c#', 'system-verilog', 'verilog'...","['Health/Dental/Vision Insurance, HSA and FSA'..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
63,KfW,,5k-10k employees,1948,Public,52,"['java', 'pl1', 'javascript', 'python', 'sap',...","['Marktgerechte Vergütung', 'Betriebliche Alte..."


In [None]:
# Checked the current data type for each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   company         64 non-null     object
 1   industry        64 non-null     object
 2   size            64 non-null     object
 3   year_founded    64 non-null     object
 4   status          64 non-null     object
 5   follower_count  51 non-null     object
 6   tech_stack      64 non-null     object
 7   benefits        64 non-null     object
dtypes: object(8)
memory usage: 4.1+ KB


In [None]:
# Set specific data types for each column in the dataset
"""
df["company"] = df["company"].astype("string")
df["industry"] = df["industry"].astype("string")
df["size"] = df["size"].

"""

'\ndf["company"] = df["company"].astype("string")\ndf["industry"] = df["industry"].astype("string")\ndf["size"] = df["size"].\n\n'

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   company         64 non-null     string
 1   industry        64 non-null     object
 2   size            64 non-null     object
 3   year_founded    64 non-null     object
 4   status          64 non-null     object
 5   follower_count  51 non-null     object
 6   tech_stack      64 non-null     object
 7   benefits        64 non-null     object
dtypes: object(7), string(1)
memory usage: 4.1+ KB


In [None]:
# Separated the "tech_stack" column into multiple columns organized by type of tech
# New Columns: "Cloud Services and Infrastructure", "Programing Languages and Frameworks", "Databases and Data Technologies"
