# Stack Overflow Webscraping Project

Description here

In [122]:
# Import libraries and modules
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup

## Webscraping Stack Overflow

In [123]:
# Created an empty dataframe with the desired columns
df = pd.DataFrame(
    columns = [["company", "industry", "size", "year_founded", "status", "follower_count", "tech_stack", "benefits"]]
)

display(df)

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits


In [124]:
# Created a function that collects spcific data points from stack overflow
def add_data_to_df(soupy):
    company_name = soupy.find("h1").text.strip() # Extract --> Company Name

    
    about_company = soupy.find_all("span", class_ = "d-block")


    indices = [12,13,14,15,16] # The indicies 12,13,14,15,and 16 represents industy, size, year founded, status, and followers
    values = []

    for index in indices: # This loop ignores IndexErrors when running the function for certain companies that are missing data
        try:
            values.append(about_company[index].text.strip())
        except IndexError:
            values.append(None) 

    industry = values[0]
    size = values[1]
    year_founded = values[2]
    status = values[3]
    followers = values[4]

    tech_skills = soupy.find_all("a", class_ = "flex--item s-tag no-tag-menu") # Searched for the company's tech stack
    tech_stack = [] # Created a list containing all relevant tech skills 

    for item in range(len(tech_skills)):
        tech_stack.append(tech_skills[item].text.strip())


    benefits_list = soupy.find_all("div", class_="flex--item pl8 pt2 fw-normal fs-body2 fc-black-700") # Searched for company benefits
    benefits = [] # Created a list containing all listed company benefits

    for item in range(len(benefits_list)):
        benefits.append(benefits_list[item].text.strip())
    
    
    # Appended all webscrapped elements into a list "new_row"

    new_row = [] 

    new_row.append(company_name)
    new_row.append(industry)
    new_row.append(size)
    new_row.append(year_founded)
    new_row.append(status)
    new_row.append(followers)
    new_row.append(tech_stack)
    new_row.append(benefits)

    df.loc[len(df.index)] = new_row # Added the list as a new row in the dataframe

In [125]:
# Accessing each link 

def scrape_page(page_link):

    company_list = page_link.find_all("a", class_="s-link", href=lambda href: href and "/jobs/companies" in href) # Accessed each <a> tag that includes href and "/jobs/companies"

    company_links = []

    for names in range(len(company_list)):
        company_links.append("https://stackoverflow.com" + company_list[names]["href"]) # The extracted links combines with "https://stackoverflow.com" generates the full link for each company's page on stack overflow


    # Created a for loop that loops through each company page at least three times
    for link in range(len(company_links)):  
        response = requests.get(company_links[link])
        max_attempts = 3 

        while max_attempts > 0:

            if response.status_code == 200:
                print("Successful connection")
                soupy = BeautifulSoup(response.text, "html")
                add_data_to_df(soupy)
                break

            elif response.status_code == 429:
                print("Response 429, reattempting...")
                time.sleep(5) # If the server is handling too many request --> wait five seconds and reattempt connection
                max_attempts -=1

            elif response.status_code == 403:
                print("Access Denied")
                break
            
            else:
                print(response.status_code)
                break

In [126]:
# Created a for loop that checks the connection of each page and scrapes data from each company's profile page through accessing their individual links

connection_attempts = 3
max_pages = 7

for page in range(1, max_pages + 1):
    url = f"https://stackoverflow.com/jobs/companies?pg={page}"
     
    while connection_attempts > 0:
        response = requests.get(url)

        if response.status_code == 200: # used a similar for loop structure to the "scrape_page" function created above
            print(f"Successfully connected to page:{page}")
            soup = BeautifulSoup(response.text, "html")
            scrape_page(soup)
            break

        elif response.status_code == 429:
            print("Response [429]: Reattempting to connect")
            time.sleep(5)
            connection_attempts -= 1

        elif response.status_code == 403:
                print("Response [403]: Access denied") 
                break
        else:
            print("Invalid link")
            break
    if connection_attempts == 0:
        print("Max number of attempts excceded")

Response [429]: Reattempting to connect
Response [429]: Reattempting to connect


KeyboardInterrupt: 

In [None]:
# Show the extracted data
display(df)

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
0,SD Worx,"Cloud-Based Solutions, Financial Services, Pro...",5k-10k employees,1945,VC Funded,100,"[azure, azure-devops, amazon-web-services, asp...",[An attractive salary based on your experience...
1,Paylocity,"Human Resources, Payroll, Software Development...",5k-10k employees,1997,Public,148,"[aws, api-gateway, .net, c#, sql-server, react...",[100% Remote Opportunities in Product & Tech T...
2,GFT,"Banking, Cloud-Based Solutions, Financial Tech...",1k-5k employees,1987,Private,163,"[java, angular, .net, react, sql, python, rest...",[Worksmile - a cafeteria system. Monthly accou...
3,Zoi TechCon GmbH,"Cloud Computing, Cloud Services, IT Consulting",201-500 employees,2017,Private,95,"[aws, azure, google-cloud-platform, react, .ne...",[Germany job ticket (payment of the total amou...
4,Intuit,"Computer Software, Financial Technology",10k+ employees,1983,Public,650,"[java, kotlin, scala, play, spark, react, aws,...","[Well-being for Life Reimbursement Program, Em..."
5,Discover Financial Svcs,"Banking, Financial Services",10k+ employees,1986,Public,152,"[javascript, tableau-api, powerbi, qlikview, t...","[Annual Leave - 5 to 7 weeks, Tuition Reimburs..."
6,Discover Financial Services,"Banking, Financial Services, Financial Technology",10k+ employees,1985,Public,185,"[tableau-api, powerbi, qlikview, teradata, had...","[Paid Time Off - 4 to 6 weeks per year, Health..."
7,Global Industrial,"B2B Sales, eCommerce, Information Technology",1k-5k employees,1949,Public,109,"[java, jakarta-ee, spring, reactjs, next.js, j...",[Employee Stock Purchase Plan – 15% discount o...
8,ALDI SÜD IT,"E-Commerce, Information Technology, Retail",1k-5k employees,1913,Private,50,"[angular, asp.net-core, azure, bash, c#, conti...","[30 days of annual leave, Holiday and Christma..."
9,IT Sonix Custom Development GmbH,"Agile Software Development, Software Consultin...",51-200 employees,2011,Private,164,"[java, spring, boot, .net, core, c#, angular, ...","[Weiterbildung, Flexible Arbeitszeiten, Mobile..."


## Data Cleaning and Feature Engineering

In [None]:
# Created a csv file based on the extracted data
df.to_csv("companies.csv", index = False)
df = pd.read_csv("companies.csv")
display(df)

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
0,SD Worx,"Cloud-Based Solutions, Financial Services, Pro...",5k-10k employees,1945,VC Funded,100,"['azure', 'azure-devops', 'amazon-web-services...",['An attractive salary based on your experienc...
1,Paylocity,"Human Resources, Payroll, Software Development...",5k-10k employees,1997,Public,148,"['aws', 'api-gateway', '.net', 'c#', 'sql-serv...",['100% Remote Opportunities in Product & Tech ...
2,GFT,"Banking, Cloud-Based Solutions, Financial Tech...",1k-5k employees,1987,Private,163,"['java', 'angular', '.net', 'react', 'sql', 'p...",['Worksmile - a cafeteria system. Monthly acco...
3,Zoi TechCon GmbH,"Cloud Computing, Cloud Services, IT Consulting",201-500 employees,2017,Private,95,"['aws', 'azure', 'google-cloud-platform', 'rea...",['Germany job ticket (payment of the total amo...
4,Intuit,"Computer Software, Financial Technology",10k+ employees,1983,Public,650,"['java', 'kotlin', 'scala', 'play', 'spark', '...","['Well-being for Life Reimbursement Program', ..."
5,Discover Financial Svcs,"Banking, Financial Services",10k+ employees,1986,Public,152,"['javascript', 'tableau-api', 'powerbi', 'qlik...","['Annual Leave - 5 to 7 weeks', 'Tuition Reimb..."
6,Discover Financial Services,"Banking, Financial Services, Financial Technology",10k+ employees,1985,Public,185,"['tableau-api', 'powerbi', 'qlikview', 'terada...","['Paid Time Off - 4 to 6 weeks per year', 'Hea..."
7,Global Industrial,"B2B Sales, eCommerce, Information Technology",1k-5k employees,1949,Public,109,"['java', 'jakarta-ee', 'spring', 'reactjs', 'n...",['Employee Stock Purchase Plan – 15% discount ...
8,ALDI SÜD IT,"E-Commerce, Information Technology, Retail",1k-5k employees,1913,Private,50,"['angular', 'asp.net-core', 'azure', 'bash', '...","['30 days of annual leave', 'Holiday and Chris..."
9,IT Sonix Custom Development GmbH,"Agile Software Development, Software Consultin...",51-200 employees,2011,Private,164,"['java', 'spring', 'boot', '.net', 'core', 'c#...","['Weiterbildung', 'Flexible Arbeitszeiten', 'M..."


Note: Certain companies did not include certain data such as company size or year founded, thus some rows of the dataset are shifted one value to the right 

In [None]:
# Identified the companies that have misplaced values
industry_list = df["industry"].values

for item in industry_list:
    if "employees" in item: 
        display(df[df["industry"] == item])


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
40,Outfit7,201-500 employees,2009,Private,139,,"['unity-game-engine', 'c#', 'c++', 'rendering'...",['24 days of paid vacation to start & sabbatic...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
41,Synopsys Inc.,10k+ employees,1986,Public,154,,"['c', 'c++', 'c#', 'system-verilog', 'verilog'...","['Health/Dental/Vision Insurance, HSA and FSA'..."


In [None]:
# Identified the companies that have misplaced values
industry_list = df["industry"].values

for item in industry_list:
    if "employees" in item: 

        index = df[df["industry"] == item].index
        
        size = df.loc[index, "industry"]
        year_founded = df.loc[index, "size"]
        status = df.loc[index, "year_founded"]
        follower_count = df.loc[index, "status"]

        df.loc[index, "industry"] = None
        df.loc[index, "size"] = size
        df.loc[index, "year_founded"] = year_founded
        df.loc[index, "status"] = status
        df.loc[index, "follower_count"] = follower_count
        
        display(df.iloc[[index]])
    else: 
        pass


ValueError: Buffer has wrong number of dimensions (expected 1, got 2)

In [None]:
display(df.iloc[[40]])

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
40,Outfit7,,201-500 employees,2009,Private,139,"['unity-game-engine', 'c#', 'c++', 'rendering'...",['24 days of paid vacation to start & sabbatic...


In [None]:
industry_list = df["industry"].values

for item in industry_list:
    index = df[df["industry"] == item].index
    display(df.iloc[index])


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
0,SD Worx,"Cloud-Based Solutions, Financial Services, Pro...",5k-10k employees,1945,VC Funded,100,"['azure', 'azure-devops', 'amazon-web-services...",['An attractive salary based on your experienc...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
1,Paylocity,"Human Resources, Payroll, Software Development...",5k-10k employees,1997,Public,148,"['aws', 'api-gateway', '.net', 'c#', 'sql-serv...",['100% Remote Opportunities in Product & Tech ...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
2,GFT,"Banking, Cloud-Based Solutions, Financial Tech...",1k-5k employees,1987,Private,163,"['java', 'angular', '.net', 'react', 'sql', 'p...",['Worksmile - a cafeteria system. Monthly acco...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
3,Zoi TechCon GmbH,"Cloud Computing, Cloud Services, IT Consulting",201-500 employees,2017,Private,95,"['aws', 'azure', 'google-cloud-platform', 'rea...",['Germany job ticket (payment of the total amo...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
4,Intuit,"Computer Software, Financial Technology",10k+ employees,1983,Public,650,"['java', 'kotlin', 'scala', 'play', 'spark', '...","['Well-being for Life Reimbursement Program', ..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
5,Discover Financial Svcs,"Banking, Financial Services",10k+ employees,1986,Public,152,"['javascript', 'tableau-api', 'powerbi', 'qlik...","['Annual Leave - 5 to 7 weeks', 'Tuition Reimb..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
6,Discover Financial Services,"Banking, Financial Services, Financial Technology",10k+ employees,1985,Public,185,"['tableau-api', 'powerbi', 'qlikview', 'terada...","['Paid Time Off - 4 to 6 weeks per year', 'Hea..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
7,Global Industrial,"B2B Sales, eCommerce, Information Technology",1k-5k employees,1949,Public,109,"['java', 'jakarta-ee', 'spring', 'reactjs', 'n...",['Employee Stock Purchase Plan – 15% discount ...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
8,ALDI SÜD IT,"E-Commerce, Information Technology, Retail",1k-5k employees,1913,Private,50,"['angular', 'asp.net-core', 'azure', 'bash', '...","['30 days of annual leave', 'Holiday and Chris..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
9,IT Sonix Custom Development GmbH,"Agile Software Development, Software Consultin...",51-200 employees,2011,Private,164,"['java', 'spring', 'boot', '.net', 'core', 'c#...","['Weiterbildung', 'Flexible Arbeitszeiten', 'M..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
10,JPMorgan Chase & Co.,Financial Services,10k+ employees,1799,Public,1.1k,"['java', 'python', 'sql', 'devops', 'bigdata',...","['Healthcare and Insurance Plans', 'Wellness P..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
11,adjoe,"Ad Tech, Advertising Technology, Mobile Applic...",51-200 employees,2018,Private,365,"['ios', 'android', 'mobile', 'kotlin', 'elasti...","['Work with the newest technologies', 'Free in..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
12,EMBL-EBI (EMBL's European Bioinformatics Insti...,"Big Data, Data Science, Life Sciences",501-1k employees,1992,Public,158,"['javascript', 'java', 'angular', 'reactjs', '...","['Private Health Insurance', '30 days annual l..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
13,Sartorius,"Biotechnology, Life Sciences",10k+ employees,1870,Private,205,"['amazon-web-services', 'c', 'c#', 'c++', 'jav...",['Collaboration in international project teams...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
14,PayPay Corporation.,Financial Technology,1k-5k employees,2018,Private,694,"['java', 'spring-boot', 'amazon-web-services',...","['Work from anywhere at anytime', 'Free Access..."
38,MarketAxess,Financial Technology,501-1k employees,2000,Public,122,"['java', 'kafka', 'react', 'angular', 'javascr...",['Hybrid working: Mix of remote and in the off...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
15,Contentful,Software Development,501-1k employees,2013,VC Funded,241,"['javascript', 'ruby', 'typescript', 'swift', ...","['Equity', 'Remote or Hybrid working locations..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
16,Nord Security,"Cybersecurity, Network Security, Software Deve...",1k-5k employees,2012,Private,481,"['php', 'go', 'mysql', 'apache-kafka', 'symfon...",['The opportunity to shape a more trusted and ...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
17,Audible,"Agile Software Development, Content Marketing,...",1k-5k employees,1995,Public,523,"['java', 'git', 'jquery', 'angular.js', 'requi...","['Health, Vision, & Dental', 'Amazon Restricte..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
18,ASML,"Hardware Development, Semiconductors, Software...",10k+ employees,1984,Public,345,"['c#', 'java-ee', 'c++', 'c', 'python', 'oop',...","['Competitive salary', 'Paid vacation & other ..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
19,Thermo Fisher Scientific Careers,"Biotechnology, Pharmaceuticals, Science",10k+ employees,1956,Public,410,"['javascript', 'sql', 'python', 'artificial-in...","['Choice of national medical and dental plans,..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
20,Belden Inc.,Industrial Automation,5k-10k employees,1902,Public,85.0,"['opc-ua', 'node-red', 'network-security', 'c'...","['Employee stock purchase plan', 'Bravo recogn..."
46,Siemens AG,Industrial Automation,10k+ employees,1847,336,,"['python', 'c', 'java', 'c++', 'go', 'javascri...","['Promoting life-work balance', 'Supported wel..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
21,Volvo Group,Transportation,10k+ employees,1927,Private,373,"['reactjs', 'javascript', 'java', 'python', 'c...","['Freedom to explore, try and create', 'Dressc..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
22,"Jack Henry & Associates, Inc.®","Banking, Financial Technology, Software Develo...",5k-10k employees,1976,Public,269,"['scala', 'go', 'ecmascript-harmony', 'docker'...","['Flexible Work Hours', 'Remote Work Opportuni..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
23,"Logius, onderdeel ministerie BZK","Computer Software, Government, IT Consulting",501-1k employees,Public,44,,"['grafana', 'kibana', 'thanos', 'minio', 'harb...","['Individueel keuze budget (IKB) van 16,37% ov..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
24,National Security Agency,"Cybersecurity, Federal Agencies, Signals Analysis",10k+ employees,1952,Public,711,"['java', 'c', 'c++', 'angular', 'javascript', ...",['Work that contributes to the mission of prot...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
25,Deutsche Börse Group,"Customer Data Platform, Financial Technology, ...",10k+ employees,Private,103,,"['c++', 'java', 'python', 'google-cloud-platfo...",['HYBRID WORK – our working model combines the...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
26,"Coder Technologies, Inc.","Enterprise Software, Software Development, Sof...",51-200 employees,2017,VC Funded,47,"['go', 'open-source', 'typescript', 'reactjs',...",['Generous medical / dental / vision coverage ...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
27,Pluralsight,SaaS,1k-5k employees,2004,VC Funded,392,"['.net', 'c#', 'c++', 'golang', 'java', 'javas...","['Unlimited paid time off', 'Summer Fridays', ..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
28,Kraken,Green Energy,501-1k employees,2019,VC Funded,125,"['python', 'django', 'react', 'graphql', 'post...","['Flex Office', 'Profit sharing: Equity option..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
29,Gatekeeper,"Computer Software, Enterprise Software, Legal ...",51-200 employees,Private,42,,"['ruby-on-rails', 'javascript', 'hotwire-rails...",['Fully remote working. Our team currently spa...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
30,RELEX Solutions,"Retail Industry, Supply Chain Management Software",1k-5k employees,2005,Private,39,"['typescript', 'react', 'redux', 'jest', 'kube...","['Flexible Working', 'Health Insurance, Occupa..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
31,Statkraft,"Energy & Commodities, Green Energy",5k-10k employees,1895,Public,56,"['c++', 'c#', 'python', 'azure', 'reactjs', 't...","['Caring and supportive work-culture', 'Flat h..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
32,S&P Global,"Artificial Intelligence, Business Intelligence...",10k+ employees,1860,Public,146,"['python', 'java', '.net', 'amazon-web-service...","['Reproductive Wellness', 'Family Leave- Our p..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
33,ING Deutschland,"Banking, Finance, Financial Technology",5k-10k employees,1965,Public,226,"['java', 'java-ee', 'ios', 'android', 'git', '...",['Jährlich 500 € persönliches Budget zur indiv...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
34,MongoDB,"Computer Software, Databases, Enterprise Software",5k-10k employees,2007,Public,651,"['c++', 'c++17', 'go', 'java', 'python', 'java...","['Working models we provide: Remote, flexible ..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
35,DISH,"Cloud Services, Information Technology, Wirele...",10k+ employees,1980,Public,154,"['amazon-web-services', 'azure', 'devops', 'f5...","['Diversity employee resource groups', 'Tuitio..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
36,American Express.,"Finance, Financial Services, Financial Technology",10k+ employees,1850,Private,663,"['java', 'python', 'hadoop', 'spark', 'ai', 'm...","['Amex Flex (Hybrid Work Model)', 'Paid Time O..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
37,Arthrex GmbH,"Medical Devices, Medical Imaging, Medical Soft...",5k-10k employees,1981,Private,11,"['angular', 'javascript', '.net', 'c#', 'azure...","['International and dynamic environment', 'Fre..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
14,PayPay Corporation.,Financial Technology,1k-5k employees,2018,Private,694,"['java', 'spring-boot', 'amazon-web-services',...","['Work from anywhere at anytime', 'Free Access..."
38,MarketAxess,Financial Technology,501-1k employees,2000,Public,122,"['java', 'kafka', 'react', 'angular', 'javascr...",['Hybrid working: Mix of remote and in the off...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
39,MetLife,"Financial Services, Insurance",10k+ employees,1868,Public,128,"['azure', 'java', 'python', 'terraform', 'aem'...","['Hybrid Work Environment', 'Complete Benefits..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
41,Synopsys Inc.,10k+ employees,1986,Public,154,,"['c', 'c++', 'c#', 'system-verilog', 'verilog'...","['Health/Dental/Vision Insurance, HSA and FSA'..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
42,Roblox,"Communications, Software Development, Virtual ...",1k-5k employees,Public,173,,"['c#', 'c++', 'lua', 'python', 'node.js', 'mac...","['Robust medical, dental, and vision coverage'..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
43,Stack Overflow,"Advertising, Enterprise Software",501-1k employees,2008,Private,1.1k,"['c#', 'asp.net-mvc', 'redis', 'sql-server', '...","['20+ days vacation to start (US), 25+ days (U..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
44,Crossover,"Education Technology, IT Recruitment, Software...",1k-5k employees,2014,VC Funded,60,"['artificial-intelligence', 'ruby-on-rails', '...","['High pay in USD', 'Only the top 1% of develo..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
45,Warner Bros. Discovery,"Broadcast, Digital Media, Entertainment",10k+ employees,Public,448,,"['java', 'spring-boot', 'javascript', 'python-...","['Health insurance (medical, dental, vision, p..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
20,Belden Inc.,Industrial Automation,5k-10k employees,1902,Public,85.0,"['opc-ua', 'node-red', 'network-security', 'c'...","['Employee stock purchase plan', 'Bravo recogn..."
46,Siemens AG,Industrial Automation,10k+ employees,1847,336,,"['python', 'c', 'java', 'c++', 'go', 'javascri...","['Promoting life-work balance', 'Supported wel..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
47,CGI (Canada),"Business Process Outsourcing, IT Consulting, S...",10k+ employees,1976,Private,239,"['azure', 'amazon-web-services', 'google-cloud...","['Share purchase plan', 'Health insurance (sho..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
48,Equinor,"Energy & Utilities, Green Energy, Oil & Gas",10k+ employees,1972,Private,95,"['javascript', 'typescript', 'c#', 'python', '...","['Promoting life-work balance', 'Supported wel..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
49,Novo Nordisk A/S,"Manufacturing, Pharmaceuticals",10k+ employees,1923,Public,409,"['javascript', 'html', 'css', 'sql', 'python',...","['Working with highly skilled developers', 'Wo..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
50,Amperity,Customer Data Platform,201-500 employees,2015,Private,87,"['clojure', 'clojurescript', 'scala', 'node.js...","['Paid Time Off', 'Mentorship Program', 'Paren..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
51,TESCO PLC,"Information Technology, Retail",1k-5k employees,1919,Public,92,"['java', 'automation', 'node.js', 'docker', 't...",['Blended approach to home and office working'...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
52,Procter & Gamble,"Consumer Goods, Intelligence Hub, Product Deve...",10k+ employees,1837,Public,50,"['azure', 'python', 'databricks', 'spark', 'sq...","['Financial wellbeing', 'Work life balance', '..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
53,Bloomberg,"Data & Analytics, Financial Technology, News",10k+ employees,1981,Private,480,"['c', 'c++', 'c#', '.net', 'java', 'javascript...","['20+ days of vacation', 'Exclusive discounts ..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
54,Caterpillar Inc.,"Internet of Things, Manufacturing, Software De...",10k+ employees,Public,435,,"['java', 'javascript', 'python', 'scala', 'sql...","['Flexible and Hybrid Work options', 'Health, ..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
55,Smals,"Cloud Computing, Information Technology, Softw...",1k-5k employees,Private,127,,"['java', 'javascript', '.net', 'php', 'c#', 's...",['Challenging projects with a positive impact ...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
56,WirelessCar,Automotive,501-1k employees,1999,Private,126,"['amazon-web-services', 'azure', 'serverless',...","['6 weeks vacation', 'Market salaries, bonus, ..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
57,KVK,Government,1k-5k employees,1803,Public,53,"['.net', 'react', 'java', 'angular', 'c#', 'ty...","['Hybride werken', 'Agile werkwijze', 'Werken ..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
58,Redgate Software,"Computer Software, Databases, DevOps",201-500 employees,1999,Private,177,"['.net', 'sql', 'asp.net-mvc', 'java', 'agile'...","['Access to Unmind, our wellbeing platform.', ..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
59,UBS,"Financial Services, Financial Technology",10k+ employees,Private,103,,"['javascript', 'java', 'kotlin', 'kubernetes',...",['Working with cutting edge tech and agile pra...
63,Capital One - US,"Financial Services, Financial Technology",10k+ employees,1994,Public,4.0,"['javascript', 'java', 'cassandra', 'hbase', '...","['Financial + Retirement', 'Child Care + Paren..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
60,ZEISS Group,"Machine Learning, Medical Devices, Virtual Rea...",10k+ employees,1846,Private,218,"['angular', 'swift', 'c#', 'azure', 'javascrip...","['Flexible working time models', 'Open, dialog..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
61,Citi,"Finance, Financial Technology, Risk Management",10k+ employees,Public,192,,"['java', 'python', 'api', 'management', 'c#', ...","['Medical, Dental and Vision coverage', 'Paid ..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
62,KfW,5k-10k employees,1948,Public,52,,"['java', 'pl1', 'javascript', 'python', 'sap',...","['Marktgerechte Vergütung', 'Betriebliche Alte..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
59,UBS,"Financial Services, Financial Technology",10k+ employees,Private,103,,"['javascript', 'java', 'kotlin', 'kubernetes',...",['Working with cutting edge tech and agile pra...
63,Capital One - US,"Financial Services, Financial Technology",10k+ employees,1994,Public,4.0,"['javascript', 'java', 'cassandra', 'hbase', '...","['Financial + Retirement', 'Child Care + Paren..."


In [None]:
# Checked the current data type for each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   company         64 non-null     object
 1   industry        64 non-null     object
 2   size            64 non-null     object
 3   year_founded    64 non-null     object
 4   status          64 non-null     object
 5   follower_count  51 non-null     object
 6   tech_stack      64 non-null     object
 7   benefits        64 non-null     object
dtypes: object(8)
memory usage: 4.1+ KB


In [None]:
# Set specific data types for each column in the dataset
"""
df["company"] = df["company"].astype("string")
df["industry"] = df["industry"].astype("string")
df["size"] = df["size"].

"""

'\ndf["company"] = df["company"].astype("string")\ndf["industry"] = df["industry"].astype("string")\ndf["size"] = df["size"].\n\n'

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   company         64 non-null     string
 1   industry        64 non-null     object
 2   size            64 non-null     object
 3   year_founded    64 non-null     object
 4   status          64 non-null     object
 5   follower_count  51 non-null     object
 6   tech_stack      64 non-null     object
 7   benefits        64 non-null     object
dtypes: object(7), string(1)
memory usage: 4.1+ KB


In [None]:
# Separated the "tech_stack" column into multiple columns organized by type of tech
# New Columns: "Cloud Services and Infrastructure", "Programing Languages and Frameworks", "Databases and Data Technologies"
