# Stack Overflow Webscraping Project

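This notebook scrapes company profiles from the Stack Overflow company listing pages with `requests` and BeautifulSoup, collecting each company's name, industry, size, year founded, status, follower count, tech stack, and benefits. The scraped table is then saved to CSV and cleaned with pandas: misplaced values are moved back to their proper columns, and the tech stack is prepared for grouping into technology categories.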

In [1]:
# Import libraries and modules
import pandas as pd
import ast
import time
import requests
from bs4 import BeautifulSoup

## Webscraping Stack Overflow

In [2]:
# Create an empty DataFrame with one flat, string-labelled column per scraped field.
# The names are passed as a single flat list: wrapping them in a second list
# ([[...]]) makes pandas build a MultiIndex header instead of plain column labels.
df = pd.DataFrame(
    columns=[
        "company",
        "industry",
        "size",
        "year_founded",
        "status",
        "follower_count",
        "tech_stack",
        "benefits",
    ]
)

# Last expression of the cell: rendered as the cell output
df

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits


In [3]:
# Collects specific data points from a Stack Overflow company profile page
def add_data_to_df(soupy):
    """Extract one company's details from its profile page and append them to ``df``.

    Args:
        soupy: Parsed HTML of a single company profile page (the BeautifulSoup
            object the caller builds from the page's response text).

    Returns:
        None. The scraped values are written to the module-level DataFrame
        ``df`` as the row labelled ``len(df.index)``, in the order: company
        name, industry, size, year founded, status, followers, tech stack
        (list of strings), benefits (list of strings).
    """
    # Company name: text of the first <h1>; None when the page has no <h1>
    # (instead of failing with AttributeError on ``None.text``).
    heading = soupy.find("h1")
    company_name = heading.text.strip() if heading is not None else None

    about_company = soupy.find_all("span", class_="d-block")

    # The "about" facts are read purely by position: the <span class="d-block">
    # elements at indices 12-16 are taken as industry, size, year founded,
    # status and followers. A position past the end of the list yields None.
    # NOTE: when a page omits one of these facts, the remaining values sit one
    # position earlier, so they end up in the wrong variables here.
    about_indices = (12, 13, 14, 15, 16)
    industry, size, year_founded, status, followers = [
        about_company[position].text.strip() if position < len(about_company) else None
        for position in about_indices
    ]

    # Tech stack: text of every tag link on the page
    tech_stack = [
        tag.text.strip()
        for tag in soupy.find_all("a", class_="flex--item s-tag no-tag-menu")
    ]

    # Benefits: text of every benefit entry on the page
    benefits = [
        entry.text.strip()
        for entry in soupy.find_all(
            "div", class_="flex--item pl8 pt2 fw-normal fs-body2 fc-black-700"
        )
    ]

    # Add the scraped values as a new row of the global dataframe
    df.loc[len(df.index)] = [
        company_name,
        industry,
        size,
        year_founded,
        status,
        followers,
        tech_stack,
        benefits,
    ]

In [4]:
# Visit every company profile linked from one listing page and scrape it

def scrape_page(page_link):
    """Scrape all company profiles that a listing page links to.

    Args:
        page_link: Parsed HTML of one company-listing page (a BeautifulSoup
            object, despite the parameter name — it is searched with
            ``find_all``, not requested as a URL).

    Returns:
        None. Each profile that answers with status 200 is passed to
        ``add_data_to_df``; other outcomes are only reported with ``print``.
    """
    # Every <a class="s-link"> whose href contains "/jobs/companies"
    company_anchors = page_link.find_all(
        "a", class_="s-link", href=lambda href: href and "/jobs/companies" in href
    )

    # The hrefs are site-relative, so prefix the host to get the full profile URL
    company_links = [
        "https://stackoverflow.com" + anchor["href"] for anchor in company_anchors
    ]

    # Request each company page, retrying up to three times when rate-limited
    for company_url in company_links:
        attempts_left = 3

        while attempts_left > 0:
            # The request is issued inside the loop so that every retry really
            # contacts the server again instead of re-checking a stale response.
            response = requests.get(company_url)

            if response.status_code == 200:
                print("Successful connection")
                # "html" names a markup type, not a specific parser.
                # NOTE(review): which parser BeautifulSoup picks may depend on
                # what is installed — confirm before relying on exact positions.
                soupy = BeautifulSoup(response.text, "html")
                add_data_to_df(soupy)
                break

            elif response.status_code == 429:
                print("Response 429, reattempting...")
                time.sleep(5)  # Server is handling too many requests --> wait five seconds, then retry
                attempts_left -= 1

            elif response.status_code == 403:
                print("Access Denied")
                break

            else:
                print(response.status_code)
                break

In [5]:
# Walk the paginated company listing; for each page that loads, scrape every
# company profile it links to (same status handling as in scrape_page)

attempts_per_page = 3
max_pages = 7

for page in range(1, max_pages + 1):
    url = f"https://stackoverflow.com/jobs/companies?pg={page}"

    # Fresh retry budget for every page, so one rate-limited page cannot use up
    # the attempts of all the pages that come after it.
    connection_attempts = attempts_per_page

    while connection_attempts > 0:
        response = requests.get(url)

        if response.status_code == 200:
            print(f"Successfully connected to page:{page}")
            soup = BeautifulSoup(response.text, "html")
            scrape_page(soup)
            break

        elif response.status_code == 429:
            print("Response [429]: Reattempting to connect")
            time.sleep(5)  # rate-limited: wait five seconds before asking again
            connection_attempts -= 1

        elif response.status_code == 403:
            print("Response [403]: Access denied")
            break

        else:
            print("Invalid link")
            break

    # Only reached with 0 when every attempt for this page was answered with 429
    if connection_attempts == 0:
        print("Max number of attempts excceded")

Successfully connected to page:1
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successfully connected to page:2
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successfully connected to page:3
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successfully connected to page:4
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful connection
Successful

In [6]:
# Show the extracted data: one row for each company profile passed to add_data_to_df
display(df)

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
0,Audible,"Agile Software Development, Content Marketing,...",1k-5k employees,1995,Public,530,"[java, git, jquery, angular.js, requires.js, h...","[Health, Vision, & Dental, Amazon Restricted S..."
1,DISH,"Cloud Services, Information Technology, Wirele...",10k+ employees,1980,Public,169,"[amazon-web-services, azure, devops, f5, linux...","[Diversity employee resource groups, Tuition r..."
2,Global Industrial,"B2B Sales, eCommerce, Information Technology",1k-5k employees,1949,Public,109,"[java, jakarta-ee, spring, reactjs, next.js, j...",[Employee Stock Purchase Plan – 15% discount o...
3,National Security Agency,"Cybersecurity, Federal Agencies, Signals Analysis",10k+ employees,1952,Public,724,"[java, c, c++, angular, javascript, node.js]",[Work that contributes to the mission of prote...
4,EMBL-EBI (EMBL's European Bioinformatics Insti...,"Big Data, Data Science, Life Sciences",501-1k employees,1992,Public,165,"[javascript, java, angular, reactjs, node.js, ...","[Private Health Insurance, 30 days annual leav..."
...,...,...,...,...,...,...,...,...
58,ZEISS Group,"Machine Learning, Medical Devices, Virtual Rea...",10k+ employees,1846,Private,220,"[angular, swift, c#, azure, javascript, python...","[Flexible working time models, Open, dialogue-..."
59,Citi,"Finance, Financial Technology, Risk Management",10k+ employees,Public,196,,"[java, python, api, management, c#, ai, cloud-...","[Medical, Dental and Vision coverage, Paid Tim..."
60,Capital One – Mexico City,"Financial Services, Financial Technology",10k+ employees,1994,Public,5,"[javascript, scala, python, java, cassandra, h...","[Health Insurance + Wellness, Vacation + Paid ..."
61,ADT LLC,"Home Security, Information Technology, Product...",10k+ employees,1874,Public,1,"[dynatrace, dockerfile, terraform, linux, kube...","[Medical, dental, prescription drug, and visio..."


## Data Cleaning and Feature Engineering

### Correcting Misplaced Values

In [7]:
# Save the scraped table to companies.csv (without the index column) and load it
# back from that file, so the cleaning steps work on exactly what the CSV contains.
# Note: df is rebound here to the reloaded table; the list-valued "tech_stack" and
# "benefits" cells come back from the file as text, not as Python lists.
df.to_csv("companies.csv", index = False)
df = pd.read_csv("companies.csv")
display(df)

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
0,Audible,"Agile Software Development, Content Marketing,...",1k-5k employees,1995,Public,530,"['java', 'git', 'jquery', 'angular.js', 'requi...","['Health, Vision, & Dental', 'Amazon Restricte..."
1,DISH,"Cloud Services, Information Technology, Wirele...",10k+ employees,1980,Public,169,"['amazon-web-services', 'azure', 'devops', 'f5...","['Diversity employee resource groups', 'Tuitio..."
2,Global Industrial,"B2B Sales, eCommerce, Information Technology",1k-5k employees,1949,Public,109,"['java', 'jakarta-ee', 'spring', 'reactjs', 'n...",['Employee Stock Purchase Plan – 15% discount ...
3,National Security Agency,"Cybersecurity, Federal Agencies, Signals Analysis",10k+ employees,1952,Public,724,"['java', 'c', 'c++', 'angular', 'javascript', ...",['Work that contributes to the mission of prot...
4,EMBL-EBI (EMBL's European Bioinformatics Insti...,"Big Data, Data Science, Life Sciences",501-1k employees,1992,Public,165,"['javascript', 'java', 'angular', 'reactjs', '...","['Private Health Insurance', '30 days annual l..."
...,...,...,...,...,...,...,...,...
58,ZEISS Group,"Machine Learning, Medical Devices, Virtual Rea...",10k+ employees,1846,Private,220,"['angular', 'swift', 'c#', 'azure', 'javascrip...","['Flexible working time models', 'Open, dialog..."
59,Citi,"Finance, Financial Technology, Risk Management",10k+ employees,Public,196,,"['java', 'python', 'api', 'management', 'c#', ...","['Medical, Dental and Vision coverage', 'Paid ..."
60,Capital One – Mexico City,"Financial Services, Financial Technology",10k+ employees,1994,Public,5,"['javascript', 'scala', 'python', 'java', 'cas...","['Health Insurance + Wellness', 'Vacation + Pa..."
61,ADT LLC,"Home Security, Information Technology, Product...",10k+ employees,1874,Public,1,"['dynatrace', 'dockerfile', 'terraform', 'linu...","['Medical, dental, prescription drug, and visi..."


#### Correcting Misplaced "industry" Column Values

In [8]:
# Identify the companies that have misplaced industry values.
# Certain companies did not list their industry, so their data was scraped one
# column too far left and a size such as "10k+ employees" sits under "industry".
# astype(str) makes missing entries searchable text instead of raising on them;
# regex=False treats "employees" as plain text.
size_in_industry = df["industry"].astype(str).str.contains("employees", regex=False)

# One table with every affected row
display(df[size_in_industry])


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
49,Novo Nordisk A/S,10k+ employees,1923,Public,412,,"['javascript', 'html', 'css', 'sql', 'python',...","['Working with highly skilled developers', 'Wo..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
55,KVK,1k-5k employees,1803,Public,53,,"['.net', 'react', 'java', 'angular', 'c#', 'ty...","['Hybride werken', 'Agile werkwijze', 'Werken ..."


In [9]:
# Correct the rows whose values were scraped one column too far left: move the
# contents of "industry", "size", "year_founded" and "status" one column to the
# right, then blank out "industry" (the company did not list one).
# The mask is built with astype(str), so rows whose industry is already missing
# are simply not selected — the cell can be re-run without raising a TypeError.
shifted = df["industry"].astype(str).str.contains("employees", regex=False)

# Copy right-to-left so each source column is read before it gets overwritten
for target, source in [
    ("follower_count", "status"),
    ("status", "year_founded"),
    ("year_founded", "size"),
    ("size", "industry"),
]:
    df.loc[shifted, target] = df.loc[shifted, source]

df.loc[shifted, "industry"] = None

# Show the corrected rows (selected by label through the same boolean mask)
display(df.loc[shifted])


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
49,Novo Nordisk A/S,,10k+ employees,1923,Public,412,"['javascript', 'html', 'css', 'sql', 'python',...","['Working with highly skilled developers', 'Wo..."


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
55,KVK,,1k-5k employees,1803,Public,53,"['.net', 'react', 'java', 'angular', 'c#', 'ty...","['Hybride werken', 'Agile werkwijze', 'Werken ..."


#### Correcting Misplaced "year_founded" and "status" Column Values

In [10]:
# Identify companies whose "year_founded" cell holds a status instead of a year.
# The rows with incorrect "year_founded" values also have no "follower_count"
# value, meaning these companies are missing their "year_founded" data and the
# status / follower count were scraped one column too far left.
year_founded_values = df["year_founded"].unique()
status_labels = ("Public", "Private", "VC Funded")

for value in year_founded_values:
    # Missing entries (None/NaN) are not strings and cannot contain a label;
    # skipping them keeps this cell runnable after the values have been fixed.
    if isinstance(value, str) and any(label in value for label in status_labels):
        display(df[df["year_founded"] == value])
        

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
9,Deutsche Börse Group,"Customer Data Platform, Financial Technology, ...",10k+ employees,Private,103,,"['c++', 'java', 'python', 'google-cloud-platfo...",['HYBRID WORK – our working model combines the...
37,Gatekeeper,"Computer Software, Enterprise Software, Legal ...",51-200 employees,Private,42,,"['ruby-on-rails', 'javascript', 'hotwire-rails...",['Fully remote working. Our team currently spa...
53,Smals,"Cloud Computing, Information Technology, Softw...",1k-5k employees,Private,127,,"['java', 'javascript', '.net', 'php', 'c#', 's...",['Challenging projects with a positive impact ...
57,UBS,"Financial Services, Financial Technology",10k+ employees,Private,104,,"['javascript', 'java', 'kotlin', 'kubernetes',...",['Working with cutting edge tech and agile pra...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
10,"Logius, onderdeel ministerie BZK","Computer Software, Government, IT Consulting",501-1k employees,Public,44,,"['grafana', 'kibana', 'thanos', 'minio', 'harb...","['Individueel keuze budget (IKB) van 16,37% ov..."
43,Roblox,"Communications, Software Development, Virtual ...",1k-5k employees,Public,177,,"['c#', 'c++', 'lua', 'python', 'node.js', 'mac...","['Robust medical, dental, and vision coverage'..."
45,Warner Bros. Discovery,"Broadcast, Digital Media, Entertainment",10k+ employees,Public,451,,"['java', 'spring-boot', 'javascript', 'python-...","['Health insurance (medical, dental, vision, p..."
52,Caterpillar Inc.,"Internet of Things, Manufacturing, Software De...",10k+ employees,Public,438,,"['java', 'javascript', 'python', 'scala', 'sql...","['Flexible and Hybrid Work options', 'Health, ..."
59,Citi,"Finance, Financial Technology, Risk Management",10k+ employees,Public,196,,"['java', 'python', 'api', 'management', 'c#', ...","['Medical, Dental and Vision coverage', 'Paid ..."


In [11]:
# Shift "year_founded" and "status" one cell to the right (into "status" and
# "follower_count") for rows whose "year_founded" holds a status, then set
# "year_founded" to None.
# The affected rows are selected here from the current data rather than from a
# list computed in another cell; astype(str) lets missing entries be searched
# without raising, so the cell is safe to re-run.
status_in_year = df["year_founded"].astype(str).str.contains("Public|Private|VC Funded")

# Right-to-left, so "status" is copied out before it is overwritten
df.loc[status_in_year, "follower_count"] = df.loc[status_in_year, "status"]
df.loc[status_in_year, "status"] = df.loc[status_in_year, "year_founded"]
df.loc[status_in_year, "year_founded"] = None

# Show the corrected rows (selected by label through the same boolean mask)
display(df.loc[status_in_year])
        

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
9,Deutsche Börse Group,"Customer Data Platform, Financial Technology, ...",10k+ employees,,Private,103,"['c++', 'java', 'python', 'google-cloud-platfo...",['HYBRID WORK – our working model combines the...
37,Gatekeeper,"Computer Software, Enterprise Software, Legal ...",51-200 employees,,Private,42,"['ruby-on-rails', 'javascript', 'hotwire-rails...",['Fully remote working. Our team currently spa...
53,Smals,"Cloud Computing, Information Technology, Softw...",1k-5k employees,,Private,127,"['java', 'javascript', '.net', 'php', 'c#', 's...",['Challenging projects with a positive impact ...
57,UBS,"Financial Services, Financial Technology",10k+ employees,,Private,104,"['javascript', 'java', 'kotlin', 'kubernetes',...",['Working with cutting edge tech and agile pra...


Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
10,"Logius, onderdeel ministerie BZK","Computer Software, Government, IT Consulting",501-1k employees,,Public,44,"['grafana', 'kibana', 'thanos', 'minio', 'harb...","['Individueel keuze budget (IKB) van 16,37% ov..."
43,Roblox,"Communications, Software Development, Virtual ...",1k-5k employees,,Public,177,"['c#', 'c++', 'lua', 'python', 'node.js', 'mac...","['Robust medical, dental, and vision coverage'..."
45,Warner Bros. Discovery,"Broadcast, Digital Media, Entertainment",10k+ employees,,Public,451,"['java', 'spring-boot', 'javascript', 'python-...","['Health insurance (medical, dental, vision, p..."
52,Caterpillar Inc.,"Internet of Things, Manufacturing, Software De...",10k+ employees,,Public,438,"['java', 'javascript', 'python', 'scala', 'sql...","['Flexible and Hybrid Work options', 'Health, ..."
59,Citi,"Finance, Financial Technology, Risk Management",10k+ employees,,Public,196,"['java', 'python', 'api', 'management', 'c#', ...","['Medical, Dental and Vision coverage', 'Paid ..."


### Splitting the "tech_stack" and "benefits" Columns into Multiple Columns

#### Splitting Up the "tech_stack" Column

In [13]:
# Look at the first company's "tech_stack" cell as it was read back from the CSV
df["tech_stack"][0]

"['java', 'git', 'jquery', 'angular.js', 'requires.js', 'html5', 'css', 'css3', 'javascript', 'perl', 'python', 'ant', 'maven', 'jrebel', 'http', 'live', 'streaming', 'hds', 'smoothstreaming', 'amazon', 'web', 'services', 'ruby', 'cucumber', 'dojo', 'mason', 'jsp', 'junit', 'testng', 'selenium', 'sql', 'hadoop', 'ios', 'objective-c', 'android', 'underscore', 'js', 'sass', 'amd', 'node', 'ipad', 'spring', 'r', 'rstudio']"

In [14]:

# Change each "tech_stack" cell from the text read back from the CSV into a real list.
# Only string cells are parsed: ast.literal_eval raises ValueError when it is handed
# an existing list, so without this check the cell would fail when run a second time.
df["tech_stack"] = df["tech_stack"].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)


print(df["tech_stack"].iloc[0])


['java', 'git', 'jquery', 'angular.js', 'requires.js', 'html5', 'css', 'css3', 'javascript', 'perl', 'python', 'ant', 'maven', 'jrebel', 'http', 'live', 'streaming', 'hds', 'smoothstreaming', 'amazon', 'web', 'services', 'ruby', 'cucumber', 'dojo', 'mason', 'jsp', 'junit', 'testng', 'selenium', 'sql', 'hadoop', 'ios', 'objective-c', 'android', 'underscore', 'js', 'sass', 'amd', 'node', 'ipad', 'spring', 'r', 'rstudio']


In [15]:
# Flatten every company's tech stack into one long list of technology names.
# Duplicates are still included at this point; they are removed in a later cell.
# Iterating over the column's values (rather than indexing by 0..n-1 labels)
# works regardless of how the dataframe's index is labelled.
tech_stack_list = [tech for row in df["tech_stack"] for tech in row]

tech_stack_df = pd.DataFrame({"Tech_stack": tech_stack_list})
tech_stack_df

Unnamed: 0,Tech_stack
0,java
1,git
2,jquery
3,angular.js
4,requires.js
...,...
1785,nosql
1786,docker
1787,kubernetes
1788,spring-boot


In [16]:
# Count the rows of the flattened technology table that repeat an earlier row
tech_stack_df.duplicated().sum()

1192

In [17]:
# Drop the repeated technology names; the surviving rows keep their original
# row labels (the index is not reset)
tech_stack_df = tech_stack_df.drop_duplicates()
tech_stack_df

Unnamed: 0,Tech_stack
0,java
1,git
2,jquery
3,angular.js
4,requires.js
...,...
1714,data-security
1733,rdbms
1739,dockerfile
1746,confluence


In [30]:
# Display the de-duplicated technology table
display(tech_stack_df)

Unnamed: 0,Tech_stack
0,java
1,git
2,jquery
3,angular.js
4,requires.js
...,...
1714,data-security
1733,rdbms
1739,dockerfile
1746,confluence


In [33]:
# List the distinct technology names as a single array
tech_stack_df["Tech_stack"].unique()

array(['java', 'git', 'jquery', 'angular.js', 'requires.js', 'html5',
       'css', 'css3', 'javascript', 'perl', 'python', 'ant', 'maven',
       'jrebel', 'http', 'live', 'streaming', 'hds', 'smoothstreaming',
       'amazon', 'web', 'services', 'ruby', 'cucumber', 'dojo', 'mason',
       'jsp', 'junit', 'testng', 'selenium', 'sql', 'hadoop', 'ios',
       'objective-c', 'android', 'underscore', 'js', 'sass', 'amd',
       'node', 'ipad', 'spring', 'r', 'rstudio', 'amazon-web-services',
       'azure', 'devops', 'f5', 'linux', 'adobe-commerce',
       'adobe-experience-manager', 'alm', 'ansible', 'apache',
       'apache-kafka', 'api', 'apigee', 'bdd', 'c#', 'c++', 'cisco',
       'aws-cloudformation', 'cloudnit', 'confluent-platform', 'docker',
       'amazon-elastic-beanstalk', 'elasticsearch', 'esxi', 'fabric',
       'gitlab', 'go', 'google-cloud-platform', 'influxdb', 'jakarta-ee',
       'json', 'kanban', 'kubernetes', 'ian', 'unix', 'minitest',
       'mulesoft', 'mysql', 'new

In [38]:
# Utilized ChatGPT to organize and sort the list of unique technologies into specific categories based on function
# NOTE(review): the names in these lists are written display-style (e.g. 'Java',
# 'Amazon Web Services (AWS)'), while the scraped tags in df["tech_stack"] are
# lowercase and hyphenated (e.g. 'java', 'amazon-web-services'), so an exact string
# comparison between the two will not match. Some technologies (e.g. 'Git',
# 'Jenkins') also appear in more than one category list.

# General-purpose programming languages
programming_languages = [
    'Java', 'C', 'C#', 'C++', 'Perl', 'Python', 'Ruby', 'PHP', 'Kotlin', 
    'Swift', 'Objective-C', 'Rust', 'Lua', 'Scala', 'Go', 'R', 'MATLAB', 
    'Haskell', 'Dart'
]

# Front-end and back-end web languages, frameworks and libraries
web_development = [
    'HTML', 'HTML5', 'CSS', 'CSS3', 'JavaScript', 'TypeScript', 'JQuery', 
    'React', 'ReactJS', 'Vue.js', 'Angular', 'Angular.js', 'Bootstrap-5', 
    'Next.js', 'Sass', 'Ember.js', 'Dojo', 'Styled-Components', 'Knockout.js', 
    'Thymeleaf', 'ASP.NET MVC', 'Node.js', 'Spring', 'Spring Boot', 'Django', 
    'Flask', 'Express', 'Ruby on Rails', '.NET', 'ASP.NET', 'ASP.NET Core', 
    'PHP', 'Java EE', 'Wicket', 'JSP', 'JPA', 'Hibernate', 'Quarkus', 'Grails'
]

# Cloud providers and their individual services
cloud_platforms = [
    'Amazon Web Services (AWS)', 'Azure', 'Google Cloud Platform (GCP)', 
    'IBM Cloud', 'Oracle Cloud', 'Rackspace', 'VMware', 'Alibaba Cloud', 
    'AWS CloudFormation', 'AWS Lambda', 'Amazon EC2', 'Amazon RDS', 
    'Amazon S3', 'Amazon DynamoDB', 'Amazon Athena', 'Amazon Glue', 
    'Amazon Aurora', 'Amazon Kinesis', 'AWS IoT', 'AWS Glue', 'Amazon Redshift', 
    'AWS RDS', 'Azure Synapse', 'Azure Data Lake', 'Azure Cosmos DB', 
    'Azure OpenAI', 'Azure API Management', 'Azure DevOps'
]

# Build, deployment, infrastructure and monitoring tooling
devops_infrastructure = [
    'DevOps', 'Jenkins', 'Docker', 'Kubernetes', 'Terraform', 'Ansible', 
    'Git', 'GitLab', 'GitHub Actions', 'GitLab CI', 'CircleCI', 'Octopus Deploy', 
    'TeamCity', 'CloudFoundry', 'Rancher', 'AKS', 'OpenShift', 'Openshift Origin', 
    'ArgoCD', 'CI/CD', 'IaC (Infrastructure as Code)', 'Helm', 'Packer', 
    'Confluent Platform', 'Prometheus', 'Grafana', 'Kibana', 'Datadog', 
    'Splunk', 'Sentry', 'Elastic Stack', 'ELK', 'Fluentd', 'Fluent-bit', 
    'Chef', 'Puppet', 'Mulesoft', 'Nomad', 'Consul'
]

# Databases, search engines and data-processing platforms
databases_data_management = [
    'MySQL', 'PostgreSQL', 'PostgreSQL-9.1', 'MSSQL', 'T-SQL', 'Teradata', 
    'SQLite', 'MariaDB', 'SQL Server', 'DB2', 'MongoDB', 'Cassandra', 
    'Couchbase', 'HBase', 'DynamoDB', 'Redis', 'InfluxDB', 'Elasticsearch', 
    'Clickhouse', 'Lucene', 'Solr', 'Neo4j', 'Apache Flink', 'Amazon Athena', 
    'Hadoop', 'HDFS', 'Apache Kafka', 'Apache Spark', 'Hive', 'HiveQL', 
    'Trino', 'Apache Beam', 'Databricks', 'Snowflake', 'Redshift', 
    'Data Lake', 'CosmosDB', 'Data Mesh', 'Data Fabric'
]

# Machine-learning and AI topics, frameworks and services
machine_learning_ai = [
    'Machine Learning', 'Deep Learning', 'TensorFlow', 'PyTorch', 'GPT-4', 
    'Large Language Models (LLMs)', 'Natural Language Processing (NLP)', 
    'Conversational AI', 'Neural Networks', 'Approximate NN Searching', 
    'Machine Translation', 'Computer Vision', 'Data Science', 
    'Supervised Learning', 'OpenAI API', 'Azure OpenAI'
]

# Source control and team collaboration tools
version_control_collaboration = [
    'Git', 'GitHub', 'GitLab', 'Bitbucket', 'SVN', 'Confluence', 'Jira', 
    'Miro', 'Figma', 'Jenkins', 'Sonarqube', 'Testrail', 'GitHub Copilot'
]

# Test frameworks and testing / automation practices
software_testing_automation = [
    'Selenium', 'JUnit', 'JUnit Jupiter', 'TestNG', 'Mocha', 'Jest', 
    'Cucumber', 'RSpec', 'PyTest', 'ReadyAPI', 'Minitest', 'Fitnesse', 
    'Automation', 'Performance Testing', 'Load Balancing', 'Continuous Integration', 
    'Continuous Deployment (CI/CD)', 'Automated Testing', 'Code Coverage', 
    'Chaos Testing'
]

# Mobile platforms, languages and frameworks
mobile_development = [
    'Android', 'iOS', 'Objective-C', 'Swift', 'Kotlin', 'Flutter', 
    'React Native', 'Xamarin', 'Cordova', 'Ionic Framework', 'CocoaPods', 
    'Appium', 'Android Honeycomb (3.0)'
]

# Networking protocols and security / identity tooling.
# Each name is listed once ('OAuth' and 'OAuth 2.0' were previously repeated).
# NOTE(review): 'OpenAI' does not look like a networking/security technology —
# confirm whether it belongs in this category.
networking_security = [
    'TCP/IP', 'TLS', 'SSH', 'SAML', 'OAuth', 'OAuth 2.0', 'OpenVPN',
    'SSL', 'RDP', 'IAM (Identity Access Management)', 'Okta', 'Apigee',
    'F5', 'Imperva', 'Vault', 'DevSecOps', 'OpenID Connect', 'Cert-Manager',
    'OpenAI', 'Network Security', 'SSL/TLS'
]

# Container runtimes, orchestration and related tooling
containerization = [
    'Docker', 'Dockerfile', 'Kubernetes', 'OpenShift', 'Helm', 'Kubernetes-native (K8s)', 
    'ECS (Elastic Container Service)', 'Nomad', 'Istio', 'OpenTelemetry'
]

# Business intelligence, ERP/CRM and other enterprise applications
business_tools = [
    'Tableau', 'Power BI', 'Salesforce', 'Alteryx', 'Coupa', 'Celonis', 
    'SAP (multiple modules like S4 HANA, SAP SuccessFactors)', 'BizTalk', 
    'Dynamics 365', 'Manhattan WMOS', 'IBM Mainframe', 'SharePoint', 
    'SAP Solution Manager', 'Oracle EBS', 'Signavio'
]

# Catch-all list; several of these names also appear in the other category lists
other_technologies = [
    'Jenkins', 'JMeter', 'HP-UFT', 'Cypress', 'Postman', 'TDD', 'BDD', 
    'Ranorex', 'GitLab CI', 'Chef Infra', 'Azure DevOps', 'Octopus Deploy', 
    'Spring', 'Spring Boot', 'Node.js', 'Apache Kafka', 'Kafka Consumer API', 
    'React Native', 'Redux', 'Dagger', 'Akka', 'Quarkus', 'Java EE', 
    'Hibernate', 'Thymeleaf', 'Fitnesse', 'Linux', 'Unix', 'Windows', 
    'macOS', 'ESXi', 'RedHat', 'CentOS', 'SPSS', 'Splunk', 'NewRelic', 
    'Datadog', 'Dynatrace', 'Piwik'
]

# All category lists collected into one list of lists, so a category can be
# reached by position (tech_stack[0] is programming_languages, and so on)
tech_stack = [
    programming_languages, web_development, cloud_platforms, devops_infrastructure, 
    databases_data_management, machine_learning_ai, version_control_collaboration, software_testing_automation, 
    mobile_development, networking_security, containerization, business_tools, other_technologies
]

In [51]:
# Work on a copy of df for the category-column experiment
test_df = df.copy()
test_df

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
0,Audible,"Agile Software Development, Content Marketing,...",1k-5k employees,1995,Public,530,"[java, git, jquery, angular.js, requires.js, h...","['Health, Vision, & Dental', 'Amazon Restricte..."
1,DISH,"Cloud Services, Information Technology, Wirele...",10k+ employees,1980,Public,169,"[amazon-web-services, azure, devops, f5, linux...","['Diversity employee resource groups', 'Tuitio..."
2,Global Industrial,"B2B Sales, eCommerce, Information Technology",1k-5k employees,1949,Public,109,"[java, jakarta-ee, spring, reactjs, next.js, j...",['Employee Stock Purchase Plan – 15% discount ...
3,National Security Agency,"Cybersecurity, Federal Agencies, Signals Analysis",10k+ employees,1952,Public,724,"[java, c, c++, angular, javascript, node.js]",['Work that contributes to the mission of prot...
4,EMBL-EBI (EMBL's European Bioinformatics Insti...,"Big Data, Data Science, Life Sciences",501-1k employees,1992,Public,165,"[javascript, java, angular, reactjs, node.js, ...","['Private Health Insurance', '30 days annual l..."
...,...,...,...,...,...,...,...,...
58,ZEISS Group,"Machine Learning, Medical Devices, Virtual Rea...",10k+ employees,1846,Private,220,"[angular, swift, c#, azure, javascript, python...","['Flexible working time models', 'Open, dialog..."
59,Citi,"Finance, Financial Technology, Risk Management",10k+ employees,,Public,196,"[java, python, api, management, c#, ai, cloud-...","['Medical, Dental and Vision coverage', 'Paid ..."
60,Capital One – Mexico City,"Financial Services, Financial Technology",10k+ employees,1994,Public,5,"[javascript, scala, python, java, cassandra, h...","['Health Insurance + Wellness', 'Vacation + Pa..."
61,ADT LLC,"Home Security, Information Technology, Product...",10k+ employees,1874,Public,1,"[dynatrace, dockerfile, terraform, linux, kube...","['Medical, dental, prescription drug, and visi..."


In [52]:
# Add one placeholder column (every value None) per technology category; the
# column names are the same as the names of the category lists
test_df[[
    "programming_languages", "web_development", "cloud_platforms", "devops_infrastructure", 
    "databases_data_management", "machine_learning_ai", "version_control_collaboration", "software_testing_automation", 
    "mobile_development", "networking_security","containerization", "business_tools", "other_technologies"
]] = None

test_df

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits,programming_languages,web_development,...,devops_infrastructure,databases_data_management,machine_learning_ai,version_control_collaboration,software_testing_automation,mobile_development,networking_security,containerization,business_tools,other_technologies
0,Audible,"Agile Software Development, Content Marketing,...",1k-5k employees,1995,Public,530,"[java, git, jquery, angular.js, requires.js, h...","['Health, Vision, & Dental', 'Amazon Restricte...",,,...,,,,,,,,,,
1,DISH,"Cloud Services, Information Technology, Wirele...",10k+ employees,1980,Public,169,"[amazon-web-services, azure, devops, f5, linux...","['Diversity employee resource groups', 'Tuitio...",,,...,,,,,,,,,,
2,Global Industrial,"B2B Sales, eCommerce, Information Technology",1k-5k employees,1949,Public,109,"[java, jakarta-ee, spring, reactjs, next.js, j...",['Employee Stock Purchase Plan – 15% discount ...,,,...,,,,,,,,,,
3,National Security Agency,"Cybersecurity, Federal Agencies, Signals Analysis",10k+ employees,1952,Public,724,"[java, c, c++, angular, javascript, node.js]",['Work that contributes to the mission of prot...,,,...,,,,,,,,,,
4,EMBL-EBI (EMBL's European Bioinformatics Insti...,"Big Data, Data Science, Life Sciences",501-1k employees,1992,Public,165,"[javascript, java, angular, reactjs, node.js, ...","['Private Health Insurance', '30 days annual l...",,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,ZEISS Group,"Machine Learning, Medical Devices, Virtual Rea...",10k+ employees,1846,Private,220,"[angular, swift, c#, azure, javascript, python...","['Flexible working time models', 'Open, dialog...",,,...,,,,,,,,,,
59,Citi,"Finance, Financial Technology, Risk Management",10k+ employees,,Public,196,"[java, python, api, management, c#, ai, cloud-...","['Medical, Dental and Vision coverage', 'Paid ...",,,...,,,,,,,,,,
60,Capital One – Mexico City,"Financial Services, Financial Technology",10k+ employees,1994,Public,5,"[javascript, scala, python, java, cassandra, h...","['Health Insurance + Wellness', 'Vacation + Pa...",,,...,,,,,,,,,,
61,ADT LLC,"Home Security, Information Technology, Product...",10k+ employees,1874,Public,1,"[dynatrace, dockerfile, terraform, linux, kube...","['Medical, dental, prescription drug, and visi...",,,...,,,,,,,,,,


In [48]:
# First entry of the first category list (programming_languages)
tech_stack[0][0]

'Java'

In [54]:
# First tag of the first company's scraped tech stack, for comparison with the
# spelling/capitalisation used in the category lists
test_df["tech_stack"].iloc[0][0]

'java'

### Checking and Setting Column Data Types

In [19]:
# Check the current data type and non-null count of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   company         63 non-null     object
 1   industry        61 non-null     object
 2   size            63 non-null     object
 3   year_founded    54 non-null     object
 4   status          63 non-null     object
 5   follower_count  62 non-null     object
 6   tech_stack      63 non-null     object
 7   benefits        63 non-null     object
dtypes: object(8)
memory usage: 4.1+ KB


In [20]:
# Display the cleaned dataframe
df

Unnamed: 0,company,industry,size,year_founded,status,follower_count,tech_stack,benefits
0,Audible,"Agile Software Development, Content Marketing,...",1k-5k employees,1995,Public,530,"[java, git, jquery, angular.js, requires.js, h...","['Health, Vision, & Dental', 'Amazon Restricte..."
1,DISH,"Cloud Services, Information Technology, Wirele...",10k+ employees,1980,Public,169,"[amazon-web-services, azure, devops, f5, linux...","['Diversity employee resource groups', 'Tuitio..."
2,Global Industrial,"B2B Sales, eCommerce, Information Technology",1k-5k employees,1949,Public,109,"[java, jakarta-ee, spring, reactjs, next.js, j...",['Employee Stock Purchase Plan – 15% discount ...
3,National Security Agency,"Cybersecurity, Federal Agencies, Signals Analysis",10k+ employees,1952,Public,724,"[java, c, c++, angular, javascript, node.js]",['Work that contributes to the mission of prot...
4,EMBL-EBI (EMBL's European Bioinformatics Insti...,"Big Data, Data Science, Life Sciences",501-1k employees,1992,Public,165,"[javascript, java, angular, reactjs, node.js, ...","['Private Health Insurance', '30 days annual l..."
...,...,...,...,...,...,...,...,...
58,ZEISS Group,"Machine Learning, Medical Devices, Virtual Rea...",10k+ employees,1846,Private,220,"[angular, swift, c#, azure, javascript, python...","['Flexible working time models', 'Open, dialog..."
59,Citi,"Finance, Financial Technology, Risk Management",10k+ employees,,Public,196,"[java, python, api, management, c#, ai, cloud-...","['Medical, Dental and Vision coverage', 'Paid ..."
60,Capital One – Mexico City,"Financial Services, Financial Technology",10k+ employees,1994,Public,5,"[javascript, scala, python, java, cassandra, h...","['Health Insurance + Wellness', 'Vacation + Pa..."
61,ADT LLC,"Home Security, Information Technology, Product...",10k+ employees,1874,Public,1,"[dynatrace, dockerfile, terraform, linux, kube...","['Medical, dental, prescription drug, and visi..."


In [21]:
# Set specific data types for each column in the dataset
# NOTE: the conversions below are wrapped in a triple-quoted string, so they are NOT
# executed — the cell only evaluates (and echoes) the string. Per the inline remark,
# the "int64" cast has an issue with the missing (None) values in "year_founded".
# NOTE(review): pandas' nullable "Int64" dtype accepts missing values and may be a
# suitable replacement for "int64" here — confirm that every non-missing value in
# "year_founded" and "follower_count" is a plain integer before enabling it.

"""
df["company"] = df["company"].astype("string")
df["industry"] = df["industry"].astype("string")
df["size"] = df["size"].astype("string")
df["year_founded"] = df["year_founded"].astype("int64") # issue with int and nonetypes
df["status"] = df["status"].astype("string")
df["follower_count"] = df["follower_count"].astype("int64")
"""


'\ndf["company"] = df["company"].astype("string")\ndf["industry"] = df["industry"].astype("string")\ndf["size"] = df["size"].astype("string")\ndf["year_founded"] = df["year_founded"].astype("int64") # issue with int and nonetypes\ndf["status"] = df["status"].astype("string")\ndf["follower_count"] = df["follower_count"].astype("int64")\n'

In [22]:
# Re-check the column data types after the (currently disabled) conversion cell
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   company         63 non-null     object
 1   industry        61 non-null     object
 2   size            63 non-null     object
 3   year_founded    54 non-null     object
 4   status          63 non-null     object
 5   follower_count  62 non-null     object
 6   tech_stack      63 non-null     object
 7   benefits        63 non-null     object
dtypes: object(8)
memory usage: 4.1+ KB


In [23]:
# Separated the "tech_stack" column into multiple columns organized by type of tech
# New Columns: "Cloud Services and Infrastructure", "Programming Languages and Frameworks", "Databases and Data Technologies"
