In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
raw_txt = None
with open('jd_scraping/output.csv','r') as f:
    raw_text = f.read()

In [3]:
rows = raw_text.split(';')

In [4]:
rows = [row.split('|') for row in rows if row != '']

In [5]:
max([len(r) for r in rows]), min([len(r) for r in rows])

(5, 5)

In [6]:
headers = rows[0]
rows = rows[1:]

In [7]:
rows[0]

['1',
 'Apple',
 'Machine Learning Software Engineer',
 'Here is the comma-separated list of required skills and experience:\n\nbiometrics/face tracking/gaze tracking, modern C++ & Python, algorithm implementation and optimization with platform constraints, computer vision / machine learning product development and delivery, working with sophisticated software stacks, 3 years of professional software development experience.',
 'https://jobs.apple.com/en-us/details/200540712/machine-learning-software-engineer']

In [8]:
data = pd.DataFrame(rows,columns=headers)

In [9]:
data.head()

Unnamed: 0,Srno,Company,Role,Description,Link
0,1,Apple,Machine Learning Software Engineer,Here is the comma-separated list of required s...,https://jobs.apple.com/en-us/details/200540712...
1,2,Apple,Mixed-Signal IP Machine Learning Engineer,"Machine learning, Python, C, VLSI fundamentals...",https://jobs.apple.com/en-us/details/200488665...
2,3,Apple,Senior Machine Learning Software Engineer,Here is the list of required skills and experi...,https://jobs.apple.com/en-us/details/200493299...
3,4,Apple,Pre-Silicon Machine Learning Compiler Engineer,Here is the list of required skills and experi...,https://jobs.apple.com/en-us/details/200520036...
4,5,Apple,Pre-Silicon Machine Learning Compiler Engineer,Here is the list of required skills and experi...,https://jobs.apple.com/en-us/details/200520040...


In [10]:
data['Role'].value_counts()[:50]

Role
Senior Machine Learning Engineer                                               3
Machine Learning Intern                                                        2
Research Scientist                                                             2
Pre-Silicon Machine Learning Compiler Engineer                                 2
Research Engineer                                                              2
Mixed-Signal IP Machine Learning Engineer                                      2
Research Engineer, Interpretability                                            2
Machine Learning GPU Performance Engineer                                      2
Product Engineer, Machine Learning and GPU Accelerators                        2
Research Engineer, Knowledge Bases                                             2
Research Engineer, Human Computer Interfaces                                   2
Backend Engineer, Sora                                                         1
Distributed Systems/ML 

In [11]:
import re
def get_category(role):
    category = 'other'
    role = role.lower()
    role = re.sub(' ', '', role)
    if 'lead' in role or 'leader' in role or 'manager' in role:
        category = 'manager'
    elif 'engineer' in role or 'architect' in role:
        category = 'ml engineer' if 'ml' in role or 'learning' in role or 'research' in role else 'software engineer'
    elif 'research' in role or 'scientist' in role or 'science' in role:
        category = 'scientist'
    
    
    return category


In [12]:
data['Category'] = data['Role'].apply(get_category)

In [13]:
data[['Role','Category']].iloc[:20,:]

Unnamed: 0,Role,Category
0,Machine Learning Software Engineer,ml engineer
1,Mixed-Signal IP Machine Learning Engineer,ml engineer
2,Senior Machine Learning Software Engineer,ml engineer
3,Pre-Silicon Machine Learning Compiler Engineer,ml engineer
4,Pre-Silicon Machine Learning Compiler Engineer,ml engineer
5,Mixed-Signal IP Machine Learning Engineer,ml engineer
6,"Senior Machine Learning Engineer, Health",ml engineer
7,"AIML - Sr Machine Learning Engineer, Data & Ma...",ml engineer
8,"AIML - Machine Learning Engineer or Scientist,...",ml engineer
9,Computer Vision and Machine Learning Engineer,ml engineer


In [14]:
#removing stock response of LLM:
def remove_stock_response(description):
    desc = re.sub('Here is the comma-separated list of required skills and experience:\n\n','',description)
    desc = re.sub('Here is the list of required skills and experience:\n\n','',desc)
    desc = re.sub('Here is the list of required skills and experience:\n\n','',desc)
    desc = re.sub('I apologize*','',desc)
    desc = re.sub('\n','',desc)
    return desc

data['Description'] = data['Description'].apply(remove_stock_response)

In [15]:
import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Top skills overall
def get_skills(description):
    all_skills = {}
    for d in description:
        skills = d.split(',')
        for s in skills:
            s = s.lower()
            if s in all_skills:
                all_skills[s] += 1
            elif s not in stopwords:
                all_skills[s] = 1

    all_skills =  [(s,all_skills[s]) for s in all_skills]
    all_skills.sort(key = lambda x : x[1], reverse=True)
    return all_skills

In [17]:
# Top skills overall
def get_skills_word(description):
    all_skills = {}
    for d in description:
        skills = re.sub(',',' ',d).lower().split(' ')
        for s in skills:
            if s in all_skills:
                all_skills[s] += 1
            elif s not in stopwords and s != '':
                all_skills[s] = 1

    all_skills =  [(s,all_skills[s]) for s in all_skills]
    all_skills.sort(key = lambda x : x[1], reverse=True)
    return all_skills

In [20]:
all_skills = get_skills(data['Description'])
all_skills[:25]

[(' python', 20),
 (' machine learning', 20),
 (' computer engineering', 19),
 (' pytorch', 15),
 (' computer science', 14),
 ('', 14),
 ('machine learning', 11),
 (' c++', 11),
 (' data science', 10),
 (' tensorflow', 10),
 (' deep learning', 10),
 (' communication skills', 10),
 (' or equivalent practical experience', 10),
 ("bachelor's degree in computer science", 10),
 (' c/c++', 9),
 (' java', 9),
 (' or relevant technical field', 9),
 (' multimodal neurons', 9),
 (' scaling laws', 9),
 (' computer vision', 8),
 (' relevant technical field', 8),
 (' circuit-based interpretability', 8),
 (' ai & compute', 8),
 (' concrete problems in ai safety', 8),
 (' reinforcement learning', 7)]

In [21]:
all_skills = get_skills_word(data['Description'])
all_skills[:25]

[('experience', 302),
 ('learning', 187),
 ('computer', 139),
 ('skills', 139),
 ('machine', 111),
 ('science', 103),
 ('job', 83),
 ('research', 81),
 ('years', 70),
 ('ai', 70),
 ('description', 70),
 ('python', 61),
 ('data', 60),
 ('technical', 60),
 ('deep', 58),
 ('required', 55),
 ('engineering', 52),
 ('development', 49),
 ('field', 49),
 ('related', 48),
 ('programming', 47),
 ('degree', 47),
 ('strong', 47),
 ('software', 46),
 ('list', 43)]