In [61]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [62]:
#extracting the raw file
raw_txt = None
with open('jd_scraping/output.csv','r') as f:
    raw_text = f.read()

rows = raw_text.split(';')
rows = [row.split('|') for row in rows if row != '']
print(rows[:5])

[['Srno', 'Company', 'Role', 'Description', 'Link'], ['1', 'Apple', 'Machine Learning Software Engineer', 'Here is the comma-separated list of required skills and experience:\n\nbiometrics/face tracking/gaze tracking, modern C++ & Python, algorithm implementation and optimization with platform constraints, computer vision / machine learning product development and delivery, working with sophisticated software stacks, 3 years of professional software development experience.', 'https://jobs.apple.com/en-us/details/200540712/machine-learning-software-engineer'], ['2', 'Apple', 'Mixed-Signal IP Machine Learning Engineer', 'Machine learning, Python, C, VLSI fundamentals, signal processing, logistic regression, deep neural networks, reinforcement learning, algorithms, data structures, math background, firmware development, system architecture, validation, cross-functional team collaboration.', 'https://jobs.apple.com/en-us/details/200488665/mixed-signal-ip-machine-learning-engineer'], ['3', 

In [63]:
#validating that all the rows have exactly 5 columns
max([len(r) for r in rows]), min([len(r) for r in rows])

(5, 5)

In [64]:
headers = rows[0]
rows = rows[1:]
print(rows[0])
data = pd.DataFrame(rows,columns=headers)
print(data.shape)
data.head()

['1', 'Apple', 'Machine Learning Software Engineer', 'Here is the comma-separated list of required skills and experience:\n\nbiometrics/face tracking/gaze tracking, modern C++ & Python, algorithm implementation and optimization with platform constraints, computer vision / machine learning product development and delivery, working with sophisticated software stacks, 3 years of professional software development experience.', 'https://jobs.apple.com/en-us/details/200540712/machine-learning-software-engineer']
(165, 5)


Unnamed: 0,Srno,Company,Role,Description,Link
0,1,Apple,Machine Learning Software Engineer,Here is the comma-separated list of required s...,https://jobs.apple.com/en-us/details/200540712...
1,2,Apple,Mixed-Signal IP Machine Learning Engineer,"Machine learning, Python, C, VLSI fundamentals...",https://jobs.apple.com/en-us/details/200488665...
2,3,Apple,Senior Machine Learning Software Engineer,Here is the list of required skills and experi...,https://jobs.apple.com/en-us/details/200493299...
3,4,Apple,Pre-Silicon Machine Learning Compiler Engineer,Here is the list of required skills and experi...,https://jobs.apple.com/en-us/details/200520036...
4,5,Apple,Pre-Silicon Machine Learning Compiler Engineer,Here is the list of required skills and experi...,https://jobs.apple.com/en-us/details/200520040...


In [65]:
# Visualiizing the most common roles in the sample
data['Role'].value_counts()[:10]

Role
Senior Machine Learning Engineer                           3
Machine Learning Intern                                    2
Research Scientist                                         2
Pre-Silicon Machine Learning Compiler Engineer             2
Research Engineer                                          2
Mixed-Signal IP Machine Learning Engineer                  2
Research Engineer, Interpretability                        2
Machine Learning GPU Performance Engineer                  2
Product Engineer, Machine Learning and GPU Accelerators    2
Research Engineer, Knowledge Bases                         2
Name: count, dtype: int64

In [66]:
data.head()

Unnamed: 0,Srno,Company,Role,Description,Link
0,1,Apple,Machine Learning Software Engineer,Here is the comma-separated list of required s...,https://jobs.apple.com/en-us/details/200540712...
1,2,Apple,Mixed-Signal IP Machine Learning Engineer,"Machine learning, Python, C, VLSI fundamentals...",https://jobs.apple.com/en-us/details/200488665...
2,3,Apple,Senior Machine Learning Software Engineer,Here is the list of required skills and experi...,https://jobs.apple.com/en-us/details/200493299...
3,4,Apple,Pre-Silicon Machine Learning Compiler Engineer,Here is the list of required skills and experi...,https://jobs.apple.com/en-us/details/200520036...
4,5,Apple,Pre-Silicon Machine Learning Compiler Engineer,Here is the list of required skills and experi...,https://jobs.apple.com/en-us/details/200520040...


In [67]:
# creating the following job categories: manager, ml engineer, software engineer and scientist using keywords from the job title
import re
def get_category(role):
    category = 'other'
    role = role.lower()
    role = re.sub(' ', '', role)
    if 'lead' in role or 'leader' in role or 'manager' in role:
        category = 'manager'
    elif 'engineer' in role or 'architect' in role:
        category = 'ml engineer' if 'ml' in role or 'learning' in role or 'research' in role else 'software engineer'
    elif 'research' in role or 'scientist' in role or 'science' in role:
        category = 'scientist'
    
    
    return category

data['Category'] = data['Role'].apply(get_category)
data[['Role','Category']].iloc[:20,:]


Unnamed: 0,Role,Category
0,Machine Learning Software Engineer,ml engineer
1,Mixed-Signal IP Machine Learning Engineer,ml engineer
2,Senior Machine Learning Software Engineer,ml engineer
3,Pre-Silicon Machine Learning Compiler Engineer,ml engineer
4,Pre-Silicon Machine Learning Compiler Engineer,ml engineer
5,Mixed-Signal IP Machine Learning Engineer,ml engineer
6,"Senior Machine Learning Engineer, Health",ml engineer
7,"AIML - Sr Machine Learning Engineer, Data & Ma...",ml engineer
8,"AIML - Machine Learning Engineer or Scientist,...",ml engineer
9,Computer Vision and Machine Learning Engineer,ml engineer


In [68]:
#removing stock response of LLM:
def remove_stock_response(description):
    desc = re.sub('Here is the comma-separated list of required skills and experience:\n\n','',description)
    desc = re.sub('Here is the list of required skills and experience:\n\n','',desc)
    desc = re.sub('Here is the list of required skills and experience:\n\n','',desc)
    desc = re.sub('I apologize*','',desc)
    desc = re.sub('\n','',desc)
    return desc

data['Description'] = data['Description'].apply(remove_stock_response)

In [69]:
#processing stopwords
import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [70]:
# to analyze occurance of phrases (skills)

# Top skills overall
def get_skills(description):
    all_skills = {}
    for d in description:
        skills = d.split(',')
        for s in skills:
            s = s.strip().lower()
            if s == '':
                continue
            if s in all_skills:
                all_skills[s] += 1
            elif s not in stopwords:
                all_skills[s] = 1

    all_skills =  [(s,all_skills[s]) for s in all_skills]
    all_skills.sort(key = lambda x : x[1], reverse=True)
    return all_skills

In [71]:
# to analyze occurance of individual words in the job description

# Top skills overall
def get_skills_word(description):
    all_skills = {}
    for d in description:
        skills = re.sub(',',' ',d).lower().split(' ')
        for s in skills:
            if s == '':
                continue
            if s in all_skills:
                all_skills[s] += 1
            elif s not in stopwords and s != '':
                all_skills[s] = 1

    all_skills =  [(s,all_skills[s]) for s in all_skills]
    all_skills.sort(key = lambda x : x[1], reverse=True)
    return all_skills

In [72]:
all_skills = get_skills(data['Description'])
all_skills[:5]

[('machine learning', 31),
 ('python', 23),
 ('computer engineering', 19),
 ("bachelor's degree in computer science", 16),
 ('pytorch', 15)]

In [73]:
#OPTIONAL: Save the skills list to disk

import pickle
all_skills = get_skills(data['Description'])
with open('skills.bin', 'wb') as file:
    pickle.dump(all_skills, file)


In [74]:
all_skills = get_skills_word(data['Description'])
all_skills[:5]

[('experience', 302),
 ('learning', 187),
 ('computer', 139),
 ('skills', 139),
 ('machine', 111)]

## Category-wise skills

In [75]:
all_skills = get_skills(data[data['Category'] == 'ml engineer']['Description'])
all_skills[:10]

[('machine learning', 18),
 ('pytorch', 14),
 ('python', 11),
 ('tensorflow', 8),
 ('deep learning', 7),
 ('jax', 6),
 ('c/c++', 5),
 ('debugging', 5),
 ('java', 5),
 ('communication skills', 5)]

In [76]:
all_skills = get_skills(data[data['Category'] == 'software engineer']['Description'])
all_skills[:10]

[('python', 5),
 ('machine learning', 4),
 ('spark', 3),
 ("bachelor's degree in computer science", 2),
 ('c++', 2),
 ('computer engineering', 2),
 ('computer vision', 2),
 ('deep learning', 2),
 ('benefits', 2),
 ('physics', 2)]

In [77]:
all_skills = get_skills(data[data['Category'] == 'scientist']['Description'])
all_skills[:10]

[('computer engineering', 10),
 ("bachelor's degree in computer science", 8),
 ('or equivalent practical experience', 7),
 ('machine learning', 6),
 ('c++', 6),
 ('python', 5),
 ('relevant technical field', 5),
 ('computer science', 4),
 ('statistics', 4),
 ('or relevant technical field', 4)]

In [78]:
all_skills = get_skills(data[data['Category'] == 'manager']['Description'])
all_skills[:10]

[('machine learning', 3),
 ('data science', 3),
 ('computer science', 3),
 ("bachelor's degree or equivalent practical experience", 2),
 ('python', 2),
 ('ai research', 2),
 ('leadership', 2),
 ('communication', 2),
 ("bachelor's degree in computer science", 2),
 ('computer engineering', 2)]

## Category-wise word skills

In [79]:
all_skills = get_skills_word(data[data['Category'] == 'ml engineer']['Description'])
all_skills[:10]

[('experience', 129),
 ('learning', 104),
 ('machine', 66),
 ('skills', 64),
 ('computer', 60),
 ('science', 43),
 ('years', 39),
 ('job', 33),
 ('python', 31),
 ('development', 30)]

In [80]:
all_skills = get_skills_word(data[data['Category'] == 'software engineer']['Description'])
all_skills[:10]

[('experience', 48),
 ('skills', 32),
 ('job', 24),
 ('learning', 19),
 ('description', 19),
 ('computer', 17),
 ('science', 15),
 ('required', 13),
 ('software', 13),
 ('years', 11)]

In [81]:
all_skills = get_skills_word(data[data['Category'] == 'scientist']['Description'])
all_skills[:10]

[('experience', 77),
 ('computer', 47),
 ('learning', 45),
 ('science', 30),
 ('research', 25),
 ('machine', 24),
 ('field', 21),
 ('python', 18),
 ('skills', 18),
 ('phd', 17)]

In [82]:
all_skills = get_skills_word(data[data['Category'] == 'manager']['Description'])
all_skills[:10]

[('experience', 43),
 ('job', 17),
 ('research', 16),
 ('technical', 15),
 ('learning', 14),
 ('skills', 14),
 ('ai', 13),
 ('description', 12),
 ('machine', 11),
 ('science', 11)]