In [None]:
import json
import pandas as pd
import re
from bs4 import BeautifulSoup

FILENAME = 'techmap-jobs-dump-2021-09.json'

# The following job titles/roles will be considered:
ROLES = ['machine learning', 'data scientist', 'data science', 'deep learning', 'artificial intelligence']

# Since the json file from the Kaggle dataset is huge (50GB), we have to read one json object at a time.
# Reading it this way will help us manage memory better by loading a small amount of data into memory at a time.
valid_jobs_count = 0
jsons_processed = 0
jobs_processed = 0

jobs = []

def get_title(data):
    title = None
    if('position' in data and 'name' in data['position']):
        title = data['position']['name']
    elif ('name' in data):
        title = data['name']
    return title.strip()

def get_company(data):
    company = None
    if ('orgCompany' in data):
        if('nameOrg' in data['orgCompany']):
            company = data['orgCompany']['nameOrg']
        elif('name' in data['orgCompany']):
            company = data['orgCompany']['name']
    return company.strip()

def get_location(data):
    location = None
    cityStateCountry = ""

    # Try 'jobLocation' first.
    if('jobLocation' in data):
        location = data['jobLocation']

    # Try 'location' next
    elif('location' in data):
        location = data['location']

    # If that doesn't exist, try to see if any location exists under 'orgAddress'.
    elif('orgAddress' in data):
        if('addressLine' in data['orgAddress']):
            location = data['orgAddress']['addressLine']
        # If this fails as well, just use the city, state and country of organization.
        if('city' in data['orgAddress']):
            cityStateCountry += data['orgAddress']['city'].strip()
        if('state' in data['orgAddress']):
            cityStateCountry += " " + data['orgAddress']['state'].strip()
        if('country' in data['orgAddress']):
            cityStateCountry += " " + data['orgAddress']['country'].strip()
    
    if(location == None and cityStateCountry != ""):
        location = cityStateCountry

    return location.strip()

def get_salary(data):
    salary = None
    if('salary' in data):
        salary = data['salary']
    return salary

def clean_text(text):
    text = text.replace('\n', ' ')                # remove newline
    text = BeautifulSoup(text, "lxml").get_text() # remove html
    text = text.replace('/', ' ')                 # remove forward slashes
    text = re.sub(r'[^a-zA-Z ^0-9]', '', text)    # letters and numbers only
    text = text.lower()                           # lower case
    text = re.sub(r'(x.[0-9])', '', text)         # remove special characters
    return text

def get_description(data):
    description = 'None'
    if('text' in data):
        description = clean_text(data['text'])
    elif('html' in data):
        description = clean_text(data['html'])
    return description

def get_url(data):
    url = None
    if('url' in data):
        url = data['url']
    return url

with open(FILENAME, 'r') as f:
    for line in f:
        data = json.loads(line)
        jsons_processed += 1
        if(('position' in data and 'name' in data['position']) or ('name' in data)):
            job_title = get_title(data)
            jobs_processed += 1
            if any(title in job_title.lower() for title in ROLES):
                valid_jobs_count = valid_jobs_count + 1
                job = {
                    "title": job_title,
                    "company": get_company(data),
                    "location": get_location(data),
                    "salary": get_salary(data),
                    "description": get_description(data),
                    "url": get_url(data)
                }
                jobs.append(job) 

        # if(jobs_processed == 50000):
        #     break

df = pd.DataFrame(jobs)
df.to_csv('jobs.csv', index=False)

#TODO
# Implement low-salary, high-salary, and counts (freq)
# Most frequently asked skills (in desc order) as requested

In [34]:
import spacy
from spacy.tokenizer import Tokenizer
from collections import Counter

# Initialize the tokenizer
nlp = spacy.load("en_core_web_lg")
tokenizer = Tokenizer(nlp.vocab)
STOP_WORDS = nlp.Defaults.stop_words.union(['year'])

# Tokenizer pipe removing stop words and blank words
tokens = []

for doc in tokenizer.pipe(df['description'], batch_size=500):
    doc_tokens = []
    for token in doc:
        # print('token: ', token) # not empty
        # print('token.lemma: ', token.lemma) # empty
        # print('token.text: ', token.text) # not empty
        if (token.text not in STOP_WORDS) & (token.text != ' '):
            doc_tokens.append(token.text)

    tokens.append(doc_tokens)

df['tokens'] = tokens

# Tech terms list
tech_terms = ['python', 'r', 'sql', 'hadoop', 'spark', 'tableau',
              'hive', 'c', 'c++', 'matlab', 'tensorflow', 'excel',
              'nosql', 'scikit', 'machine learning', 'statistic', 'analysis', 'computer science'
              'ai', 'artificial intelligence', 'dl', 'deep learning', 'nlp', 'natural language processing', 
              'neural network', 'neural networks', 'nn', 'mathematic', 'mathematics' 'database', 
              'sql', 'big data', 'probability', 'api','data science', 'pandas', 
              'numpy', 'bayesian', 'calculus', 'linear algebra', 'statistics',
              'data evaluation', 'data modelling']

df['tokens_filtered'] = df.apply(lambda x: list(set(x['tokens']) & set(tech_terms)), axis=1)
df.drop('tokens', axis = 1,  inplace=True)
# df.head()


# Create a count function
def count(docs):

    word_counts = Counter()
    appears_in = Counter()
        
    total_docs = len(docs)

    for doc in docs:
        word_counts.update(doc)
        appears_in.update(set(doc))

    temp = zip(word_counts.keys(), word_counts.values())
        
    wc = pd.DataFrame(temp, columns = ['word', 'count'])

    wc['rank'] = wc['count'].rank(method='first', ascending=False)
    total = wc['count'].sum()

    wc['pct_total'] = wc['count'].apply(lambda x: x / total)
        
    wc = wc.sort_values(by='rank')
    wc['cul_pct_total'] = wc['pct_total'].cumsum()

    t2 = zip(appears_in.keys(), appears_in.values())
    ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
    wc = ac.merge(wc, on='word')

    wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)
        
    return wc.sort_values(by='rank')


# Load the final dataframe with the approriate data
def populate_df(title, location):
    j_title = df['title'] == title
    j_location = df['location'] == location
    subset_df = df[j_title & j_location]
    subset_df = subset_df.reset_index()
    
    wc = count(subset_df['tokens_filtered'])
    skills = wc['word'][:10]
    
    if subset_df.shape[0] > 0:
        data = {'title': title,
                'location': location,
                'salary': subset_df['salary'],
                'skills': list(skills)}
    else:
        data = {'title': title,
                'location': location,
                'salary': 0,
                'skills': []}
    
    return data

final_df = pd.DataFrame(columns=['title', 'location', 'salary', 'skills'])

locations = df['location']

for location in locations:
    final_df = final_df.append(populate_df('data scientist', location), ignore_index=True)

final_df.head()



  final_df = final_df.append(populate_df('data scientist', location), ignore_index=True)
  final_df = final_df.append(populate_df('data scientist', location), ignore_index=True)
  final_df = final_df.append(populate_df('data scientist', location), ignore_index=True)
  final_df = final_df.append(populate_df('data scientist', location), ignore_index=True)
  final_df = final_df.append(populate_df('data scientist', location), ignore_index=True)
  final_df = final_df.append(populate_df('data scientist', location), ignore_index=True)
  final_df = final_df.append(populate_df('data scientist', location), ignore_index=True)
  final_df = final_df.append(populate_df('data scientist', location), ignore_index=True)
  final_df = final_df.append(populate_df('data scientist', location), ignore_index=True)
  final_df = final_df.append(populate_df('data scientist', location), ignore_index=True)
  final_df = final_df.append(populate_df('data scientist', location), ignore_index=True)
  final_df = final_df

Unnamed: 0,title,location,salary,skills
0,data scientist,"Seattle, WA",0,[]
1,data scientist,"San Jose, CA",0,[]
2,data scientist,"Seattle, WA",0,[]
3,data scientist,"Seattle, WA",0,[]
4,data scientist,"New York, NY",0,[]


In [35]:
final_df.to_csv('jobs2.csv', index=False)