In [1]:
import json
import pandas as pd
import re
from bs4 import BeautifulSoup, NavigableString, Tag


valid_jobs_count = 0
jsons_processed = 0
jobs_processed = 0

jobs = []

def get_title(data):
    title = None
    if('position' in data and 'name' in data['position']):
        title = data['position']['name']
    elif ('name' in data):
        title = data['name']
    return title.strip()

def get_company(data):
    company = None
    if ('orgCompany' in data):
        if('nameOrg' in data['orgCompany']):
            company = data['orgCompany']['nameOrg']
        elif('name' in data['orgCompany']):
            company = data['orgCompany']['name']
    return company.strip()

def get_location(data):
    location = None
    cityStateCountry = ""

    # Try 'jobLocation' first.
    if('jobLocation' in data):
        location = data['jobLocation']

    # Try 'location' next
    elif('location' in data):
        location = data['location']

    # If that doesn't exist, try to see if any location exists under 'orgAddress'.
    elif('orgAddress' in data):
        if('addressLine' in data['orgAddress']):
            location = data['orgAddress']['addressLine']
        # If this fails as well, just use the city, state and country of organization.
        if('city' in data['orgAddress']):
            cityStateCountry += data['orgAddress']['city'].strip()
        if('state' in data['orgAddress']):
            cityStateCountry += " " + data['orgAddress']['state'].strip()
        if('country' in data['orgAddress']):
            cityStateCountry += " " + data['orgAddress']['country'].strip()
    
    if(location == None and cityStateCountry != ""):
        location = cityStateCountry

    return location.strip()



def clean_text(text):
    text = text.replace('\n', ' ')                # remove newline
    text = BeautifulSoup(text, "lxml").get_text() # remove html
    text = text.replace('/', ' ')                 # remove forward slashes
    text = re.sub(r'[^a-zA-Z ^0-9]', '', text)    # letters and numbers only
    text = text.lower()                           # lower case
    text = re.sub(r'(x.[0-9])', '', text)         # remove special characters
    return text

def get_description(data):
    description = 'None'
    if('text' in data):
        description = clean_text(data['text'])
    elif('html' in data):
        description = clean_text(data['html'])
    return description


def get_skillsandresposibilitites(data):

    skills_List=['skills','Expertise','Strong understanding']
    jobdesc_list=['Job Description']
    resposibility_List=['PRIMARY RESPONSIBILITIES','Responsibilities']
    if 'html' in data:
        # print(f"Raw Data: {data['html']}")
        soup = BeautifulSoup(data['html'],"html.parser")
        skill_text = ''
        resposibility_text=''
        jobdescription_text=''
        for child in soup.find_all('p'):
            if isinstance(child, NavigableString):
                if any(t in child.text for t in skills_List):
                    skill_text += child.text.strip()                        
           

            elif isinstance(child, Tag):
                # if child.name != 'br':
                if any(t in child.text for t in skills_List):
                    skill_text += child.text.strip()                        
           

        return skill_text

def get_responsibilies(data):

    resposibility_List=['PRIMARY RESPONSIBILITIES','Responsibilities']
    if 'html' in data:
        # print(f"Raw Data: {data['html']}")
        soup = BeautifulSoup(data['html'],"html.parser")
 
        resposibility_text=''
       
        for child in soup.find_all('p'):
            if isinstance(child, NavigableString):
                                       
                if any(t in child.text for t in resposibility_List):
                    resposibility_text = child.next_sibling.text
                
            elif isinstance(child, Tag):
                # if child.name != 'br':
                                      
                if any(t in child.text for t in resposibility_List):
                    if child.next_sibling is not None:
                        resposibility_text = child.next_sibling.text

                        return resposibility_text       
                
def get_url(data):
    url = None
    if('url' in data):
        url = data['url']
    return url

if __name__ =="__main__":

    FILENAME = 'techmap-jobs-dump-2021-09.json'

    # The following job titles/roles will be considered:
    ROLES = ['machine learning', 'data scientist', 'data science', 'deep learning', 'artificial intelligence']

    # Since the json file from the Kaggle dataset is huge (50GB), we have to read one json object at a time.
    # Reading it this way will help us manage memory better by loading a small amount of data into memory at a time.
    with open(FILENAME, 'r') as f:
        for line in f:
            data = json.loads(line)
            if(('position' in data and 'name' in data['position']) or ('name' in data)):
                job_title = get_title(data)
                jobs_processed += 1
                if any(title in job_title.lower() for title in ROLES):
                    valid_jobs_count = valid_jobs_count + 1
                    job = {
                        "title": job_title,
                        "company": get_company(data),
                        "location": get_location(data),
                        "description": get_description(data),
                        "Skills": get_skillsandresposibilitites(data),
                        "Responsibilities":get_responsibilies(data),
                        "url": get_url(data)
                    }
                    jobs.append(job) 



    df = pd.DataFrame(jobs)
    df.to_csv('jobs.csv', index=False)

#TODO
# Implement low-salary, high-salary, and counts (freq)
# Most frequently asked skills (in desc order) as requested

  text = BeautifulSoup(text, "lxml").get_text() # remove html


In [3]:
import spacy
from spacy.tokenizer import Tokenizer
from collections import Counter


nlp = spacy.load("en_core_web_lg")

# Initialize the tokenizer
tokenizer = Tokenizer(nlp.vocab)
STOP_WORDS = nlp.Defaults.stop_words

# Tokenizer pipe removing stop words and blank words
tokens = []

for doc in tokenizer.pipe(df['description'], batch_size=500):
    doc_tokens = []
    for token in doc:
      
        if (token.text not in STOP_WORDS) & (token.text != ' '):
            doc_tokens.append(token.text)

    tokens.append(doc_tokens)

df['tokens'] = tokens

# Tech terms list
tech_terms = ['python', 'r', 'sql', 'hadoop', 'spark', 'tableau','crm','analytical','programming',              
                'hive', 'c','java', 'c++', 'matlab', 'tensorflow', 'excel','problem','solving'
               'nosql', 'scikit','ml', 'communication','statistic',
               'ai', 'artificial', 'intelligence', 'dl', 'nlp',                 
                'nn', 'mathematic', 'mathematics' 'database',                 
               'big data', 'probability', 'api', 'pandas',                
               'numpy', 'bayesian', 'calculus', 'linear', 'algebra', 'statistics','probability',               
                'algorithms','agile','aws','microsoft','azure','cloud','apache']

df['tokens_filtered'] = df.apply(lambda x: list(set(x['tokens']) & set(tech_terms)), axis=1)
df.drop('tokens', axis = 1,  inplace=True)
df.to_csv('jobs3.csv')

df3 = pd.DataFrame()
for index,row in df.iterrows():
    if not "[]" in row['tokens_filtered']:
        df3 = df3.append(pd.Series(row),ignore_index=True)
       
df3 = df3.dropna(subset=['location'])

def populate_df(title, location):
    j_title = df3['title'] == title #Returns True if match
    j_location = df3['location'] == location #Returns True if match
    subset_df = df3[j_title & j_location]
    subset_df = subset_df.reset_index()
    subset_df['tokens_filtered'] = subset_df['tokens_filtered'].astype('str')
    x = subset_df['tokens_filtered'].str.split()
    skills_list=[]
    if not x.empty:
        for y in x[0]:
            y = y.strip()
            y = re.sub(r'[^a-zA-Z]', '', y) 
            skills_list.append(y)


    word_counts = Counter(skills_list)

    return word_counts,skills_list

if __name__=="__main__":

    final_df = pd.DataFrame(columns=['Title', 'Location', 'Skills'])

    appears_in = Counter()
    for index,row in df3.iterrows():
        word_counts,skills_list = populate_df(row['title'],row['location'])
        appears_in.update(word_counts)

        data_df = {'Title': row['title'],
            'Location': row['location'],
            'Skills': skills_list}
        final_df = final_df.append(data_df, ignore_index=True)
    

    temp = zip(appears_in.keys(), appears_in.values())
    wc = pd.DataFrame(temp, columns = ['word', 'count'])
    wc['rank'] = wc['count'].rank(method='first', ascending=False)
    wc = wc.sort_values(by='rank')
    wc.to_csv('PrioritySkills.csv',index=False)
    
    final_df.head()
    final_df.to_csv('Job_Skills.csv',index=False)


  df3 = df3.append(pd.Series(row),ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_df, ignore_index=True)
  final_df = final_df.append(data_d

In [16]:
final_df.to_csv('jobs2.csv', index=False)