In [149]:
import re
import dateutil
import spacy
import nltk

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm, tqdm_notebook
from datetime import datetime
from dateutil.parser import parse
from nltk.tokenize import word_tokenize
from helpers import download_dataset

In [2]:
# Kaggle page of the job-posts dataset and the local directory it is stored in.
URL = 'https://www.kaggle.com/madhab/jobposts/'
DEST = './data'
# `download_dataset` is the project-local helper imported from `helpers`;
# it receives the URL and the destination directory. Using DEST here and in
# the read below keeps the download target and the CSV path in one place.
download_dataset(URL, DEST)
df = pd.read_csv('%s/%s' % (DEST, 'data job posts.csv'), encoding='utf-8')



In [3]:
def clean(text):
    """Tidy the free-text body of one job-post section.

    Steps, in order:
      1. every CRLF that is not followed by ``-`` becomes a single space
         (whitespace in front of it is kept);
      2. leading/trailing spaces are trimmed and each ``;`` becomes ``.``;
      3. runs of 2+ spaces, or of 3+ whitespace characters, collapse to one
         space;
      4. the text is cut at the first "whitespace followed by two or more
         hyphens" and the remaining head is stripped.
    """
    unwrapped = re.sub(r'([\s]*)(\r\n)([^-])', r'\1 \3', text)
    unwrapped = unwrapped.strip(' ').replace(';', '.')
    squeezed = re.sub(r'([ ]{2,}|[\s]{3,})', ' ', unwrapped)
    # re.split keeps the captured separator, but only the head is needed.
    head = re.split(r'([\s]+[-]{2,})', squeezed)[0]
    return head.strip()

def clean_key(text):
    """Collapse every whitespace run in ``text`` to one space and trim the ends."""
    collapsed = re.sub(r'[\s]+', ' ', text)
    return collapsed.strip()

def make_dict(text):
    """Parse one raw job-post string into a ``{heading: body}`` dictionary.

    Headings are runs of capital letters/whitespace that follow a CRLF and
    end with a colon. The text in front of the first heading is stored,
    unmodified, under ``'COMPANY'``; every heading is normalised with
    ``clean_key`` and its body with ``clean``.
    """
    pieces = re.split(r'\r\n([A-Z\s]+):', text)
    # A single capture group makes re.split return
    # [preamble, heading, body, heading, body, ...].
    info = {'COMPANY': pieces[0]}
    for heading, body in zip(pieces[1::2], pieces[2::2]):
        info[clean_key(heading)] = clean(body)
    return info

In [136]:
# One parsed {heading: text} dictionary per raw posting in the `jobpost` column.
jobpost = list(map(make_dict, df['jobpost']))

In [137]:
# (output column, regex selecting the raw heading columns merged into it).
# Keeping each pair on one line keeps COLS and PATTERNS aligned by position.
_COLUMN_SPECS = [
    ('JOB_TITLE', 'JOB TITLE|TITLE$'),
    ('POSITION_DURATION', 'POSITION DURATION|DURATION$'),
    ('POSITION_LOCATION', 'POSITION LOCATION|LOCATION$'),
    ('JOB_DESCRIPTION', 'JOB DESCRIPTION|DESCRIPTION$'),
    ('JOB_RESPONSIBILITIES', 'JOB RESPONSIBILITIES|RESPONSIBILITIES$'),
    ('REQUIRED_QUALIFICATIONS', 'REQUIRED QUALIFICATIONS|QUALIFICATIONS$'),
    ('REMUNERATION', 'REMUNERATION$'),
    ('APPLICATION_DEADLINE', 'APPLICATION DEADLINE|DEADLINE$'),
    ('ABOUT_COMPANY', 'ABOUT COMPANY'),
]
COLS = [column for column, _ in _COLUMN_SPECS]
PATTERNS = [regex for _, regex in _COLUMN_SPECS]
# Raw frame: one row per posting, one column per distinct heading found by make_dict.
jobpost_df = pd.DataFrame(data=jobpost)

def normalize_df(df, cols=COLS, patterns=PATTERNS):
    """Collapse the raw per-heading columns of ``df`` into a fixed column set.

    For each ``(col, pattern)`` pair, every column of ``df`` whose label
    matches ``pattern`` (selected with ``DataFrame.filter(regex=...)``) is
    concatenated row-wise, separated by one space, and stored under ``col``
    with surrounding spaces stripped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. Missing values are treated as empty strings; the
        caller's frame is not modified.
    cols : list of str
        Names of the output columns.
    patterns : list of str
        One regex per entry of ``cols``, paired by position.

    Returns
    -------
    pandas.DataFrame
        New frame whose columns are ``cols``.

    Raises
    ------
    IndexError
        If a pattern matches no column of ``df``.
    """
    df = df.fillna(value='')
    new_df = pd.DataFrame(columns=cols)
    for col, pattern in zip(cols, patterns):
        selected = df.filter(regex=pattern).columns
        if len(selected) == 0:
            # Same exception type the bare `selected[0]` lookup would raise,
            # but naming the pattern and target column that failed.
            raise IndexError(
                'no column matches pattern %r (needed for %r)' % (pattern, col))
        merged = df[selected[0]].str.cat(df[selected[1:]], sep=' ')
        # Strip each column once, when it is created, instead of re-stripping
        # the whole frame on every loop iteration.
        new_df[col] = merged.str.strip(' ')
    return new_df

In [138]:
# Normalise from the parsed posts (`jobpost`) rather than from `jobpost_df`
# itself: feeding the already-normalised frame back into normalize_df on a
# re-run would fail, because e.g. the pattern 'ABOUT COMPANY' does not match
# the normalised label 'ABOUT_COMPANY'. Rebuilding the raw frame keeps this
# cell idempotent.
jobpost_df = normalize_df(pd.DataFrame(jobpost))
jobpost_df.to_csv('./data/jobpost.csv',
                  encoding='utf-8', index=False)
jobpost_df.head()

Unnamed: 0,JOB_TITLE,POSITION_DURATION,POSITION_LOCATION,JOB_DESCRIPTION,JOB_RESPONSIBILITIES,REQUIRED_QUALIFICATIONS,REMUNERATION,APPLICATION_DEADLINE,ABOUT_COMPANY
0,Chief Financial Officer,,"Yerevan, Armenia",AMERIA Investment Consulting Company is seekin...,- Supervises financial management and administ...,"To perform this job successfully, an individua...",,26 January 2004,
1,Full-time Community Connections Intern (paid i...,3 months,"IREX Armenia Main Office. Yerevan, Armenia",IREX currently seeks to fill the position of a...,- Presenting the CC program to interested part...,- Bachelor's Degree. Master's is preferred.\r\...,Commensurate with experience.,12 January 2004,The International Research & Exchanges Board (...
2,Country Coordinator,Renewable annual contract,"Yerevan, Armenia",Public outreach and strengthening of a growing...,- Working with the Country Director to provide...,"- Degree in environmentally related field, or ...",Salary commensurate with experience.,20 January 2004,The Caucasus Environmental NGO Network is a no...
3,BCC Specialist,,"Manila, Philippines",The LEAD (Local Enhancement and Development fo...,- Identify gaps in knowledge and overseeing in...,"- Advanced degree in public health, social sci...",,23 January 2004,
4,Software Developer,,"Yerevan, Armenia",,- Rendering technical assistance to Database M...,- University degree. economical background is ...,Will be commensurate with the norms accepted i...,"20 January 2004, 18:00",


In [139]:
# Window: the latest year present in the data and the two years before it.
# NOTE(review): `>= max_year - 2` keeps three calendar years (max-2, max-1,
# max); confirm that this is the intended reading of "past 2 years".
max_year = df['Year'].max()
min_year = max_year - 2

recent_posts = df.loc[df['Year'] >= min_year]
# mode() may return several values on a tie; the first one is reported.
top_company = recent_posts['Company'].mode().iloc[0]
print('Company with most job ads in the past 2 years: {}'.format(top_company))

Company with most job ads in the past 2 years: ArmenTel CJSC


In [140]:
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# Most frequent value of the `Month` column; the `- 1` below maps it onto the
# 0-based list (assumes `Month` holds the numbers 1-12).
busiest_month = df['Month'].value_counts().idxmax()
print('Month with largest number of job ads: {}'.format(
    months[busiest_month - 1]))

Month with largest number of job ads: March


In [141]:
# Sample sentence (not referenced by the other cells visible in this notebook).
text = 'I am sitting on a chair, listening to my favourite Axwell'
# spaCy pipeline that `tokenize` applies to each piece of text.
nlp = spacy.load('en_core_web_sm')

In [142]:
def tokenize(text):
    """Run ``text`` through the module-level ``nlp`` pipeline and return its result."""
    doc = nlp(text)
    return doc

def lemmatization(token):
    """Return the ``lemma_`` attribute of ``token``."""
    lemma = token.lemma_
    return lemma

def process(tokens):
    """Join the lemmas of all tokens not flagged ``is_stop`` with single spaces."""
    lemmas = []
    for token in tokens:
        if token.is_stop:
            continue
        lemmas.append(lemmatization(token))
    return ' '.join(lemmas)

def clean_sentence(text):
    """Tokenize ``text`` and return its lemmatised form with stop words removed."""
    return process(tokenize(text))

In [153]:
# Registers `progress_apply` on pandas objects with a notebook progress bar.
# NOTE(review): `tqdm_notebook()` builds a bar instance only to reach
# `.pandas()`; that appears to be what renders the extra one-step bar in the
# cell output.
tqdm_notebook().pandas()


def _split_responsibilities(text):
    """Split a bulleted responsibilities string into cleaned sentences.

    A hyphen counts as a bullet marker only at the start of the text or
    right after whitespace, so hyphenated words ("day-to-day", "full-time")
    stay inside their sentence instead of being cut into separate items.
    Blank fragments are dropped before cleaning, so the result never
    contains entries produced from whitespace-only pieces.
    """
    pieces = re.split(r'(?<!\S)-+', text)
    return [clean_sentence(piece.strip()) for piece in pieces if piece.strip()]


final_df = jobpost_df.copy()
final_df['JOB_RESPONSIBILITIES'] = (
    jobpost_df['JOB_RESPONSIBILITIES'].progress_apply(_split_responsibilities))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=19001), HTML(value='')))

In [160]:
# Postings whose duration text is empty get an explicit placeholder.
no_duration = final_df['POSITION_DURATION'] == ''
final_df.loc[no_duration, 'POSITION_DURATION'] = 'unavailable'
final_df.head()

Unnamed: 0,JOB_TITLE,POSITION_DURATION,POSITION_LOCATION,JOB_DESCRIPTION,JOB_RESPONSIBILITIES,REQUIRED_QUALIFICATIONS,REMUNERATION,APPLICATION_DEADLINE,ABOUT_COMPANY
0,Chief Financial Officer,unavailable,"Yerevan, Armenia",AMERIA Investment Consulting Company is seekin...,[supervise financial management administrative...,"To perform this job successfully, an individua...",,26 January 2004,
1,Full-time Community Connections Intern (paid i...,3 months,"IREX Armenia Main Office. Yerevan, Armenia",IREX currently seeks to fill the position of a...,"[present CC program interested party ., assist...",- Bachelor's Degree. Master's is preferred.\r\...,Commensurate with experience.,12 January 2004,The International Research & Exchanges Board (...
2,Country Coordinator,Renewable annual contract,"Yerevan, Armenia",Public outreach and strengthening of a growing...,[work Country Director provide environmental i...,"- Degree in environmentally related field, or ...",Salary commensurate with experience.,20 January 2004,The Caucasus Environmental NGO Network is a no...
3,BCC Specialist,unavailable,"Manila, Philippines",The LEAD (Local Enhancement and Development fo...,[identify gap knowledge oversee information co...,"- Advanced degree in public health, social sci...",,23 January 2004,
4,Software Developer,unavailable,"Yerevan, Armenia",,[render technical assistance Database Manageme...,- University degree. economical background is ...,Will be commensurate with the norms accepted i...,"20 January 2004, 18:00",


In [161]:
# NOTE(review): JOB_RESPONSIBILITIES holds Python lists at this point, so the
# CSV will contain their text form; confirm downstream readers expect that.
final_df.to_csv(
    './data/jobpost_s3.csv',
    encoding='utf-8',
    index=False,
)