In [1]:
import os
import re
import spacy
import nltk

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm, tqdm_notebook
from datetime import datetime
from nltk.tokenize import word_tokenize
from helpers import download_dataset, S3Client

# Register tqdm's `progress_apply` on pandas objects; it is used later in the
# notebook when cleaning the JOB_RESPONSIBILITIES column.
# NOTE(review): this instantiates a progress bar just to call .pandas() on it,
# which is presumably why an idle progress widget is rendered under this cell.
tqdm_notebook().pandas()



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [2]:
# Kaggle page of the job-posts dataset and the local directory it is stored in.
URL = 'https://www.kaggle.com/rishisankineni/text-similarity/data'
DEST = './data'
# File name of the CSV inside DEST.
DATASET_FILE = 'data job posts.csv'

# Both the download target and the read path go through DEST, so changing the
# data directory only needs a single edit.
download_dataset(URL, DEST)
df = pd.read_csv(os.path.join(DEST, DATASET_FILE), encoding='utf-8')



# Cleaning
1. Text cleaning functions
2. Make dict function

In [3]:
def clean(text):
    """Tidy the text of one job-post section.

    Steps, in order:
      1. every CRLF that is not followed by '-' is replaced by a space
         (whitespace in front of it is kept), so dash-prefixed lines keep
         their line break
      2. surrounding spaces are trimmed and every ';' becomes '.'
      3. runs of 2+ spaces, or of 3+ whitespace characters, collapse to one
         space
      4. everything from the first "whitespace followed by two or more
         dashes" onwards is dropped, and the remainder is stripped
    """
    line_break = r'([\s]*)(\r\n)([^-])'
    blank_run = r'([ ]{2,}|[\s]{3,})'
    # separator that trails every job post
    footer = r'([\s]+[-]{2,})'

    joined = re.sub(line_break, r'\1 \3', text)
    joined = joined.strip(' ')
    joined = joined.replace(';', '.')
    squeezed = re.sub(blank_run, ' ', joined)
    # re.split keeps the text before the first footer match at index 0
    body = re.split(footer, squeezed)[0]
    return body.strip()

def clean_key(text):
    """Normalise a section heading: whitespace runs become one space, ends are trimmed."""
    collapsed = re.sub(r'[\s]+', ' ', text)
    return collapsed.strip()

def make_dict(text):
    """Break a raw job post into a ``{heading: section text}`` dict.

    A heading is a run of upper-case letters and whitespace that starts right
    after a CRLF and ends with a colon. The text in front of the first heading
    is stored, unmodified, under the key 'COMPANY'. Each heading is normalised
    with ``clean_key`` and its section text with ``clean``; when a heading
    occurs more than once, the last occurrence wins.
    """
    heading = r'\r\n([A-Z\s]+):'
    # With one capture group re.split yields
    # [preamble, heading1, body1, heading2, body2, ...]
    parts = re.split(heading, text)
    info = {'COMPANY': parts[0]}
    for raw_key, body in zip(parts[1::2], parts[2::2]):
        info[clean_key(raw_key)] = clean(body)
    return info

# Make a new Dictionary for Jobpost
Loop through the `jobpost` column and build one dict per posting using `make_dict()`.

In [4]:
# One dict per posting: 'COMPANY' (the text before the first heading) plus one
# entry for every section heading that make_dict() finds in the post.
jobpost = [make_dict(job) for job in df.jobpost]

In [5]:
# Each output column paired with the regex that picks the raw section
# headings feeding it. Keeping the pairs side by side guarantees that COLS
# and PATTERNS stay the same length and in the same order.
_COLUMN_PATTERNS = [
    ('JOB_TITLE', 'JOB TITLE|TITLE$'),
    ('POSITION_DURATION', 'POSITION DURATION|DURATION$'),
    ('POSITION_LOCATION', 'POSITION LOCATION|LOCATION$'),
    ('JOB_DESCRIPTION', 'JOB DESCRIPTION|DESCRIPTION$'),
    ('JOB_RESPONSIBILITIES', 'JOB RESPONSIBILITIES|RESPONSIBILITIES$'),
    ('REQUIRED_QUALIFICATIONS', 'REQUIRED QUALIFICATIONS|QUALIFICATIONS$'),
    ('REMUNERATION', 'REMUNERATION$'),
    ('APPLICATION_DEADLINE', 'APPLICATION DEADLINE|DEADLINE$'),
    ('ABOUT_COMPANY', 'ABOUT COMPANY'),
]

# columns required for jobpost df
COLS = [pair[0] for pair in _COLUMN_PATTERNS]

# re pattern for normalizing jobpost df columns
PATTERNS = [pair[1] for pair in _COLUMN_PATTERNS]

# create df for jobpost dict: one row per posting and one column per distinct
# heading; a heading that a posting does not contain is NaN in that row
jobpost_df = pd.DataFrame(jobpost)

def normalize_df(df, cols=None, patterns=None):
    """Collapse similarly named raw columns into one canonical column each.

    For every ``(col, pattern)`` pair, the columns of ``df`` whose names match
    ``pattern`` (``DataFrame.filter(regex=...)``, i.e. ``re.search`` on the
    column name) are joined row-wise with a single space, and the joined text
    is stripped of leading/trailing spaces.

    Args:
        df: frame with one column per raw section heading. Missing values are
            treated as empty strings; ``df`` itself is not modified.
        cols: output column names. Defaults to the module-level ``COLS``.
        patterns: regexes paired positionally with ``cols``. Defaults to the
            module-level ``PATTERNS``.

    Returns:
        A new DataFrame whose columns are ``cols``.

    Raises:
        IndexError: if a pattern matches no column of ``df``. The message names
            the pattern and the output column it was needed for.
    """
    # Resolve the defaults at call time so the function always sees the
    # current module-level lists.
    if cols is None:
        cols = COLS
    if patterns is None:
        patterns = PATTERNS

    filled = df.fillna(value='')
    merged = {}
    # normalizing column names with similar meanings
    for col, pattern in zip(cols, patterns):
        selected = filled.filter(regex=pattern).columns
        if len(selected) == 0:
            raise IndexError('no column of df matches pattern %r (needed for %r)'
                             % (pattern, col))
        joined = filled[selected[0]].str.cat(filled[selected[1:]], sep=' ')
        # Strip each column once, right after it is built, instead of
        # re-stripping the whole frame on every loop iteration.
        merged[col] = joined.str.strip(' ')
    return pd.DataFrame(merged, columns=list(cols))

# Update new jobpost_df with cleaned and normalized columns

In [6]:
# Normalize from the parsed posts (`jobpost`) rather than from `jobpost_df`
# itself. Rebinding `jobpost_df` to its own normalized output made this cell
# fail on a second run, because the normalized names (e.g. 'ABOUT_COMPANY')
# no longer match the raw-heading patterns (e.g. 'ABOUT COMPANY').
jobpost_df = normalize_df(pd.DataFrame(jobpost))
# backup
jobpost_df.to_csv('./data/jobpost.csv',
                  encoding='utf-8', index=False)
jobpost_df.head()

Unnamed: 0,JOB_TITLE,POSITION_DURATION,POSITION_LOCATION,JOB_DESCRIPTION,JOB_RESPONSIBILITIES,REQUIRED_QUALIFICATIONS,REMUNERATION,APPLICATION_DEADLINE,ABOUT_COMPANY
0,Chief Financial Officer,,"Yerevan, Armenia",AMERIA Investment Consulting Company is seekin...,- Supervises financial management and administ...,"To perform this job successfully, an individua...",,26 January 2004,
1,Full-time Community Connections Intern (paid i...,3 months,"IREX Armenia Main Office. Yerevan, Armenia",IREX currently seeks to fill the position of a...,- Presenting the CC program to interested part...,- Bachelor's Degree. Master's is preferred.\r\...,Commensurate with experience.,12 January 2004,The International Research & Exchanges Board (...
2,Country Coordinator,Renewable annual contract,"Yerevan, Armenia",Public outreach and strengthening of a growing...,- Working with the Country Director to provide...,"- Degree in environmentally related field, or ...",Salary commensurate with experience.,20 January 2004,The Caucasus Environmental NGO Network is a no...
3,BCC Specialist,,"Manila, Philippines",The LEAD (Local Enhancement and Development fo...,- Identify gaps in knowledge and overseeing in...,"- Advanced degree in public health, social sci...",,23 January 2004,
4,Software Developer,,"Yerevan, Armenia",,- Rendering technical assistance to Database M...,- University degree. economical background is ...,Will be commensurate with the norms accepted i...,"20 January 2004, 18:00",


# Retrieve the latest/ past two years
1. Get max year
2. Make min year to be 2 years before max year
3. Collect company name with most ads in past 2 years

In [7]:
# Window of interest: postings whose Year is at most 2 below the latest Year.
# NOTE(review): the comparison is inclusive, so the window can span three
# distinct Year values (max-2, max-1, max) — confirm that is the intended
# meaning of "past 2 years".
max_year = df['Year'].max()
min_year = max_year - 2

recent_companies = df.loc[df['Year'] >= min_year, 'Company']
# mode() returns the most frequent value(s); the first one is reported.
top_company = recent_companies.mode().values[0]
print('Company with most job ads in the past 2 years: {}'.format(top_company))

Company with most job ads in the past 2 years: ArmenTel CJSC


# Most Job Ads by Months
1. Separate df into months
2. Calculate max number of ads counts
3. Retrieve df index of the max

In [8]:
# Month names at indices 0-11; the lookup below subtracts 1, i.e. it treats
# df.Month as a 1-based month number.
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

ads_per_month = df['Month'].value_counts()
busiest_month = ads_per_month.idxmax()
print('Month with largest number of job ads: {}'.format(months[busiest_month - 1]))

Month with largest number of job ads: March


# Cleaning Job Responsibilities Column
1. Initialize spaCy nlp model
2. Define functions for tokenization, lemmatization and stop-word removal
3. Then clean the JOB_RESPONSIBILITIES column

In [9]:
# Load spaCy's 'en_core_web_sm' pipeline once; `nlp` is the callable that
# tokenize() applies to each piece of text.
nlp = spacy.load('en_core_web_sm')

In [10]:
def tokenize(text):
    """Run the module-level ``nlp`` pipeline over ``text`` and return its result."""
    doc = nlp(text)
    return doc

def lemmatization(token):
    """Return the ``lemma_`` attribute of ``token``."""
    return token.lemma_

def process(tokens):
    """Join the lemmas of all tokens whose ``is_stop`` is falsy, separated by single spaces."""
    lemmas = []
    for token in tokens:
        if token.is_stop:
            continue
        lemmas.append(lemmatization(token))
    return ' '.join(lemmas)

def clean_sentence(text):
    """Tokenize ``text`` and return its lemmas, without stop words, as one string."""
    return process(tokenize(text))

In [11]:
def split_responsibilities(text):
    """Split a responsibilities section into its individual bullet items.

    A bullet marker is a run of dashes at the start of the text or directly
    after whitespace. Dashes inside a word ("day-to-day") are not markers, so
    hyphenated words stay intact. Items that are empty or whitespace-only are
    dropped, and each remaining item is stripped.
    """
    # (?<!\S) only allows a match at the start of the text or after whitespace.
    items = re.split(r'(?<!\S)-+\s*', text)
    return [item.strip() for item in items if item.strip()]


final_df = jobpost_df.copy()
# Each section becomes a list with one cleaned string per bullet item.
final_df['JOB_RESPONSIBILITIES'] = jobpost_df['JOB_RESPONSIBILITIES'].progress_apply(
    lambda section: [clean_sentence(item)
                     for item in split_responsibilities(section)])

HBox(children=(IntProgress(value=0, max=19001), HTML(value='')))




In [12]:
# Postings with an empty duration get an explicit 'unavailable' marker.
missing_duration = final_df['POSITION_DURATION'] == ''
final_df.loc[missing_duration, 'POSITION_DURATION'] = 'unavailable'
final_df.head()

Unnamed: 0,JOB_TITLE,POSITION_DURATION,POSITION_LOCATION,JOB_DESCRIPTION,JOB_RESPONSIBILITIES,REQUIRED_QUALIFICATIONS,REMUNERATION,APPLICATION_DEADLINE,ABOUT_COMPANY
0,Chief Financial Officer,unavailable,"Yerevan, Armenia",AMERIA Investment Consulting Company is seekin...,[supervise financial management administrative...,"To perform this job successfully, an individua...",,26 January 2004,
1,Full-time Community Connections Intern (paid i...,3 months,"IREX Armenia Main Office. Yerevan, Armenia",IREX currently seeks to fill the position of a...,"[present CC program interested party ., assist...",- Bachelor's Degree. Master's is preferred.\r\...,Commensurate with experience.,12 January 2004,The International Research & Exchanges Board (...
2,Country Coordinator,Renewable annual contract,"Yerevan, Armenia",Public outreach and strengthening of a growing...,[work Country Director provide environmental i...,"- Degree in environmentally related field, or ...",Salary commensurate with experience.,20 January 2004,The Caucasus Environmental NGO Network is a no...
3,BCC Specialist,unavailable,"Manila, Philippines",The LEAD (Local Enhancement and Development fo...,[identify gap knowledge oversee information co...,"- Advanced degree in public health, social sci...",,23 January 2004,
4,Software Developer,unavailable,"Yerevan, Armenia",,[render technical assistance Database Manageme...,- University degree. economical background is ...,Will be commensurate with the norms accepted i...,"20 January 2004, 18:00",


In [13]:
# backup: write the processed frame to disk; this is the file that the upload
# cell sends to S3. JOB_RESPONSIBILITIES holds Python lists at this point, so
# the CSV stores each list as its string representation.
final_df.to_csv('./data/jobpost_s3.csv', index=False, encoding='utf-8')

In [14]:
# Read the AWS credentials from the environment so they never appear in the
# notebook. os.environ.get returns None for an unset variable.
# NOTE(review): how S3Client handles a None key/secret is not visible here —
# confirm it fails clearly or falls back to another credential source.
key = os.environ.get('AWS_ACCESS_KEY_ID')
secret = os.environ.get('AWS_SECRET_ACCESS_KEY')
# Local path of the CSV written by the backup cell.
file = './data/jobpost_s3.csv'
s3 = S3Client(key, secret)

In [17]:
# Upload the processed CSV to the 'seekk' bucket and report the outcome.
response = s3.upload_file(bucket='seekk', src=file, dest='data/jobpost_s3.csv')

# A truthy response is treated as success.
status_template = '%s uploaded successfully!' if response else '%s failed to upload!'
print(status_template % file)

./data/jobpost_s3.csv uploaded successfully!
