# Get Clean Merged and Seperate PL Data

In [1]:
import os, json
import numpy as np
from collections import Counter, defaultdict
from pl_module import get_pl_keywords, all_pls

In [2]:
# ['jobID', 'jobTitle', 'jobEmployer', 'jobLocation', 'jobPostTime', 'skills', 'employmentType', 'baseSalary', 'jobDescription', 'url']
def read_dir_json(raw_dir, extract_entry='jobDescription', limit=None):
    '''
    iterate all files under this dir path and retrieve each jobDescription
    
    return { file: [[], [], []] }, each file (field) and its posts' jobDescription
    '''
    files_content = defaultdict(lambda: [])

    for i, file in enumerate(os.listdir(raw_dir)):
        if file.startswith("."): continue # skip .ipynb_checkpoints/

        content = json.loads(open(raw_dir + file).read())
        all_posts = sum(content.values(), []) # concat all posts (dict.values())
        all_posts = all_posts[:limit] if limit else all_posts
        
        print(file, len(all_posts))
        for post in all_posts:
            keywords = list(map(lambda el: el.replace(' ', '_'), get_pl_keywords(post[extract_entry])))
            files_content[file].append(keywords) # 保留每個領域的每個 post 結構及順序 { field: [[], [], []] }
            
    return files_content

def to_data(raw):
    X, y, labels = [], [], {}
    for i, field in enumerate(raw):
        labels[i] = field
        for each in raw[field]:
            X.append(each)
            y.append(i)
    return X, y, labels

# Convert to Vector

In [3]:
from pyfasttext import FastText
model = FastText('../wiki.en.bin')

def get_ctr(X, y):
    total_ctr, fields_ctr = Counter(), defaultdict(lambda: Counter())
    for x, label in zip(X, y):
        ctr = Counter(x)
        fields_ctr[label] += ctr
        total_ctr += ctr
    return total_ctr, fields_ctr

# Seperate Data, Train and Predict

In [4]:
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split

def split_data(X, y, ratio=0.2):
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2)

    print("Training data length: {}, Test data length: {}".format(len(train_X), len(test_X)))
    
    return train_X, test_X, train_y, test_y

def train_and_predict(train_X, test_X, train_y, test_y):
    # 建立 SVC 模型
    svc = svm.SVC()
    temp_X = list(map(lambda el: el["vec"], train_X))
    svc_fit = svc.fit(temp_X, train_y)

    # 預測
    temp_X = list(map(lambda el: el["vec"], test_X))
    _y = svc.predict(temp_X)

    # 準確
    acc = metrics.accuracy_score(test_y, _y)
    print(acc)
    
    return _y

# Core Model

In [10]:
def core(X, y, total_ctr, fields_ctr):
    DIM = 300
    
    def to_vec(tokens, label):    
        vec = np.zeros(DIM)
        tokens = set(tokens) ###
        for i, t in enumerate(tokens):
            if t == '': continue
#             if t not in total_ctr: 
#                 print(t)
#                 continue ###

            ### model core
            vec += model.get_numpy_vector(t)
#             vec += (model.get_numpy_vector(t) * fields_ctr[label][t] / total_ctr[t])
        return vec
    
    return list(map(lambda pair: {"vec": to_vec(pair[0], pair[1]), "src": pair[0]}, zip(X, y)))

# Start Experiment

In [6]:
Raw_dir = "./Raw_Data/no_filter/"
Raw_filter_dir = "./Raw_Data/filter/"
Career_dir = "./Raw_Data/career_builder/"

Data_dir = "./Data/"

In [11]:
# filter or no_filter 自切自測
raw_contents = read_dir_json(Raw_dir)
X, y, labels = to_data(raw_contents)

train_X, test_X, train_y, test_y = split_data(X, y)

total_ctr, fields_ctr = get_ctr(train_X, train_y)

vec_train_X = core(train_X, train_y, total_ctr, fields_ctr)
vec_test_X = core(test_X, test_y, total_ctr, fields_ctr)

_y = train_and_predict(vec_train_X, vec_test_X, train_y, test_y)

Backend.all.json 3300
Front-End-Developer.nofilter.NY.json 3630
android.2294.version3.json 2294
security.2622.version2.json 2622
Training data length: 9476, Test data length: 2370
0.737130801688


In [8]:
# 同時用 filter and no_filter
raw_filter_contents = read_dir_json(Raw_filter_dir)
raw_contents = read_dir_json(Raw_dir)

train_X, train_y, labels = to_data(raw_contents)
test_X, test_y, labels = to_data(raw_filter_contents)
total_ctr, fields_ctr = get_ctr(train_X, train_y)
 
# 0.0743710957285 raw filter
# 0.0160230073952
vec_train_X = core(train_X, train_y, total_ctr, fields_ctr)
vec_test_X = core(test_X, test_y, total_ctr, fields_ctr)

_y = train_and_predict(vec_train_X, vec_test_X, train_y, test_y)

Security.filter.jobtitle.withid.json 776
andriod.filter.jobtitle.withid.json 433
Backend.filter.jobtitle.withid.json 394
Frontend.filter.jobtitle.withid.json 831
Backend.all.json 3300
Front-End-Developer.nofilter.NY.json 3630
android.2294.version3.json 2294
security.2622.version2.json 2622
thymeleaf
Drupal_Commerce
sprite
Craft_CMS
react-redux
Webix
Unbounce
styled_components
sprite
jekyll
Sitefinity
thymeleaf
screenshot
Bulma
thymeleaf
Quill
Kentico_CMS
react-redux
Chartjs
styled_components
Craft_CMS
Webix
styled_components
IBM_Coremetrics
Marionette.js
thymeleaf
0.0160230073952


In [None]:
# 用 career builder data 自切自測
raw_contents = read_dir_json(Career_dir, 'job_Description')
X, y, labels = to_data(raw_contents)

train_X, test_X, train_y, test_y = split_data(X, y)

total_ctr, fields_ctr = get_ctr(train_X, train_y)

vec_train_X = core(train_X, train_y, total_ctr, fields_ctr)
vec_test_X = core(test_X, test_y, total_ctr, fields_ctr)

_y = train_and_predict(vec_train_X, vec_test_X, train_y, test_y)

In [None]:
# 同時用 career builder and no_filter
raw_contents1 = read_dir_json(Career_dir, 'job_Description')
raw_contents2 = read_dir_json(Raw_filter_dir)

train_X, train_y, labels = to_data(raw_contents1)
test_X, test_y, labels = to_data(raw_contents2)
total_ctr, fields_ctr = get_ctr(train_X, train_y)

vec_train_X = core(train_X, train_y, total_ctr, fields_ctr)
vec_test_X = core(test_X, test_y, total_ctr, fields_ctr)

_y = train_and_predict(vec_train_X, vec_test_X, train_y, test_y)

# Model Rank
1. 單純將所有 keywords vec 相加：0.84
2. 各自 post 內的 keywords 做比例分配當作 weight ，再做相加：0.82~0.79 // 理論上和第一個方法同個概念
3. 各自 post 內的 keywords 做統計後以 pagerank x^-4/3 做計算： 0.84~0.82

### 不考慮 keywords 出現次數： keyword vector * (field_ctr[word] / total_ctr[word])
4. data: filter，並切 20%，0.98
5. data: no_filter，並切 20%，0.89

6. train: filter,    test: no_filter，0.36
7. train: no_filter, test: filter，0.27

8. data: career_builder，並切 20%，七個領域，0.82
9. data: career_builder，並切 20%，七個領域取同樣數量，0.85

10. train: career,    test: no_filter，0.33
11. train: career,    test: filter，0.23

In [None]:
def get_name(string):
    return string.split(".")[0]

for i, val in enumerate(_y):
    if val == test_y[i]: continue

    print("Answer: {}, Predict: {}\n{}\n".format(get_name(labels[test_y[i]]), get_name(labels[val]), vec_test_X[i]["src"]))