In [1]:
from os import listdir
from collections import Counter, defaultdict
from functools import reduce
import json
import math
import operator
import numpy as np

from pyfasttext import FastText
model = FastText('../wiki.en.bin')

from sklearn.model_selection import train_test_split
from sklearn import feature_extraction, svm, metrics
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [43]:
score_file = 'tfidf_score_filter.json'
train = 'filter_Dice'
test = 'no_filter_Dice'

TRAIN_PATH = '/home/fun/Atos/new_Steeve_data/' + train + '/can/'
TEST_PATH = '/home/fun/Atos/new_Steeve_data/' + test + '/can/'


In [44]:
ls '/home/fun/Atos/new_Steeve_data/CareerBuilder/can/'

Keywordsdatabase-administrator.txt  Keywordsproduct-manager.txt
Keywordsjava-developer.txt          Keywordssystems-analyst.txt
Keywordsnetwork-technician.txt      Keywordsweb-developer.txt


In [45]:
def get_tfidf(doc):
    vectorizer  = CountVectorizer(min_df=1, token_pattern=r'[\w\+\.#-]+')  # 該類會將文本中的詞語轉換為詞頻矩陣，矩陣元素a[i][j] 表示j詞在i類文本下的詞頻  
    transformer = TfidfTransformer()                                       # 該類會統計每個詞語的tf-idf權值  
    tfidf = transformer.fit_transform(vectorizer.fit_transform(doc))       # 第一個fit_transform是計算tf-idf，第二個fit_transform是將文本轉為詞頻矩陣  
    
    words   = vectorizer.get_feature_names()                                # 獲取詞袋模型中的所有詞語  
    weight = tfidf.toarray()                                               # 將tf-idf矩陣抽取出來，元素a[i][j]表示j詞在i類文本中的tf-idf權重  
    
    tfidf_score = defaultdict(lambda: defaultdict())
    for i in range(len(weight)):                                           # 打印每類文本的tf-idf詞語權重，第一個for遍歷所有文本，第二個for便利某一類文本下的詞語權重  
#         print("-------這裡輸出第",i,u"類文本的詞語tf-idf權重------")
        for j in range(len(words)):
            tfidf_score[i][words[j]] = weight[i][j]
#             tfidf_score[index_to_field[i]][words[j]] = weight[i][j]
#             print(words[j], weight[i][j])
            
#     with open(score_file, 'w', encoding='utf8') as ws:
#         ws.write(json.dumps(tfidf_score))
    return tfidf_score, words

def norm_pl(pl):
    return pl.lower().replace(' ', '_')

In [50]:
def get_data(directory, limit=None):
    X, y, index_to_field, field_to_index = [], [], {}, {}
    for i, file in enumerate(listdir(directory)):
        if file.startswith("."): continue
            
        doc = json.loads(open(directory + file, 'r', encoding='utf8').read())
        posts = sum(doc.values(), [])
        posts = posts[:limit] if limit else posts
        print(file, "posts length: {}".format(len(posts)))
        
        index_to_field[i], field_to_index[file] = file, i
        for post in posts:
            X.append([norm_pl(p) for p in post['PL']])
            y.append(i)
    return X, y, index_to_field, field_to_index

def get_doc(X, y): # for tfidf
    doc = defaultdict(lambda: [])
    for pls, field in zip(X, y):
        doc[field] += pls
    
    doc = [' '.join(doc[i]) for i in range(len(doc))]
    return doc
        
def tfidf_predict(test_X):
    _y, unexist = [], Counter()
    for i, post in enumerate(test_X):
        _y.append(get_possible_field(post, unexist))
    print(unexist)
    return _y

def get_possible_field(post, unexist=Counter()):
    scores = defaultdict(lambda: 0.0)
    for pl in post:
        pl = norm_pl(pl)
        if pl not in words:
            unexist[pl] += 1
            continue
                
        for field, score_table in tfidf_score.items():
            scores[field] += score_table[pl]
                
    try:
        index = max(scores.items(), key=operator.itemgetter(1))[0]
        return index
    except:
        return 0 # 隨機猜
    

train_X, train_y, index_to_field, field_to_index = get_data(TRAIN_PATH)
tfidf_score, words = get_tfidf(get_doc(train_X, train_y))
test_X, test_y, _, _ = get_data(TEST_PATH)
_y = tfidf_predict(test_X)

print(metrics.accuracy_score(test_y, _y))
print(metrics.classification_report(test_y, _y))

Keywordsandroid.txt posts length: 433
Keywordsbackend.txt posts length: 394
Keywordsfrontend.txt posts length: 831
Keywordssecurity.txt posts length: 776
Keywordsandroid.txt posts length: 1827
Keywordsbackend.txt posts length: 2886
Keywordsfrontend.txt posts length: 3325
Keywordssecurity.txt posts length: 3395
Counter({'matlab': 87, 'conditional': 78, 'knockout.js': 55, 'swing': 44, 'vhdl': 43, 'jax-ws': 41, 'gwt': 40, 'labview': 31, 'odata': 30, 'episerver': 30, 'msbuild': 28, 'comscore': 27, 'xpath': 26, 'jaxb': 26, 'silverstripe': 26, 'dynatrace': 25, 'plsql': 24, 'blade': 21, 'tensorflow': 20, 'log4j': 18, 'bugzilla': 18, 'formatting': 17, 'yaml': 17, 'nuget': 16, 'redmine': 16, 'xquery': 15, 'suse': 14, 'spring-boot': 13, 'stl': 13, 'webstorm': 13, 'mfc': 12, 'directx': 12, 'comparison': 12, 'openssl': 12, 'atlassian_jira': 12, 'cuda': 11, 'vaadin': 11, 'keras': 10, 'heap': 9, 'vsto': 9, 'react-native': 8, 'wildfly': 8, 'xamarin.ios': 8, 'cmake': 8, 'xamarin.forms': 8, 'yii2': 8, 

In [48]:
def to_feature(X):
    new_X = []
    for post in X:
        vec = np.zeros(300)
        
        ########## MODEL ###########
        field = get_possible_field(post)
        post = set(post) # unique
        for pl in post:
            if pl == '': continue
            if pl not in words: continue
            
            vec += model.get_numpy_vector(pl) * tfidf_score[field][pl]
        # vec / len(pls)
        new_X.append(vec)
    return new_X

def train_and_predict(train_X, test_X, train_y, test_y):
    # 建立 SVC 模型
    svc = svm.SVC()
    svc_fit = svc.fit(train_X, train_y)

    # 預測
    _y = svc.predict(test_X)
    return _y

Keywordsandroid.txt posts length: 433
Keywordsbackend.txt posts length: 394
Keywordsfrontend.txt posts length: 831
Keywordssecurity.txt posts length: 776
Keywordsandroid.txt posts length: 1827
Keywordsbackend.txt posts length: 2886
Keywordsfrontend.txt posts length: 3325
Keywordssecurity.txt posts length: 3395
0.704364558733
             precision    recall  f1-score   support

          0       0.89      0.60      0.72      1827
          1       0.50      0.66      0.57      2886
          2       0.65      0.55      0.60      3325
          3       0.90      0.95      0.92      3395

avg / total       0.72      0.70      0.71     11433



In [None]:
train_X, train_y, index_to_field, field_to_index = get_data(TRAIN_PATH)
tfidf_score, words = get_tfidf(get_doc(train_X, train_y))
test_X, test_y, _, _ = get_data(TEST_PATH)

train_X = to_feature(train_X)
test_X = to_feature(test_X)
    
_y = train_and_predict(train_X, test_X, train_y, test_y)

print(metrics.accuracy_score(test_y, _y))
print(metrics.classification_report(test_y, _y))

In [53]:
X, y, index_to_field, field_to_index = get_data('/home/fun/Atos/new_Steeve_data/CareerBuilder/can/', limit=1299)

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2)
print("Training data length: {}, Test data length: {}".format(len(train_X), len(test_X)))
tfidf_score, words = get_tfidf(get_doc(train_X, train_y))

_y = tfidf_predict(test_X)
print("tfidf model")
print(metrics.accuracy_score(test_y, _y))
print(metrics.classification_report(test_y, _y))

train_X = to_feature(train_X)
test_X = to_feature(test_X)

_y = train_and_predict(train_X, test_X, train_y, test_y)
print("SVM + tfidf model")
print(metrics.accuracy_score(test_y, _y))
print(metrics.classification_report(test_y, _y))

Keywordsjava-developer.txt posts length: 1299
Keywordssystems-analyst.txt posts length: 1299
Keywordsweb-developer.txt posts length: 1299
Keywordsnetwork-technician.txt posts length: 1299
Keywordsdatabase-administrator.txt posts length: 1299
Keywordsproduct-manager.txt posts length: 1299
Training data length: 6235, Test data length: 1559
Counter({'robin': 5, 'sails.js': 4, 'silverstripe': 3, 'vaadin': 3, 'raphael': 3, 'telescope': 2, 'emacs': 2, 'phpstorm': 2, 'koa': 2, 'webtrends': 2, 'ipb': 2, 'javaserver_pages': 1, 'ms-access': 1, 'shiny': 1, 'apache2': 1, 'webstorm': 1, 'grouping': 1, 'accessible_portal': 1, 'activerecord': 1, 'celery': 1, 'tealium': 1, 'gallery': 1, 'opengl': 1, 'rust': 1, 'adobe_robohelp': 1, 'storyboard': 1, 'eval': 1, 'oracle_commerce': 1, 'prospector': 1, 'richfaces': 1})
tfidf model
0.542655548428


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.77      0.68      0.72       284
          1       0.38      0.66      0.48       239
          3       0.71      0.66      0.69       269
          4       0.44      0.74      0.55       273
          5       0.65      0.50      0.57       231
          6       0.00      0.00      0.00       263

avg / total       0.49      0.54      0.50      1559

SVM + tfidf model
0.565105837075
             precision    recall  f1-score   support

          0       0.78      0.68      0.73       284
          1       0.44      0.56      0.49       239
          3       0.71      0.62      0.66       269
          4       0.53      0.38      0.44       273
          5       0.68      0.50      0.58       231
          6       0.42      0.64      0.51       263

avg / total       0.60      0.57      0.57      1559



# 新 pl filter

### 單純 tfidf model

* filter / no_filter: 0.7064637452986967
* no_filter / filter: 0.9252259654889071

### 考慮次數的 vec 相加

* filter / no_filter: 0.636753258112
* no_fitler / filter: 0.912078882498

### 不考慮次數的 vec 相加

* filter / no_filter: 0.718883932476
* no_fitler / filter: 0.928101889893

### 不考慮次數的 vec * tfidf 再相加 （錯誤版）

* filter / no_filter: 0.917694393423
* no_fitler / filter: 0.965488907149

### 不考慮次數的 vec * tfidf 再相加 （正確版）因為並不會取得 label 所以用第一個 model 先預測 field 再做相乘

* filter / no_filter: 0.704364558733
* no_fitler / filter: 0.916187345933


================================

# 舊 pl filter

### 單純 tfidf model
* filter / no_filter 0.699
* no_filter / filter 0.9227608874281019