In [104]:
from os import listdir
from collections import Counter, defaultdict
from functools import reduce
import json
import math
import operator
import numpy as np

from pyfasttext import FastText
model = FastText('../wiki.en.bin')

from sklearn.model_selection import train_test_split
from sklearn import feature_extraction, svm, metrics
from sklearn.grid_search import GridSearchCV  
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer



In [2]:
def get_tfidf(doc, file):
    vectorizer  = CountVectorizer(min_df=1, token_pattern=r'[\w\+\.#-]+')  # 該類會將文本中的詞語轉換為詞頻矩陣，矩陣元素a[i][j] 表示j詞在i類文本下的詞頻  
    transformer = TfidfTransformer()                                       # 該類會統計每個詞語的tf-idf權值  
    tfidf = transformer.fit_transform(vectorizer.fit_transform(doc))       # 第一個fit_transform是計算tf-idf，第二個fit_transform是將文本轉為詞頻矩陣  
    
    words  = vectorizer.get_feature_names()                                # 獲取詞袋模型中的所有詞語  
    weight = tfidf.toarray()                                               # 將tf-idf矩陣抽取出來，元素a[i][j]表示j詞在i類文本中的tf-idf權重  
    
    tfidf_score = defaultdict(lambda: defaultdict())
    for i in range(len(weight)):                                           # 打印每類文本的tf-idf詞語權重，第一個for遍歷所有文本，第二個for便利某一類文本下的詞語權重  
#         print("-------這裡輸出第",i,u"類文本的詞語tf-idf權重------")
        for j in range(len(words)):
#             tfidf_score[i][words[j]] = weight[i][j]
            tfidf_score[index_to_field[i]][words[j]] = weight[i][j]
#             print(words[j], weight[i][j])
            
#     with open(file, 'w', encoding='utf8') as ws:
#         ws.write(json.dumps(tfidf_score))
    return tfidf_score, words

def norm_pls(pls):
    return [pl.lower().replace(' ', '_').replace('.js', '') for pl in pls]

In [3]:
def get_data(directory, limit=None):
    X, y, index_to_field, field_to_index = [], [], {}, {}
    files = [file for file in listdir(directory) if not file.startswith(".")]
    for i, file in enumerate(files):
        doc = json.loads(open(directory + file, 'r', encoding='utf8').read())
        posts = sum(doc.values(), [])
        posts = posts[:limit] if limit else posts
        print(file, "posts length: {}".format(len(posts)))
        
        index_to_field[i], field_to_index[file] = file, i
        for post in posts:
            X.append(norm_pls(post['PL']))
            y.append(i)
    return X, y, index_to_field, field_to_index

def get_doc(X, y): # for tfidf
    doc = defaultdict(lambda: [])
    for pls, field in zip(X, y):
        doc[field] += pls
    
    doc = [' '.join(doc[i]) for i in range(len(doc))]
    return doc
        
def tfidf_predict(test_X):
    _y, unexist = [], Counter()
    for i, post in enumerate(test_X):
        _y.append(get_possible_field(post, unexist))
    print(unexist)
    return _y

def get_possible_field(post, unexist=Counter()):
    scores = defaultdict(lambda: 0.0)
    pls = norm_pls(post)
    for pl in pls:
        if pl not in words:
            unexist[pl] += 1
            continue
                
        for field, score_table in tfidf_score.items():
            scores[field_to_index[field]] += score_table[pl]
                
    try:
        index = max(scores.items(), key=operator.itemgetter(1))[0]
        return index
    except:
        return 0 # 隨機猜


In [100]:
def to_feature(X):
    new_X = []
    for post in X:
        field = get_possible_field(post)
        vec = to_vec(field, post)
        new_X.append(vec)
    return new_X

########## MODEL ###########
def to_vec(field, post): # field: index
    field = index_to_field[field]
    top_pl = list(filter(lambda x: x != '' and x in words, post))
    top_pl = sorted(top_pl, key=lambda x: tfidf_score[field][x], reverse=True)[:4]

    if len(top_pl) == 0:
        vec = np.zeros(1200)
    else:
        top_pl += top_pl[0] + top_pl[0] + top_pl[0] + top_pl[0]
        vec = [col for pl in top_pl[:4] for col in model.get_numpy_vector(pl)]
    return vec

    
    
#     vec = np.zeros(300)
#     post = set(post) # unique
#     for pl in post:
#         if pl == '': continue
#         if pl not in words: continue
            
#         vec += model.get_numpy_vector(pl) * tfidf_score[index_to_field[field]][pl]
#     return vec

def train_and_predict(train_X, test_X, train_y, test_y):
    # 建立 SVC 模型
    svc = svm.SVC(kernel='poly', probability=True, max_iter=100)
    svc_fit = svc.fit(train_X, train_y)

    # 預測
    _y = svc.predict(test_X)
    return _y

In [94]:
ls '/home/fun/Atos/new_Steeve_data/'

[0m[01;34mfilter_CareerBuilder[0m/  [01;34mno_filter_CareerBuilder[0m/  [01;34mresume[0m/
[01;34mfilter_Dice[0m/           [01;34mno_filter_Dice[0m/


In [95]:
score_file = 'tfidf_score_cb_filter.json'
train = 'no_filter_Dice'
test = 'no_filter_CareerBuilder'

TRAIN_PATH = '/home/fun/Atos/new_Steeve_data/' + train + '/can/'
TEST_PATH = '/home/fun/Atos/new_Steeve_data/' + test + '/can/'

In [None]:
train_X, train_y, index_to_field, field_to_index = get_data(TRAIN_PATH)
tfidf_score, words = get_tfidf(get_doc(train_X, train_y), score_file)
test_X, test_y, _, _ = get_data(TEST_PATH)
_y = tfidf_predict(test_X)

print(metrics.accuracy_score(test_y, _y))
print(metrics.classification_report(test_y, _y))

Keywordsandroid.txt posts length: 3600
Keywordsbackend.txt posts length: 3540
Keywordsfrontend.txt posts length: 3600
Keywordspm.txt posts length: 3600
Keywordssa.txt posts length: 3600
Keywordssecurity.txt posts length: 3600
Keywordsandroid.txt posts length: 868
Keywordsbackend.txt posts length: 950
Keywordsfrontend.txt posts length: 999
Keywordspm.txt posts length: 7500
Keywordssa.txt posts length: 7500
Keywordssecurity.txt posts length: 2500
Counter({'methode': 33, 'ms-word': 8, 'observable': 8, 'hugo': 7, 'catwalk': 6, 'nvd3': 4, 'blogger': 4, 'latex': 4, 'duplicates': 4, 'tumblr': 3, 'oracle_commerce': 3, 'discourse': 3, 'pycharm': 2, 'smarty': 2, 'cfml': 2, 'sencha-touch': 2, 'timer': 2, 'prolog': 2, 'adobe_coldfusion': 2, 'quantcast': 2, 'indy': 2, 'activex': 2, 'caddy': 2, 'rcms': 2, 'prospector': 2, 'oracle_application_server': 2, 'virtualenv': 2, 'opengrok': 1, 'afnetworking': 1, 'ios6': 1, 'image-processing': 1, 'apache_wicket': 1, 'codemirror': 1, 'appnexus': 1, 'passport':

In [101]:
train_X, train_y, index_to_field, field_to_index = get_data(TRAIN_PATH)
tfidf_score, words = get_tfidf(get_doc(train_X, train_y), score_file)
test_X, test_y, _, _ = get_data(TEST_PATH)

train_X = to_feature(train_X)
test_X = to_feature(test_X)
    
_y = train_and_predict(train_X, test_X, train_y, test_y)

print(metrics.accuracy_score(test_y, _y))
print(metrics.classification_report(test_y, _y))

Keywordsandroid.txt posts length: 3600
Keywordsbackend.txt posts length: 3540
Keywordsfrontend.txt posts length: 3600
Keywordspm.txt posts length: 3600
Keywordssa.txt posts length: 3600
Keywordssecurity.txt posts length: 3600
Keywordsandroid.txt posts length: 868
Keywordsbackend.txt posts length: 950
Keywordsfrontend.txt posts length: 999
Keywordspm.txt posts length: 7500
Keywordssa.txt posts length: 7500
Keywordssecurity.txt posts length: 2500




0.270364719201
             precision    recall  f1-score   support

          0       0.50      0.00      0.01       868
          1       0.21      0.01      0.01       950
          2       0.31      0.09      0.14       999
          3       0.48      0.18      0.26      7500
          4       0.38      0.46      0.42      7500
          5       0.07      0.24      0.11      2500

avg / total       0.37      0.27      0.27     20317



In [107]:
train_X, train_y, index_to_field, field_to_index = get_data(TRAIN_PATH)
tfidf_score, words = get_tfidf(get_doc(train_X, train_y), score_file)
test_X, test_y, _, _ = get_data(TEST_PATH)

train_X = to_feature(train_X)
test_X = to_feature(test_X)

# Set the parameters by cross-validation  
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1, 0.1, 0.01, 1e-3, 1e-4], 'C': [0.1, 1, 10]}, 
                    {'kernel': ['poly'], 'degree': [2, 3, 4, 5, 6, 7, 8], 'coef0': [0, 0.1, 0.01, 1, 10], 'C': [0.1, 1, 10]},
                    {'kernel': ['linear'], 'C': [0.1, 1, 10]}]  
  
scores = ['precision', 'recall']  
  
for score in scores:  
    print("# Tuning hyper-parameters for %s" % score)  
    print()  
  
    clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5,  
                       scoring='%s_weighted' % score)  
    clf.fit(train_X, train_y)  
  
    print("Best parameters set found on development set:")  
    print()  
    print(clf.best_params_)  
    print()  
    print("Grid scores on development set:")  
    print()  
    for params, mean_score, scores in clf.grid_scores_:  
        print("%0.3f (+/-%0.03f) for %r"  
              % (mean_score, scores.std() * 2, params))  
    print()  
  
    print("Detailed classification report:")  
    print()  
    print("The model is trained on the full development set.")  
    print("The scores are computed on the full evaluation set.")  
    print()  
    y_true, y_pred = test_y, clf.predict(test_X)  
    print(classification_report(y_true, y_pred))  
    print()  

Keywordsandroid.txt posts length: 3600
Keywordsbackend.txt posts length: 3540
Keywordsfrontend.txt posts length: 3600
Keywordspm.txt posts length: 3600
Keywordssa.txt posts length: 3600
Keywordssecurity.txt posts length: 3600
Keywordsandroid.txt posts length: 868
Keywordsbackend.txt posts length: 950
Keywordsfrontend.txt posts length: 999
Keywordspm.txt posts length: 7500
Keywordssa.txt posts length: 7500
Keywordssecurity.txt posts length: 2500
# Tuning hyper-parameters for precision



KeyboardInterrupt: 

### Posts 長度數量不拘

#### no_filter / CB
* tfidf 傳統法: 0.530344046857
* svm\*tfidf : 0.541763055569

#### CB / no_filter
* tfidf 傳統法: 0.625858867224
* svm\*tfidf: 0.580501392758

#### filter CB / filter Dice
* tfidf: 0.732615083252
* svm\*tfidf: 0.758220232265

### Posts 長度數量限制

#### CB / no_filter 1000:
* tfidf: 0.637975858867
* svm*tfidf: 0.656360259981

# 新 pl filter

### 單純 tfidf model

* filter / no_filter: 0.7064637452986967
* no_filter / filter: 0.9252259654889071

### 考慮次數的 vec 相加

* filter / no_filter: 0.636753258112
* no_fitler / filter: 0.912078882498

### 不考慮次數的 vec 相加

* filter / no_filter: 0.718883932476
* no_fitler / filter: 0.928101889893

### 不考慮次數的 vec * tfidf 再相加 （錯誤版）

* filter / no_filter: 0.917694393423
* no_fitler / filter: 0.965488907149

### 不考慮次數的 vec * tfidf 再相加 （正確版）因為並不會取得 label 所以用第一個 model 先預測 field 再做相乘

* filter / no_filter: 0.704364558733
* no_fitler / filter: 0.916187345933


================================

# 舊 pl filter

### 單純 tfidf model
* filter / no_filter 0.699
* no_filter / filter 0.9227608874281019

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

user_pls = ['JavaScript', 'html', 'vue.js', 'css']
field = 'Keywordsfrontend.txt'
index = field_to_index[field]

content = json.loads(open('/home/fun/Atos/new_Steeve_data/filter_Dice/can/' + field, 'r', encoding='utf8').read())
posts = sum(content.values(), [])

user_pls = norm_pls(user_pls)
user_vec = to_vec(field_to_index[field], user_pls)

# posts_vec = [to_vec(index, norm_pls(post['PL'])) for post in posts]   
# distances = cosine_similarity([user_vec], posts_vec).squeeze()
#  / len(set(norm_pls(post['PL'])))
distances = [len(set(user_pls).intersection(set(norm_pls(post['PL'])))) for post in posts]

sim = sorted(zip(posts, distances), key=lambda pair: pair[1], reverse=True)
sim[:10]

In [None]:
limit = 1299
score_file = 'tfidf_score_cb.json'

X, y, index_to_field, field_to_index = get_data('/home/fun/Atos/new_Steeve_data/CareerBuilder/can/', limit)
print(index_to_field)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2)
print("Training data length: {}, Test data length: {}".format(len(train_X), len(test_X)))
tfidf_score, words = get_tfidf(get_doc(train_X, train_y), score_file)

###### 傳統 tfidf 法
_y = tfidf_predict(test_X)
print("tfidf model")
print(metrics.accuracy_score(test_y, _y))
print(metrics.classification_report(test_y, _y))

###### svm * tfidf
train_X = to_feature(train_X)
test_X = to_feature(test_X)

_y = train_and_predict(train_X, test_X, train_y, test_y)
print("SVM + tfidf model")
print(metrics.accuracy_score(test_y, _y))
print(metrics.classification_report(test_y, _y))