In [None]:
import codecs
import json
import pandas as pd
import numpy as np

In [None]:
# Load the publication records from JSON (looked up by publication id in later
# cells) and the table of labelled publication pairs from CSV.
pub_dict = {}
with codecs.open("data/train/train_pub.json", "r", "utf-8") as f:
    pub_dict = json.load(f)
ad_pair = pd.read_csv("data/pair_data.csv", encoding="utf-8")

In [None]:
# Preview the first rows of the pair table.
ad_pair.head()

In [None]:
# Build a small balanced working set: the first (up to) 1000 rows whose
# column '2' equals 1 and the first (up to) 1000 rows where it equals 0.
positive_sample = ad_pair[ad_pair['2'] == 1][:1000]
negative_sample = ad_pair[ad_pair['2'] == 0][:1000]

# DataFrame.append was removed in pandas 2.0; pd.concat with default arguments
# stacks the two frames the same way. `.values` turns the result into the
# ndarray that the training cells slice positionally.
sample_all = pd.concat([positive_sample, negative_sample]).values

In [None]:
# Inspect a single row (position 542) of the positive sample.
positive_sample[542:543]

In [None]:
# Inspect a single row (position 542) of the negative sample.
negative_sample[542:543]

In [None]:
# Display the record stored under one specific key of pub_dict.
pub_dict['vvpj0rbO']

In [None]:
from utils import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from beard.similarity import AbsoluteDifference
from beard.similarity import CosineSimilarity
from beard.similarity.pairs import MyCosineSimilarity
from beard.similarity.pairs import MyJaccardSimilarity
from beard.similarity import JaccardSimilarity
from beard.similarity import PairTransformer
from beard.similarity import StringDistance
from beard.similarity import EstimatorTransformer
from beard.similarity import ElementMultiplication
from beard.utils import FuncTransformer
from beard.utils import Shaper
import pickle

In [None]:
def _tfidf_cosine_pipeline(step_name, getter, **tfidf_kwargs):
    """Build one text-similarity feature pipeline for a pair of records.

    The element transformer extracts a text field with ``getter``, flattens it
    with ``Shaper(newshape=(-1,))`` and vectorizes it with a character n-gram
    (``char_wb``, 2-4) TF-IDF. ``PairTransformer`` wraps that element
    transformer and ``CosineSimilarity`` combines the result.

    Parameters
    ----------
    step_name : str
        Name of the extraction step inside the element pipeline.
    getter : callable
        Field extractor handed to ``FuncTransformer``.
    **tfidf_kwargs
        Extra keyword arguments for ``TfidfVectorizer`` (e.g. ``dtype``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted feature pipeline.
    """
    return Pipeline([
        ("pairs", PairTransformer(element_transformer=Pipeline([
            (step_name, FuncTransformer(func=getter)),
            ("shaper", Shaper(newshape=(-1,))),
            ("tf-idf", TfidfVectorizer(analyzer="char_wb",
                                       ngram_range=(2, 4),
                                       decode_error="replace",
                                       **tfidf_kwargs)),
        ]))),
        ("combiner", CosineSimilarity())
    ])


def _build_distance_estimator(X, y, Xt, yt, verbose=0):
    """Fit the pairwise classifier and print its scores on held-out pairs.

    A ``FeatureUnion`` of seven per-pair features (author names, affiliations,
    title, venue, abstract, keywords, year difference) feeds a
    ``GradientBoostingClassifier``.

    Parameters
    ----------
    X, y
        Training pairs and their labels, passed to ``Pipeline.fit``.
    Xt, yt
        Held-out pairs and labels, used only for the printed precision,
        recall and F1.
    verbose : int, default 0
        Forwarded to ``GradientBoostingClassifier``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted ``transformer`` + ``clf`` pipeline.
    """
    transformer = FeatureUnion([
        ("author_name",
         _tfidf_cosine_pipeline("full_name", get_authors, dtype=np.float32)),
        ("affiliation_similarity",
         _tfidf_cosine_pipeline("affiliation", get_author_affiliations)),
        ("title_similarity",
         _tfidf_cosine_pipeline("title", get_title, dtype=np.float32)),
        # Disabled feature, kept for reference:
        # ("journal_similarity",
        #  _tfidf_cosine_pipeline("journal", get_journal, dtype=np.float32)),
        ("venue_similarity", Pipeline([
            ("pairs", FuncTransformer(func=get_venue)),
            ("combiner", MyJaccardSimilarity())
        ])),
        ("abstract_similarity",
         _tfidf_cosine_pipeline("abstract", get_abstract)),
        ("keywords_similarity",
         _tfidf_cosine_pipeline("keywords", get_keywords)),
        ("year_diff", Pipeline([
            # np.int was only an alias of the builtin int and was removed in
            # NumPy 1.24; the builtin is the same type.
            ("pairs", FuncTransformer(func=get_year, dtype=int)),
            ("combiner", AbsoluteDifference())
        ]))
    ])

    # max_features matches the seven entries of the union above (assuming each
    # sub-pipeline yields one column -- TODO confirm).
    clf = GradientBoostingClassifier(n_estimators=80,
                                     max_depth=10,
                                     max_features=7,
                                     learning_rate=0.129,
                                     verbose=verbose)
    estimator = Pipeline([("transformer", transformer),
                          ("clf", clf)]).fit(X, y)

    # Report quality on the held-out pairs.
    y_pred = estimator.predict(Xt)
    print("\tPrecision: %1.3f" % precision_score(yt, y_pred))
    print("\tRecall: %1.3f" % recall_score(yt, y_pred))
    print("\tF1: %1.3f\n" % f1_score(yt, y_pred))

    return estimator

In [None]:
def _attach_publications(pairs, pub_dict):
    """Replace both publication ids in every row of ``pairs`` with their records, in place."""
    for row in pairs:
        row[0] = pub_dict[row[0]]
        row[1] = pub_dict[row[1]]


def learn_model(pub_dict, sample=True, verbose=0):
    """Learn the distance model for pairs of publications and pickle it.

    Parameters
    ----------
    pub_dict : dict
        Maps a publication id to its record; used to resolve the two id
        columns of every pair.
    sample : bool, default True
        If True, train on the module-level ``sample_all`` array; otherwise on
        every row of the module-level ``ad_pair`` table.
    verbose : int, default 0
        Forwarded to ``_build_distance_estimator``.

    Side effects
    ------------
    Prints precision/recall/F1 (via ``_build_distance_estimator``) and writes
    the fitted estimator to the file ``distance_model`` in the working
    directory.
    """
    # ad_pair is a DataFrame; the positional slicing below needs an ndarray,
    # as sample_all already is.
    input_dataset = sample_all if sample else ad_pair.values

    # train_test_split shuffles by default, so the shared input array is not
    # shuffled in place here.
    train, test = train_test_split(input_dataset, train_size=0.7)
    X, y = train[:, :2], train[:, 2].astype(int)
    Xt, yt = test[:, :2], test[:, 2].astype(int)

    # Swap publication ids for the full records the feature extractors read.
    _attach_publications(X, pub_dict)
    _attach_publications(Xt, pub_dict)

    # Learn a distance estimator on paired signatures
    distance_estimator = _build_distance_estimator(
        X, y, Xt, yt, verbose=verbose)

    # Context manager so the file handle is closed even if pickling fails.
    with open("distance_model", "wb") as model_file:
        pickle.dump(distance_estimator,
                    model_file,
                    protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Train on the sampled pairs; learn_model pickles the fitted estimator to the
# file "distance_model".
learn_model(pub_dict, sample=True, verbose=0)

In [None]:
# Notebook-scope repeat of the data preparation done inside learn_model, so the
# intermediate arrays can be inspected.
# np.random.shuffle reorders sample_all in place.
np.random.shuffle(sample_all)
train, test = train_test_split(sample_all, train_size=0.7)
X, y = train[:, :2], train[:, 2].astype(int)
Xt, yt = test[:, :2], test[:, 2].astype(int)

# Replace the two ids of each training pair with the records from pub_dict.
# NOTE(review): unlike learn_model, Xt is not converted here and still holds ids.
for i in range(len(X)):
    X[i][0] = pub_dict[X[i][0]]
    X[i][1] = pub_dict[X[i][1]]

In [None]:
" ".join(" ")

In [None]:
def get_keywords(s):
    """Return the record's keywords as one space-separated string.

    Parameters
    ----------
    s : mapping
        Publication record; its optional ``"keywords"`` entry is expected to
        be a list of strings.

    Returns
    -------
    str
        The non-empty keywords joined by single spaces, or ``" "`` when the
        key is missing, ``None``, empty, or holds only empty entries.
    """
    keywords = s["keywords"] if "keywords" in s else None
    if keywords is None or len(keywords) == 0:
        return " "
    if isinstance(keywords, str):
        # A bare string is a single keyword; joining it directly would insert
        # a space between each of its characters.
        keywords = [keywords]
    # Drop empty entries wherever they occur, so one blank keyword neither
    # discards the rest of the list nor leaves a stray separator.
    kept = [kw for kw in keywords if kw]
    return " ".join(kept) if kept else " "
def get_abstract(s):
    """Return the record's abstract, or ``" "`` when it is missing or empty.

    ``s`` is a publication record; a missing ``"abstract"`` key, a ``None``
    value and any other falsy value all yield the one-space placeholder.
    """
    abstract = s["abstract"] if "abstract" in s else None
    return abstract if abstract else " "

In [None]:
# Hand-written example of a publication record (abstract, authors, id,
# keywords, title, venue, year), kept for ad-hoc inspection.
a = {'abstract': 'This paper discusses the current status of harmful algal blooms diagnosis and points out some of defects: time consuming and laborious. This paper described how to apply J2EE platform to forming an exact and efficient microscopic image diagnosis system of harmful algal blooms based on MVC model and by the technologies such as JSP, Servlets, EJB, JDBC etc.. ©2010 IEEE.',
 'authors': [{'name': 'Liang Lv',
   'org': 'Department of Electronic Engineering'},
  {'name': 'Guangrong Ji', 'org': 'Department of Electronic Engineering'},
  {'name': 'Chunfeng Guo', 'org': 'Department of Electronic Engineering'},
  {'name': 'Xiang Gao', 'org': 'Department of Electronic Engineering'}],
 'id': '5GnAKqKW',
 'keywords': ['Harmful algal blooms',
  'J2EE',
  'Microscopic image',
  'MVC',
  'Red ride phytoplankton identification'],
 'title': 'Design of microscopic image diagnosis system basedon MVC model and J2EE platform',
 'venue': '2010 The 2nd International Conference on Computer and Automation Engineering, ICCAE 2010',
 'year': 2010}

In [None]:
# Edge case: a keyword list holding only an empty string.
get_keywords({"keywords":['']})

In [None]:
# Edge case: an empty abstract string.
get_abstract({"abstract":''})