# Baseline model

Just predict true if tf-idf cosin distance is closer than threshold.

This is intended for simplest end-to-end solution.

# Training-Test set setup

Dataset setup is common for all model.
Currently just put in ipynb. Please dup! (maybe factor out to .py file in the future).

In [1]:
import pandas as pd
import numpy as np

In [2]:
grants_all_df = pd.read_pickle("../data/grants_2012_from2017_xmldf.dat")

In [3]:
app_all_df = pd.read_pickle("../data/app_2017_by2012_xmldf.dat")

### Split training-test set setup

If app_id is uniq, we can just use dataframe.sample.
But somemodel might want to use multiple xml for the same app_id.

So keep all app_id in split phase.
Also, app_id order would be the same order as apply (maybe not, please confirm somebody!).
So keep order when split, Then shuffle so that every one reproduce split even though they change mind to use multiple xml of each app_id.

In [4]:
all_appid = set(app_all_df['app_id'])

In [5]:
import random

In [6]:
random.seed(1234)

In [7]:
training_id = set(random.sample(all_appid, int(len(all_appid)*0.9)))

In [10]:
testset_id = all_appid - training_id

In [11]:
len(training_id), len(testset_id)

(2769, 308)

In [12]:
training_app_df = app_all_df[app_all_df.app_id.isin(training_id)]

In [13]:
testset_app_df = app_all_df[~app_all_df.app_id.isin(training_id)]

In [14]:
app_all_df.shape, training_app_df.shape, testset_app_df.shape

((3083, 2), (2775, 2), (308, 2))

In [15]:
training_app_df.head().app_id

0    14742496
1    14348426
2    14613336
3    14053984
4    14590141
Name: app_id, dtype: int64

In [16]:
testset_app_df.iloc[1]

app_id                                             15289343
xml       <?xml version="1.0" encoding="UTF-8"?>\n<!DOCT...
Name: 15, dtype: object

In [20]:
def filter_uniq_appid(df):
    ids = set()
    filtermask = []
    for i in range(len(df)):
        app_id = df.iloc[i].app_id
        filtermask.append(app_id not in ids)
        ids.add(app_id)
    return filtermask


### Keep only first app_id

You can use multiple application xml if you want (in this case, skip filter_uniq_appid for training set).
I keep only first app_id in dataframe. I guess it in order of date, but may be not. Please confirm somebody!

In [18]:
training_app_df.head()

Unnamed: 0,app_id,xml
0,14742496,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
1,14348426,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
2,14613336,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
3,14053984,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
4,14590141,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."


In [25]:
training_app_df = training_app_df[filter_uniq_appid(training_app_df)]
testset_app_df = testset_app_df[filter_uniq_appid(testset_app_df)]

### Shuffle

In [26]:
# set seed again for easier interactive shift-enter
random.seed(456)

In [27]:
training_app_df = training_app_df.sample(frac=1).reset_index(drop=True)
testset_app_df = testset_app_df.sample(frac=1).reset_index(drop=True)

### Reset index (may be you don't want, then skip here)

In [28]:
training_app_df = training_app_df.reset_index(drop=True)
testset_app_df = testset_app_df.reset_index(drop=True)

### Retrieve just claim. Remove all tags.

This utility function might necessary for any mode.

In [29]:
import re

In [30]:
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>',re.MULTILINE|re.DOTALL)

In [31]:
TAG_PAT = re.compile(r"<.*?>")

In [32]:
def whole_xml_to_claim_xml(whole):
    mat = CLAIM_PAT.search(whole)
    return mat.group(1)

In [33]:
def whole_xml_to_claim(whole):
    return TAG_PAT.sub(' ', whole_xml_to_claim_xml(whole))

# Model evaluation

In [37]:
%ls ../data/

app_2017_by2012_xmldf.dat
applications.h5
applications_with_office_actions_citing_grants_with_inner_join.csv
[0m[01;31mapplications_with_office_actions_citing_grants_with_inner_join.zip[0m
[34;42mapps[0m/
[01;31mapps.zip[0m
[01;32mcitations.csv[0m*
[01;31mcitations.csv.zip[0m
citations_2012_2017_merged.dat
[34;42mgrants[0m/
grants.h5
[01;32mgrants_2012_from2017_xmldf.dat[0m*
office_actions.csv
[01;32moffice_actions.csv.zip[0m*
patent_applications_df_grants12_app17.dat
[01;32mpatent_grants_dic_grants12_app17.dat[0m*
rejections.csv
[01;31mrejections.csv.zip[0m
[01;34mtestawk[0m/


In [38]:
# this is created in data_collection.ipynb
citations_2012_2017 = pd.read_pickle("../data/citations_2012_2017_merged.dat")

In [40]:
def set_one_answer_appid(labeldf, oneappid):
    cited_patids = citations_2012_2017[citations_2012_2017.app_id == oneappid].parsed
    labeldf.loc[oneappid] = labeldf.columns.isin(cited_patids)

In [41]:
def create_label_df():
    label_df = pd.DataFrame(columns=grants_all_df.parsed.values, dtype=np.bool)
    for appid in testset_app_df.app_id:
        set_one_answer_appid(label_df, appid)
    return label_df

In [44]:
label_df = create_label_df()

In [45]:
label_df.shape

(308, 5424)

### Cofirm label df is correct

In [33]:
testset_app_df.head()

Unnamed: 0,app_id,xml
0,15292885,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
1,15289843,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
2,15331130,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
3,14820847,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
4,15255391,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."


In [182]:
label_df.head()

Unnamed: 0,8245358,8245460,8245733,8245746,8245764,8245780,8245893,8245898,8245901,8245943,...,8341338,8341346,8341427,8341429,8341430,8341457,8341538,8341573,8341732,8341749
15292885,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
15289843,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
15331130,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14820847,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
15255391,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [183]:
label_df.loc[15289843].idxmax()

8233452

In [184]:
citations_2012_2017[citations_2012_2017["app_id"]==15289843].parsed

6092    8270369
6094    8204029
6095    8233452
Name: parsed, dtype: object

In [185]:
label_df.loc[15289843].loc[8270369]

True

In [186]:
label_df.loc[15289843].sum()

3

### Predict test set and print summary

In [197]:
def predict_training_set(predict_func):
    """
    predict_func(claims) return NxM of boolean. N is len(claims). M is rownum of grants_all_df.
            value indicate n claim is cite patent of m row of grants_all_df.
    """
    predictdf = pd.DataFrame(columns=grants_all_df.parsed.values, dtype=np.bool)
    res = predict_func(testset_app_df["xml"].map(whole_xml_to_claim))
    for idx, appid in enumerate(testset_app_df.app_id):
        predictdf.loc[appid] = res[idx, :]
    """
        one_res = predict_func(whole_xml_to_claim(testset_app_df[testset_app_df.app_id == appid]["xml"].iloc[0]))
        predictdf.loc[appid] = one_res
    """
    return predictdf

In [175]:
def calc_TPs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][labeldf.loc[one_appid]]) for one_appid in testset_app_df.app_id])

def calc_FPs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][~labeldf.loc[one_appid]]) for one_appid in testset_app_df.app_id])

def calc_TNs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][~labeldf.loc[one_appid]] == False) for one_appid in testset_app_df.app_id])

def calc_FNs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][labeldf.loc[one_appid]] == False) for one_appid in testset_app_df.app_id])

def calc_TFPNs(preddf, labeldf):
    return calc_TPs(preddf, labeldf), calc_FPs(preddf, labeldf), calc_TNs(preddf, labeldf), calc_FNs(preddf, labeldf)

In [176]:
def calc_summary_TFPNs(TP, FP, TN, FN):
    "return acc, prec, recall, f1."
    return pd.DataFrame(columns=["acc", "prec", "recall", "f1"], data=[[(TP+TN)/(TP+FP+TN+FN), TP/(TP+FP), TP/(TP+FN), 2*TP/(2*TP+FP+FN)]])
    
def calc_summary(preddf, labeldf):
    TP, FP, TN, FN = calc_TFPNs(preddf, labeldf)
    return calc_summary_TFPNs(TP, FP, TN, FN)

### Sample evaluation code for baseline model

predict_tfidf_model is defined below

In [206]:
pred_df = predict_training_set(predict_tfidf_model)

In [207]:
calc_summary(pred_df, label_df)

Unnamed: 0,acc,prec,recall,f1
0,0.906713,0.003739,0.726368,0.007439


# Start baseline model dependent code from here

Now common part is done.
Start model specific cells.

In [50]:
grants_all_df.head()["xml"].map(whole_xml_to_claim)

0    \n \n 1. A pacifier clip, comprising:\n a base...
1    \n \n 1. A supporting clasp which supports a s...
2    \n \n 1. A clip of molded plastics material fo...
3    \n \n 1. A tire inflation system comprising:\n...
4    \n \n 1. A cooling system for a heat-generatin...
Name: xml, dtype: object

In [51]:
grants_all_df["claim"] = grants_all_df["xml"].map(whole_xml_to_claim)

In [52]:
grants_all_df.head()

Unnamed: 0,parsed,xml,claim
0,8245358,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...","\n \n 1. A pacifier clip, comprising:\n a base..."
1,8245460,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1. A supporting clasp which supports a s...
2,8245733,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1. A clip of molded plastics material fo...
3,8245746,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1. A tire inflation system comprising:\n...
4,8245764,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1. A cooling system for a heat-generatin...


# Convert to feature vectors and retrieve vocabulary

Doing similar things to scikit learn example  
http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

Also, this document is helpful.  
http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

## (only once) Calculate tf-idf for grants2012

In [69]:
random.seed(1234)

In [70]:
vectorizer = TfidfVectorizer(stop_words='english', max_df = 0.5)

In [71]:
grants_features = vectorizer.fit_transform(grants_all_df["claim"])

In [72]:
grants_features.shape

(5424, 28988)

In [73]:
vocab = vectorizer.vocabulary_

In [74]:
idfvec = vectorizer.idf_

In [75]:
len(vocab.keys())

28988

In [76]:
list(vocab.items())[0:5]

[('subsection', 24920),
 ('grommets', 11928),
 ('demagnetization', 7459),
 ('geometrically', 11566),
 ('syndromes', 25376)]

In [77]:
len(idfvec)

28988

In [78]:
idfvec[0:5]

array([ 7.51933164,  5.26803984,  8.905626  ,  8.905626  ,  8.905626  ])

### (only once) Save features, vocabulary, idf vector

In [79]:
import pickle

In [80]:
with open("../data/grants2012_tfidf_features.dat", "wb") as f:
    pickle.dump(grants_features, f)

In [81]:
with open("../data/grants2012_vocab_idf_dict.dat", "wb") as f:
    pickle.dump({"vocabulary": vocab, "idf": idfvec}, f)

### Load code

In [46]:
import pickle

In [47]:
with open("../data/grants2012_tfidf_features.dat", 'rb') as f:
    grants_features = pickle.load(f)

In [48]:
with open("../data/grants2012_vocab_idf_dict.dat", 'rb') as f:
    dic = pickle.load(f)
    vocab, idfvec = dic["vocabulary"], dic["idf"]

### Calculate tf-idf manually using vocabulary and idf vector, and check whether it's coinside.

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
one_claim = grants_all_df.iloc[0]["claim"]

In [55]:
count_vec = CountVectorizer(vocabulary=vocab, stop_words="english", max_df = 0.5)

In [56]:
res = count_vec.fit_transform([one_claim])

In [57]:
res_arr = res.toarray()

In [58]:
res.shape

(1, 28988)

In [59]:
tf = res_arr[0]

In [60]:
answer = grants_features[0, :].toarray()

In [61]:
answer = answer[0]

In [62]:
def print_nonzero_index(arr, maxcount):
    count = 0

    for i, v in enumerate(arr):
        if v != 0:
            count+=1
            print(i)
            if count > maxcount:
                break

In [63]:
print_nonzero_index(answer, 5)

1025
1072
1073
1117
1120
1373


In [64]:
answer[1025]

0.015374346416530774

In [65]:
print_nonzero_index(tf, 5)

1025
1072
1073
1117
1120
1373


In [66]:
sumtf = sum(tf)

In [67]:
unnormalized = [tf[i]*idfvec[i]/sumtf for i, _ in enumerate(tf)]

In [69]:
unnormalized[1025]/np.linalg.norm(unnormalized)

0.015374346416530772

Try two claim for generarization

In [70]:
tfcsr = count_vec.fit_transform(grants_all_df.iloc[0:2]["claim"])

In [71]:
tf = tfcsr.toarray()

In [72]:
tf.shape

(2, 28988)

In [73]:
unnormalized = np.multiply(tf, idfvec)

In [74]:
lpnorms = np.linalg.norm(unnormalized, axis=1)

In [75]:
manual_tfidf = unnormalized/lpnorms[:, np.newaxis]

In [76]:
manual_tfidf[0, 1025]

0.015374346416530776

In [77]:
all(abs(manual_tfidf[0, :] - grants_features[0].toarray()[0]) < 0.00001)

True

In [78]:
all(abs(manual_tfidf[1] - grants_features[1].toarray()[0]) < 0.00001)

True

Now make calculate tf-idf function

In [79]:
def claims_to_tfidfs(claimarr, count_vec, idfvec):
    tfcsr = count_vec.fit_transform(claimarr)
    tf = tfcsr.toarray()
    unnormalized = np.multiply(tf, idfvec)
    lpnorms = np.linalg.norm(unnormalized, axis=1)
    return unnormalized/lpnorms[:, np.newaxis]

In [80]:
manu3 = claims_to_tfidfs(grants_all_df.iloc[0:2]["claim"], count_vec, idfvec)

In [81]:
all(manu3[0] == manual_tfidf[0]), all(manu3[1] == manual_tfidf[1])

(True, True)

### It's time to calculate tfidf for training set.

In [82]:
training_app_df["claim"] = training_app_df["xml"].map(whole_xml_to_claim)

In [83]:
training_features = claims_to_tfidfs(training_app_df["claim"], count_vec, idfvec)

Calculate one cosine distance

In [84]:
one_appid = training_app_df.iloc[0].app_id

In [85]:
citations_2012_2017[citations_2012_2017.app_id == one_appid]

Unnamed: 0,app_id,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa
5510,15267723,8172557,8172557,,,,0,1,0


In [86]:
answer_patids = set(citations_2012_2017[citations_2012_2017.app_id == one_appid].parsed.astype(int))

In [87]:
answer_patids

{8172557}

In [88]:
type(grants_all_df.iloc[0].parsed)

numpy.int64

In [89]:
answer_idxs = grants_all_df[grants_all_df.parsed.isin(answer_patids)].index

In [90]:
answer_idxs[0]

2570

In [91]:
grants_all_df.iloc[2570].parsed

8172557

In [92]:
answer_patent_features = grants_features[answer_idxs[0], :].toarray()[0]

In [94]:
import scipy

In [95]:
scipy.spatial.distance.cdist(training_features[0, :][np.newaxis, :], grants_features[answer_idxs[0], :].toarray(), 'cosine')

array([[ 0.79603206]])

### Calculate 20 cosine distance

In [96]:
training_app_df.head()

Unnamed: 0,app_id,xml,claim
0,15267723,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1 . A reciprocating compressor comprisi...
1,14802907,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1 . A trigger system for a string instr...
2,14797959,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1 . A non-transitory computer readable ...
3,15227804,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1 . A solar panel mounting system compr...
4,14803181,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",\n \n 1 . A grille shutter comprising:\n a ho...


In [99]:
training_app_df[training_app_df.app_id == 15227804].index[0]

3

In [100]:
def calc_cosin_for_one_app(appid):
    answer_patids = set(citations_2012_2017[citations_2012_2017.app_id == appid].parsed.astype(int))
    answer_idxs = grants_all_df[grants_all_df.parsed.isin(answer_patids)].index
    answer_patent_features = grants_features[answer_idxs, :].toarray()
    training_features_idx = training_app_df[training_app_df.app_id == appid].index[0]
    return scipy.spatial.distance.cdist(training_features[training_features_idx, :][np.newaxis, :], answer_patent_features, 'cosine')[0]


In [101]:
calc_cosin_for_one_app(14575586)

array([ 0.93692449])

In [102]:
calc_cosin_for_one_app(15239553)

array([ 0.70069898])

In [103]:
[calc_cosin_for_one_app(appid) for appid in training_app_df[0:20].app_id]

[array([ 0.79603206]),
 array([ 0.84778274]),
 array([ 0.97849135,  0.93972121]),
 array([ 0.81212847]),
 array([ 0.97243849]),
 array([ 0.8968932]),
 array([ 0.55908128,  0.99042701]),
 array([ 0.91275346]),
 array([ 0.77715433]),
 array([ 0.86496786,  0.79024109,  0.89442354]),
 array([ 0.7582598]),
 array([ 0.73837288]),
 array([ 0.85414263,  0.82317616,  0.76937533]),
 array([ 0.70234053,  0.89837677,  0.94242814,  0.85678068,  0.94188531,
         0.78551855]),
 array([ 0.50356797]),
 array([ 0.29920153]),
 array([ 0.77911933]),
 array([ 0.69115609,  0.93321904]),
 array([ 0.75328119]),
 array([ 0.96227157])]

In [104]:
calc_cosin_for_one_app(training_app_df.iloc[5].app_id)

array([ 0.8968932])

In [105]:
calc_cosin_for_one_app(training_app_df.iloc[5].app_id).mean() < 0.95

True

### Compare with random pair cosdistance

In [106]:
scipy.spatial.distance.cdist(training_features[0:5, :], grants_features[0:5, :].toarray(), 'cosine')

array([[ 0.99963607,  0.99633861,  0.9982194 ,  0.99860982,  0.99004204],
       [ 0.9972921 ,  0.95817496,  0.99595848,  0.99875834,  0.99789818],
       [ 0.99997143,  0.99967785,  0.99946175,  0.99713544,  0.99770081],
       [ 0.98963517,  0.99767726,  0.99301124,  0.99045227,  0.99055138],
       [ 0.96051351,  0.95503764,  0.99072163,  0.98204962,  0.98363154]])

In [107]:
training_features = claims_to_tfidfs(training_app_df["claim"], count_vec, idfvec)

In [108]:
grants_features_arr = grants_features.toarray()

In [205]:
TFIDF_MODEL_THRESHOLD=0.95
# TFIDF_MODEL_THRESHOLD=0.8

def predict_tfidf_model(claims):
    """
    return: NxM of boolean. N is len(claims). M is rownum of grants_all_df.
            value indicate n claim is cite patent of m row of grants_all_df.
    """
    features = claims_to_tfidfs(claims, count_vec, idfvec)
    dists = scipy.spatial.distance.cdist(features, grants_features_arr, 'cosine')
    return dists < TFIDF_MODEL_THRESHOLD


### Calc recall of 100

In [156]:
res = predict_tfidf_model(training_app_df[0:100]["claim"])

In [157]:
res.shape

(100, 5424)

In [158]:
all_pred_of_labeltrue = np.array([], dtype=np.bool)

In [159]:
for idx in range(0, 100):
    one_appid = training_app_df.iloc[idx].app_id
    pred_oneres = res[idx]
    label_patids = citations_2012_2017[citations_2012_2017.app_id == one_appid].parsed
    label_idxs = grants_all_df.parsed[grants_all_df.parsed.isin(label_patids)].index
    pred_of_labeltrue = pred_oneres[label_idxs]
    all_pred_of_labeltrue = np.concatenate([all_pred_of_labeltrue, pred_of_labeltrue])

In [162]:
sum(all_pred_of_labeltrue)/len(all_pred_of_labeltrue)

0.76000000000000001

### Check result by hand (seems correct)

In [122]:
training_app_df.iloc[0]

app_id                                             15267723
xml       <?xml version="1.0" encoding="UTF-8"?>\n<!DOCT...
claim     \n \n  1 . A reciprocating compressor comprisi...
Name: 0, dtype: object

In [114]:
pred_oneres = res[0]

In [120]:
pred_oneres.sum()

110

In [124]:
citations_2012_2017[citations_2012_2017.app_id == 15267723].parsed

5510    8172557
Name: parsed, dtype: object

In [130]:
grants_all_df.parsed[grants_all_df.parsed == 8172557].index

Int64Index([2570], dtype='int64')

In [131]:
pred_oneres[2570]

True

### Why test set recall is so bad?

In [163]:
res = predict_tfidf_model(testset_app_df["xml"].map(whole_xml_to_claim))

In [166]:
testset_app_df.shape

(308, 2)

In [167]:
all_pred_of_labeltrue = np.array([], dtype=np.bool)

In [168]:
for idx in range(0, testset_app_df.shape[0]):
    one_appid = testset_app_df.iloc[idx].app_id
    pred_oneres = res[idx]
    label_patids = citations_2012_2017[citations_2012_2017.app_id == one_appid].parsed
    label_idxs = grants_all_df.parsed[grants_all_df.parsed.isin(label_patids)].index
    pred_of_labeltrue = pred_oneres[label_idxs]
    all_pred_of_labeltrue = np.concatenate([all_pred_of_labeltrue, pred_of_labeltrue])

In [170]:
sum(all_pred_of_labeltrue)/len(all_pred_of_labeltrue)

0.72636815920398012