In [1]:
import os
import pandas as pd
import numpy as np
from konlpy.tag import Twitter
import tensorflow as tf
import math
from sklearn import datasets
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix
from scipy.sparse import issparse
pos_tagger = Twitter()
%matplotlib inline

In [2]:
# All artifacts are resolved relative to this directory.
# NOTE: the *_dir / *_DIR names below hold full file paths, not directories.
MODEL_DIRECTORY = "../"

# --- file names -------------------------------------------------------------
patent_data_all_file_name = "patent_data_all.pkl"
STOP_WORD_KOREAN = "stop_word_korea.txt"
SAMPLING_PATENT_DATA_file_name = "sampling_patent_data.pkl"
SAMPLING_TAGGING_DATA = "tagging_data_all.pkl"
SAMPLING_PATENT_TF_IDF_NP = "sampling_patent_tf_idf.npy"
SAMPLING_TF_IDF_VECTORIZER = "sampling_tf_idf_vectorizer.pkl"

# --- file paths (MODEL_DIRECTORY + file name) -------------------------------
patent_data_all_dir = os.path.join(MODEL_DIRECTORY, patent_data_all_file_name)
STOP_WORD_KOREAN_DIR = os.path.join(MODEL_DIRECTORY, STOP_WORD_KOREAN)
SAMPLING_PATENT_DATA_dir = os.path.join(MODEL_DIRECTORY, SAMPLING_PATENT_DATA_file_name)
SAMPLING_TAGGING_DATA_DIR = os.path.join(MODEL_DIRECTORY, SAMPLING_TAGGING_DATA)
SAMPLING_PATENT_TF_IDF_NP_DIR = os.path.join(MODEL_DIRECTORY, SAMPLING_PATENT_TF_IDF_NP)
SAMPLING_TF_IDF_VECTORIZER_DIR = os.path.join(MODEL_DIRECTORY, SAMPLING_TF_IDF_VECTORIZER)

In [3]:
import pickle

# Load the pickled patent DataFrame and the TF-IDF array stored as .npy.
df_patent_data_all = pd.read_pickle(SAMPLING_PATENT_DATA_dir)
patet_tf_idf = np.load(SAMPLING_PATENT_TF_IDF_NP_DIR)

patent_vector = patet_tf_idf
# Keep exactly as many records as there are TF-IDF rows so the two stay
# aligned by position (later cells delete rows from both by position).
# `.iloc` is positional and end-exclusive. The former `.ix` indexer has been
# removed from pandas, and on an integer index it sliced by label with an
# inclusive end, which could keep one record more than `patent_vector` has.
df_patent_data_all = df_patent_data_all.iloc[:len(patent_vector), :]

# Both lengths are expected to match; a mismatch means the frame is shorter
# than the TF-IDF array.
len(patent_vector), len(df_patent_data_all)

In [None]:
# 0-based positions of the records whose "ipc_4digit" is missing. Positions
# (not index labels) are needed because np.delete removes rows by position.
target_row = np.flatnonzero(df_patent_data_all["ipc_4digit"].isnull().values).tolist()

# Remove those records from the frame and the same rows from the vectors.
df_patent_data_all = df_patent_data_all[df_patent_data_all["ipc_4digit"].notnull()]
patent_vector = np.delete(patent_vector, target_row, 0)

len(target_row), len(patent_vector), len(df_patent_data_all)

In [None]:
# 0-based positions of the records whose "ipc_4digit" string starts with "4".
# The comparison is `== True` (not plain truthiness) on purpose: a NaN result
# from the string accessor must not count as a match.
target_row = [
    position
    for position, starts_with_4 in enumerate(
        df_patent_data_all["ipc_4digit"].str.startswith("4").tolist()
    )
    if starts_with_4 == True
]

# Drop the matching records from the frame and the same rows from the vectors.
df_patent_data_all = df_patent_data_all[ ~(df_patent_data_all["ipc_4digit"].str.startswith("4")) ]
patent_vector = np.delete(patent_vector, target_row, 0)

len(patent_vector), len(df_patent_data_all)

In [None]:
# Collect the distinct upper-cased first characters of the "ipc_4digit" values.
# NOTE(review): only the first character of the whole string is used, so for a
# value holding several comma-separated codes only the first code contributes.
temp_list = ",".join(
    str(value).upper()[0] for value in df_patent_data_all["ipc_4digit"].values
)
ipc_1digit_set = {ipc.strip() for ipc in temp_list.split(",")}
ipc_1digit_set

In [None]:
# Fixed mapping from the letters "A".."H" to column indices 0..7. The mapping
# is deliberately independent of which letters occur in the data, so column
# positions stay stable between runs and samples.
ipc_1digit_dict = {letter: column for column, letter in enumerate("ABCDEFGH")}

ipc_1digit_dict

In [None]:
# Label matrix: one row per vector in patent_vector, one column per entry of
# ipc_1digit_dict. Sizing by the dict (rather than by ipc_1digit_set)
# guarantees that every column index the dict can produce is in range, even
# when some letter never shows up in the data.
# The built-in `int` replaces the `np.int` alias, which newer NumPy releases
# no longer provide.
y_ipc_1digit = np.zeros((len(patent_vector), len(ipc_1digit_dict)), dtype=int)

y_ipc_1digit.shape

In [None]:
# Multi-hot encode the labels: for every record, set a 1 in the column of each
# letter that starts one of its comma-separated "ipc_4digit" codes.
for row_idx, raw_codes in enumerate(df_patent_data_all["ipc_4digit"].values):
    ipc_list = []
    for code in str(raw_codes).split(","):
        # First character of the trimmed code, upper-cased, mapped to a column.
        ipc_list.append(ipc_1digit_dict[code.strip()[0].upper()])
    y_ipc_1digit[row_idx, ipc_list] = 1

In [None]:
from sklearn.model_selection import train_test_split

# Split features and labels together so their rows stay aligned.
# test_size=0.60 puts 60% of the rows in the test split (40% are used for
# training); random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    patent_vector, y_ipc_1digit, test_size=0.60, random_state=42)

In [None]:
# Drop the names of the full-size intermediates now that the train/test
# splits exist; only X_train / X_test / y_train / y_test are used from here on.
del y_ipc_1digit, patent_vector, df_patent_data_all

In [None]:
# Shape of the training feature matrix produced by train_test_split.
X_train.shape

In [None]:
# Shape of the test feature matrix produced by train_test_split.
X_test.shape

In [None]:
# Shape of the training label matrix; its row count should match X_train.
y_train.shape

In [None]:
# Shape of the test label matrix; its row count should match X_test.
y_test.shape

In [None]:
def make_index(y):
    """Split every label row into positive and negative column positions.

    Args:
        y: iterable of 1-D label rows (for example a 2-D 0/1 array). Each row
            is passed through ``np.asarray``, so plain Python lists work too.

    Returns:
        ``[one_result, zero_result]``: two lists with one integer array per
        row. ``one_result[i]`` holds the positions in row ``i`` equal to 1,
        ``zero_result[i]`` the positions equal to 0.
    """
    one_result = []
    zero_result = []
    for row in y:
        # Without the coercion a plain list compares as a whole (`[..] == 1.0`
        # is just False) and np.where would silently return no positions.
        row = np.asarray(row)
        one_result.append(np.where(row == 1.0)[0])
        zero_result.append(np.where(row == 0.0)[0])
    return [one_result, zero_result]

def select_index_data_of_output(output, index_list):
    """Gather, per row, the outputs at the positive and negative label positions.

    Args:
        output: row-indexable container where ``output[i][c]`` is the value
            for row ``i`` and label column ``c``.
        index_list: ``[positive_positions, negative_positions]`` in the layout
            returned by ``make_index``; each element is a per-row sequence of
            column positions.

    Returns:
        A list with one ``[positive_values, negative_values]`` pair per row of
        ``index_list``.
    """
    positive_positions = index_list[0]
    negative_positions = index_list[1]
    data_value_list = []
    # Walk the rows that actually exist. The previous hard-coded row count
    # (66248) raised IndexError on smaller inputs and silently ignored the
    # tail of larger ones.
    for i in range(len(positive_positions)):
        row_output = output[i]  # index the row once, then pick columns from it
        positive_values = [row_output[column] for column in positive_positions[i]]
        negative_values = [row_output[column] for column in negative_positions[i]]
        data_value_list.append([positive_values, negative_values])
    return data_value_list

        
def make_cartesian_list(x):
    """Pair every first-group value with every second-group value, per entry.

    Args:
        x: sequence of ``[first_group, second_group]`` entries.

    Returns:
        A 3-tuple ``(cartesian_list, yi_length, nyi_length)`` where
        ``cartesian_list[i]`` lists all ``[a, b]`` pairs with ``a`` from
        ``x[i][0]`` and ``b`` from ``x[i][1]`` (first group varying slowest),
        and the two length lists hold ``len(x[i][0])`` and ``len(x[i][1])``.
    """
    pairs_per_entry = []
    first_sizes = []
    second_sizes = []
    for idx in range(len(x)):
        first_group, second_group = x[idx][0], x[idx][1]
        first_sizes.append(len(first_group))
        second_sizes.append(len(second_group))
        pairs_per_entry.append(
            [[a, b] for a in first_group for b in second_group]
        )
    return pairs_per_entry, first_sizes, second_sizes

def bp_mll_exp_function(cartesian_list_instance):
    """Return ``tf.exp(second - first)`` for a two-element pair.

    Args:
        cartesian_list_instance: indexable pair; element 0 is subtracted from
            element 1 before exponentiation.
    """
    first_value = cartesian_list_instance[0]
    second_value = cartesian_list_instance[1]
    return tf.exp(second_value - first_value)

def get_error(cartesian_list):
    """Sum the normalised pairwise exponential error over all documents.

    For every document the values ``bp_mll_exp_function(pair)`` of its pairs
    are summed and divided by ``yi_length * nyi_length``; the per-document
    results are then added up.

    Args:
        cartesian_list: the 3-tuple ``(pairs_per_document, yi_length,
            nyi_length)`` in the layout returned by ``make_cartesian_list``.

    Returns:
        The accumulated error, or ``0`` when no document contributes a pair.
    """
    pairs_per_document = cartesian_list[0]
    yi_length = cartesian_list[1]
    nyi_length = cartesian_list[2]

    final_global_error = 0
    for docu, n_first, n_second in zip(pairs_per_document, yi_length, nyi_length):
        pair_count = n_first * n_second
        if pair_count == 0:
            # No pairs for this document (one of the two groups is empty):
            # it contributes nothing, and dividing would raise.
            continue

        # Start from zero for every document. Previously the running value
        # carried over, so each document's normalised error was folded into
        # the next document's sum and rescaled again.
        document_error = 0
        for instance in docu:
            document_error += bp_mll_exp_function(instance)

        # 1.0 keeps this a true division on Python 2 as well.
        final_global_error += document_error * (1.0 / pair_count)
    return final_global_error

In [None]:
class RankResults:
    """Accumulates three parallel lists of per-sample ranking results."""

    def __init__(self):
        # One entry is appended to each list per call to add().
        self.predictedLabels, self.topRankedLabels, self.outputs = [], [], []

    def add(self, predict_set, top_label, output):
        """Append one sample's predicted set, top label and output."""
        for bucket, item in (
            (self.predictedLabels, predict_set),
            (self.topRankedLabels, top_label),
            (self.outputs, output),
        ):
            bucket.append(item)

In [None]:
# Layer sizes are taken from the data instead of hard-coded literals, so the
# graph always matches the arrays that are fed into it.
n_features = int(X_train.shape[1])
n_labels = int(y_train.shape[1])
n_hidden = 100

X = tf.placeholder("float", [None, n_features])
W = tf.Variable(tf.random_uniform([n_features, n_hidden], -0.5, 0.5))
W2 = tf.Variable(tf.random_uniform([n_hidden, n_labels], -0.5, 0.5))
B1 = tf.Variable(tf.zeros([n_hidden]))
B2 = tf.Variable(tf.zeros([n_labels]))
# Construct model: one tanh hidden layer followed by a tanh output layer.
First_hidden = tf.tanh(tf.matmul(X, W) + B1)
print("1")
hypothesis = tf.tanh((tf.matmul(First_hidden, W2) + B2))
print("2")
# The cost is assembled row by row from y_train, so the graph is tied to the
# rows of the training split.
y_data_index = make_index(y_train)
print("3")
x_data_index_value = select_index_data_of_output(hypothesis, y_data_index)
print("4")
cartesian_list = make_cartesian_list(x_data_index_value)
print("5")
cost = get_error(cartesian_list)

# Plain float instead of a tf.Variable: the rate is a constant, and keeping it
# out of the variable collection avoids reusing the name `a`, which holds the
# final predictions below.
learning_rate = 0.2
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train = optimizer.minimize(cost)

# tf.initialize_all_variables is deprecated in favour of this initializer.
init = tf.global_variables_initializer()

# Feed X_train: the cost was built from y_train, so the fed rows must be the
# matching training features. (The previously used name `train_data_trans` is
# not defined anywhere in this notebook and raised NameError on a fresh kernel.)
train_feed = {X: X_train}

with tf.Session() as sess:
    sess.run(init)
    for step in range(500):
        sess.run(train, feed_dict=train_feed)
        print(sess.run(hypothesis, feed_dict=train_feed))
        print(sess.run(cost, feed_dict=train_feed))
    # Final network outputs for the training rows.
    a = sess.run(hypothesis, feed_dict=train_feed)