In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

In [2]:
# Install ktrain into the environment of the running kernel.
# %pip (rather than !pip) guarantees the package lands in the interpreter this notebook uses.
# TODO: pin the exact ktrain version used for the recorded results (e.g. ktrain==X.Y.Z).
%pip install ktrain



In [3]:
import ktrain
from ktrain import text

# Importing dataset

In [5]:
# Single place to change when the dataset moves
# (presumably the Colab Google Drive mount -- confirm before running elsewhere).
DATA_DIR = '/content/drive/MyDrive/dataset_patent/text_target'

# One CSV per class. index_col=False stops pandas from using the first CSV column as the index.
pos_text_target = pd.read_csv(f'{DATA_DIR}/pos_text_target.csv', index_col=False)
neg_text_target = pd.read_csv(f'{DATA_DIR}/neg_text_target.csv', index_col=False)
neut_text_target = pd.read_csv(f'{DATA_DIR}/neut_text_target.csv', index_col=False)

In [6]:
# Checking the shape of each dataframe before preprocessing
for raw_frame in (pos_text_target, neg_text_target, neut_text_target):
    print(raw_frame.shape)

(65967, 2)
(89105, 2)
(63055, 2)


In [7]:
# For each frame, report whether at least one cell is null
for class_frame in (pos_text_target, neg_text_target, neut_text_target):
    print(class_frame.isna().values.any())

True
True
True


In [8]:
# Drop every row that has a null in any column; each frame is modified in place
for class_frame in (pos_text_target, neg_text_target, neut_text_target):
    class_frame.dropna(axis=0, inplace=True)

In [9]:
# Shapes again, now that rows containing nulls are gone
for cleaned_frame in (pos_text_target, neg_text_target, neut_text_target):
    print(cleaned_frame.shape)

(65933, 2)
(89030, 2)
(62958, 2)


## Looking into neutral dataset

In [14]:
# Rows whose text already occurred earlier in the frame (the first occurrence is not flagged)
neut_is_repeat = neut_text_target.duplicated(subset='text')
neut_text_target.loc[neut_is_repeat]

Unnamed: 0,text,target
239,In order to solve the above-described problems...,0
755,The present inventors have conducted diligent ...,0
819,A transmission device of the present invention...,0
877,A fan according to an aspect of the present in...,0
1206,In the case with a grommet with its cylindrica...,0
...,...,...
63037,According to a first aspect of the present inv...,0
63040,"In a first aspect, a radio terminal that is us...",0
63041,Embodiments of the present disclosure provide ...,0
63044,In order to solve the above-described problems...,0


In [15]:
# Full text of one flagged duplicate (index label 239)
neut_text_target.loc[239, 'text']

"In order to solve the above-described problems, a method of producing a composite reinforcing material of the present invention comprises a step of kneading at least a graphite-based carbon material and a reinforcing material into a base material,', 'the graphite-based carbon material having a rhombohedral graphite layer (3R) and a hexagonal graphite layer (2H), wherein a Rate (3R) of the rhombohedral graphite layer (3R) and the hexagonal graphite layer (2H), based on an X-ray diffraction method, which is defined by following Equation 1 is 31% or more:"

In [16]:
# Show every row whose text equals the example duplicate at index label 239.
# The text is read from the frame instead of being pasted in as a long literal,
# so the comparison cannot drift from the data it is meant to match.
# NOTE: must run before the de-duplication cell, which removes label 239.
neut_example_text = neut_text_target.loc[239, 'text']
neut_text_target.loc[neut_text_target['text'] == neut_example_text]

Unnamed: 0,text,target
238,In order to solve the above-described problems...,0
239,In order to solve the above-described problems...,0
59911,In order to solve the above-described problems...,0


In [17]:
# Remove repeated texts, keeping the first occurrence of each.
# subset='text' matches the duplicated('text') inspection; with no subset a row is
# dropped only when *every* column matches, so a repeated text carrying a different
# target value would be kept.
neut_text_target = neut_text_target.drop_duplicates(subset='text', keep='first')
print(neut_text_target.shape)

(58043, 2)


In [22]:
# Randomly choose 50K rows (fixed seed) for further processing
neut_text_target_rs = neut_text_target.sample(
    random_state=1,
    n=50000,
)

## Looking into positive dataset

In [10]:
# Check whether any text occurs more than once: list every repeat after the first occurrence
pos_is_repeat = pos_text_target.duplicated(subset='text')
pos_text_target.loc[pos_is_repeat]

Unnamed: 0,text,target
180,The heat-absorbing glass plate of the present ...,1
229,The present invention can provide a surface-en...,1
406,The present invention can process data accordi...,1
464,The present invention can provide a surface-en...,1
543,"According to the above, mechanical characteris...",1
...,...,...
65954,An information communication method disclosed ...,1
65956,An embodiment of the present invention provide...,1
65958,According to a method of deriving a merge cand...,1
65959,According to a method of deriving a merge cand...,1


In [11]:
# Full text of one flagged duplicate (index label 180)
pos_text_target.loc[180, 'text']

'The heat-absorbing glass plate of the present invention satisfies both low solar transmittance and high visible light transmittance. Particularly, according to the present invention, it is possible to obtain a heat-absorbing glass having a ratio Tv/Te of the visible light transmittance higher than conventional glass, relative to the amount of total iron t-Fe2O3 as calculated as Fe2O3 as represented by mass % based on oxides.'

In [12]:
# Find every row that carries the same text as the example duplicate at index label 180.
# The text is read from the frame instead of being pasted in as a long literal,
# so the comparison cannot drift from the data it is meant to match.
# NOTE: must run before the de-duplication cell, which removes label 180.
pos_example_text = pos_text_target.loc[180, 'text']
pos_text_target.loc[pos_text_target['text'] == pos_example_text]

Unnamed: 0,text,target
179,The heat-absorbing glass plate of the present ...,1
180,The heat-absorbing glass plate of the present ...,1


In [13]:
# Remove repeated texts (first occurrence kept) and print the resulting shape.
# subset='text' matches the duplicated('text') inspection; with no subset a row is
# dropped only when *every* column matches, so a repeated text carrying a different
# target value would be kept.
pos_text_target = pos_text_target.drop_duplicates(subset='text', keep='first')
print(pos_text_target.shape)

(59106, 2)


In [23]:
# Randomly choose 50K rows (fixed seed) for further processing
pos_text_target_rs = pos_text_target.sample(
    random_state=1,
    n=50000,
)

## Looking into negative dataset 

In [18]:
# Check whether any text occurs more than once: list every repeat after the first occurrence
neg_is_repeat = neg_text_target.duplicated(subset='text')
neg_text_target.loc[neg_is_repeat]

Unnamed: 0,text,target
310,"However, the methods disclosed in Patent Liter...",-1
467,An aspect of the present invention provides a ...,-1
469,An object of the present invention is to provi...,-1
647,An object of the present invention is to provi...,-1
725,It is therefore an object of the present inven...,-1
...,...,...
89086,An object of the present invention is to provi...,-1
89090,The present invention has been made in effort ...,-1
89091,"However, according to the technique described ...",-1
89095,"In the LAA, when an unallocated frequency band...",-1


In [19]:
# Full text of one flagged duplicate (index label 467)
neg_text_target.loc[467, 'text']

"An aspect of the present invention provides a method and apparatus for encoding an image, which can improve image compression efficiency.', 'Another aspect of the present invention provides a method and apparatus for decoding an image, which can improve image compression efficiency.', 'Still another aspect of the present invention provides a method and apparatus for inter prediction, which can improve image compression efficiency.', 'Still yet another aspect of the present invention provides a method and apparatus for deriving a temporal motion vector predictor, which can improve image compression efficiency."

In [20]:
# Find every row that carries the same text as the example duplicate at index label 467.
# The text is read from the frame instead of being pasted in as a long literal,
# so the comparison cannot drift from the data it is meant to match.
# NOTE: must run before the de-duplication cell, which removes label 467.
neg_example_text = neg_text_target.loc[467, 'text']
neg_text_target.loc[neg_text_target['text'] == neg_example_text]

Unnamed: 0,text,target
228,An aspect of the present invention provides a ...,-1
467,An aspect of the present invention provides a ...,-1
88758,An aspect of the present invention provides a ...,-1


In [21]:
# Remove repeated texts, keeping the first occurrence of each.
# subset='text' matches the duplicated('text') inspection; with no subset a row is
# dropped only when *every* column matches, so a repeated text carrying a different
# target value would be kept.
neg_text_target = neg_text_target.drop_duplicates(subset='text', keep='first')
print(neg_text_target.shape)

(79531, 2)


In [24]:
# Randomly choose 50K rows (fixed seed) for further processing
neg_text_target_rs = neg_text_target.sample(
    random_state=1,
    n=50000,
)

In [30]:
# Later steps need class labels 0, 1, 2, so the negative class (stored as -1 in the
# source file) is re-coded as 2: 0 = neutral, 1 = positive, 2 = negative.
neg_text_target_rs = neg_text_target_rs.assign(target=2)

In [31]:
# Stack the three 50K samples row-wise: neutral first, then positive, then negative.
# NOTE(review): de-duplication was done per class only; a text present in two class
# files would survive here with conflicting labels -- worth checking.
frames = [
    neut_text_target_rs,
    pos_text_target_rs,
    neg_text_target_rs,
]
combined = pd.concat(frames, axis=0)

In [32]:
# Sanity check: overall size, then a peek at the first five rows
print(combined.shape)
combined.head(n=5)

(150000, 2)


Unnamed: 0,text,target
517,"In one aspect of the present invention, a wet ...",0
10380,One of aspects of the present disclosure resid...,0
33632,"In order to solve the foregoing problem, a mot...",0
46778,In order to solve the conventional problems de...,0
54133,As a result of having conducted diligent resea...,0


In [33]:
# Shuffle the rows with a fixed seed so the classes are interleaved, then renumber
# the index from 0 (the old index labels are discarded).
combined = shuffle(combined, random_state=0).reset_index(drop=True)
combined.head()

Unnamed: 0,text,target
0,An image forming apparatus of the present inve...,0
1,"However, in the conventional aerial vehicle de...",2
2,The first aspect of a method for recovering a ...,1
3,"First Aspect of Invention', 'The present inven...",0
4,"As described above, according to the cap, the ...",1


In [34]:
# Print the text-classifier model names ktrain offers; one of them ('nbsvm') is
# passed to text.text_classifier further down.
text.print_text_classifiers()

fasttext: a fastText-like model [http://arxiv.org/pdf/1607.01759.pdf]
logreg: logistic regression using a trainable Embedding layer
nbsvm: NBSVM model [http://www.aclweb.org/anthology/P12-2018]
bigru: Bidirectional GRU with pretrained fasttext word vectors [https://fasttext.cc/docs/en/crawl-vectors.html]
standard_gru: simple 2-layer GRU with randomly initialized embeddings
bert: Bidirectional Encoder Representations from Transformers (BERT) from keras_bert [https://arxiv.org/abs/1810.04805]
distilbert: distilled, smaller, and faster BERT from Hugging Face transformers [https://arxiv.org/abs/1910.01108]


In [35]:
# Create an 80/20 train/validation split and preprocess the texts for the model.
# Per the printed log, x_train comes out as (120000, 1000): one row of length
# maxlen per document, built after 2-gram features are added.
# random_state pins the otherwise random split so that re-runs use the same
# train/validation rows and their results are comparable.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(
    combined,
    'text',  # name of column containing text
    label_columns=['target'],
    maxlen=1000,
    max_features=100000,
    preprocess_mode='standard',
    val_pct=0.2,
    ngram_range=2,
    random_state=0,
)

['target_0', 'target_1', 'target_2']
        target_0  target_1  target_2
138368       0.0       0.0       1.0
83558        0.0       0.0       1.0
92925        1.0       0.0       0.0
117438       1.0       0.0       0.0
58354        1.0       0.0       0.0
['target_0', 'target_1', 'target_2']
        target_0  target_1  target_2
45523        0.0       1.0       0.0
113023       1.0       0.0       0.0
75415        1.0       0.0       0.0
86907        0.0       1.0       0.0
107527       0.0       0.0       1.0
language: en
Word Counts: 82412
Nrows: 120000
120000 train sequences
train sequence lengths:
	mean : 306
	95percentile : 1179
	99percentile : 2023
Adding 2-gram features
max_features changed to 1990425 with addition of ngrams
Average train sequence length with ngrams: 611
train (w/ngrams) sequence lengths:
	mean : 612
	95percentile : 2357
	99percentile : 4045
x_train shape: (120000,1000)
y_train shape: (120000, 3)
Is Multi-Label? False
30000 test sequences
test sequence lengths

In [36]:
# Build ktrain's 'nbsvm' text classifier and wrap it in a Learner for training
train_data = (x_train, y_train)
val_data = (x_test, y_test)  # the 20% hold-out is used as validation data

model = text.text_classifier('nbsvm', train_data, preproc=preproc)
learner = ktrain.get_learner(
    model,
    train_data=train_data,
    val_data=val_data,
    batch_size=32,
)

print('-----------------------------------------------------------------------------------------')
print('Starting training and validation')
# One-cycle policy: maximum learning rate 5e-5, 10 epochs.
# NOTE(review): 5e-5 is very small for this model type -- confirm it was chosen
# deliberately (e.g. from a learning-rate range test).
learner.fit_onecycle(5e-5, 10)

Is Multi-Label? False
compiling word ID features...
maxlen is 1000
building document-term matrix... this may take a few moments...
rows: 1-10000
rows: 10001-20000
rows: 20001-30000
rows: 30001-40000
rows: 40001-50000
rows: 50001-60000
rows: 60001-70000
rows: 70001-80000
rows: 80001-90000
rows: 90001-100000
rows: 100001-110000
rows: 110001-120000
computing log-count ratios...
done.
-----------------------------------------------------------------------------------------
Starting training and validation


begin training using onecycle policy with max lr of 5e-05...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd376a47a50>