# Code to reproduce the findings in the paper:
S.Sarica & J.Luo. Stopwords in Technical Language Processing

All the files can be found in the following Dropbox folder:
https://www.dropbox.com/sh/hsuum451kyhp2km/AAD49aUd3ut_xICj0WRoG2rIa?dl=0

#### !!! These files should be copied into the ./data folder. !!!

In [None]:
import string
import csv
import pickle
import random

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk import word_tokenize

from sklearn import metrics

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

In [None]:
# Directory that holds the downloaded data files. File names are appended to
# it by plain string concatenation, so the trailing slash is required.
data_folder = "./data/"

In [None]:
# Punctuation characters to strip: string.punctuation without '-' and '/'.
# Those two are kept, presumably because they occur inside compound
# technical terms -- TODO confirm.
punct = ''.join(ch for ch in string.punctuation if ch not in ('-', '/'))
# Translation table for str.translate that deletes every character in `punct`.
translator = str.maketrans('', '', punct)

In [None]:
# NLTK and USPTO Stopwords
nltk_stops = set(stopwords.words('english'))

# newline='' is what the csv module recommends for files handed to csv.reader.
with open(data_folder + 'USPTO_stopwords.csv', newline='') as f:
    # The stopword is taken from the first column of each row; blank rows are
    # skipped so that they cannot raise IndexError.
    USPTO_stops = [row[0] for row in csv.reader(f) if row]

# Combined general-purpose stopword set (NLTK English + USPTO list).
set_stops = nltk_stops.union(USPTO_stops)

## token stats w/o nltk+USPTO stops

In [None]:
def entropy(key, vocab):
    """Return the entropy of a token's occurrences across documents.

    ``vocab[key]['docs']`` is read as a list of per-document occurrence
    counts. The counts are normalised to a probability distribution and
    ``-sum(p * ln(p))`` (natural logarithm) is returned.

    Zero counts contribute nothing (the usual ``0 * log(0) = 0`` convention)
    instead of producing NaN, and an empty or all-zero list yields 0.

    Raises:
        KeyError: if ``key`` is not in ``vocab`` or has no ``'docs'`` entry.
    """
    doc_counts = vocab[key]['docs']
    total = sum(doc_counts)
    weighted_log_sum = 0
    for n in doc_counts:
        if n == 0:
            # Skipping zeros also avoids dividing by a zero total.
            continue
        p = n / total
        weighted_log_sum += p * np.log(p)
    return -weighted_log_sum

In [None]:
# Counter and groupby are needed by this cell and are not among the
# notebook's top-level imports.
from collections import Counter
from itertools import groupby

# vocab maps token -> {'docs':   occurrence count in each patent containing it,
#                      'count':  total number of occurrences,
#                      'tf_doc': relative frequency in each patent containing it}
vocab = {}
# number of patents encountered in the corpus
patent_num = 0
with open(data_folder + 'patents_titles_abstracts_line_sentence_preprocessed.txt',
          encoding = 'utf-8') as f1:
    with open(data_folder + 'line_sentence_patentnumbers.txt',
              encoding = 'utf-8') as f2:
        # The two files are read in lockstep: line k of f2 is the patent number
        # of the sentence on line k of f1. zip stops at the shorter file.
        line_pairs = zip(f1, f2)
        # Consecutive lines with the same patent number form one patent.
        # groupby also yields the final group, so the last patent in the file
        # is counted like every other one.
        for pat_no, patent_lines in groupby(line_pairs,
                                            key=lambda pair: pair[1].rstrip('\n')):
            patent_num += 1
            temp = []
            for sentence, _ in patent_lines:
                temp += word_tokenize(sentence.rstrip('\n'))
            # `punct` is a string, so this drops every token that occurs as a
            # substring of it (e.g. single punctuation marks).
            temp = [x for x in temp if x not in punct]
            counts = Counter(temp)
            for key, n in counts.items():
                if key not in vocab:
                    vocab[key] = {'docs':[], 'count':0, 'tf_doc':[]}
                vocab[key]['docs'].append(n)
                vocab[key]['count'] += n
                vocab[key]['tf_doc'].append(n/len(temp))

In [None]:
# Add 'idf', 'tfidf' (modified tf-idf) and 'entropy' to every vocabulary entry
# that has not been scored yet; `count` ends up as the number of entries visited.
count = 0
for key, stats in vocab.items():
    count += 1
    if stats.get('tfidf', -1) != -1:
        continue  # already scored
    tf_per_doc = stats['tf_doc']
    n_docs_with_term = len(tf_per_doc)
    stats['idf'] = np.log(patent_num / n_docs_with_term)
    # mean term frequency over the patents containing the token, multiplied
    # by patent_num / (number of patents containing the token)
    stats['tfidf'] = 1 / len(stats['docs']) * sum(tf_per_doc) * patent_num / n_docs_with_term
    stats['entropy'] = entropy(key, vocab)

In [None]:
# (token, entropy) pairs ordered from highest to lowest entropy.
# Tokens in set_stops and tokens with a total count of 1 are left out.
sorted_entropy_vocab = sorted(
    ((token, stats['entropy'])
     for token, stats in vocab.items()
     if token not in set_stops and stats['count'] > 1),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# (token, modified tf-idf) pairs ordered from lowest to highest score.
# Tokens in set_stops and tokens with a total count of 1 are left out.
sorted_tfidf_vocab = sorted(
    ((token, stats['tfidf'])
     for token, stats in vocab.items()
     if token not in set_stops and stats['count'] > 1),
    key=lambda pair: pair[1],
    reverse=False,
)

In [None]:
# (token, total count) pairs ordered from most to least frequent.
# Tokens in set_stops and tokens with a total count of 1 are left out.
sorted_f_vocab = sorted(
    ((token, stats['count'])
     for token, stats in vocab.items()
     if token not in set_stops and stats['count'] > 1),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# (token, idf) pairs ordered from lowest to highest idf.
# Tokens in set_stops and tokens with a total count of 1 are left out.
sorted_idf_vocab = sorted(
    ((token, stats['idf'])
     for token, stats in vocab.items()
     if token not in set_stops and stats['count'] > 1),
    key=lambda pair: pair[1],
    reverse=False,
)

### technical stopwords list

In [None]:
# Read the technical stopword list, one entry per line. Surrounding whitespace
# is stripped and inner spaces become underscores.
with open(data_folder + 'technical_stopwords.txt') as f:
    tech_stops = {line.strip().replace(' ', '_') for line in f}

## TEXT CLASSIFICATION w/ LSTM model

In [None]:
#randomly selected patents from three different CPC subgroups for each CPC section
# The loaded object is later indexed by CPC code (random_patents[cpc]).
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open(data_folder + 'random_patents_topic_modelling.pkl', 'rb') as f:
    random_patents = pickle.load(f)

In [None]:
#selected CPC subgroups (one per CPC section A-H)
select_cpc = [
    "A01K", "B01D", "C06B", "D21F",
    "E01H", "F02B", "G06F", "H04B",
]
#labels: 100 consecutive entries for each of the 8 class ids 0..7
labs = [label for label in range(8) for _ in range(100)]

In [None]:
# All patents of the selected CPC codes, merged into one sorted list.
patents = sorted(pat for cpc in select_cpc for pat in random_patents[cpc])

In [None]:
#reading patent texts from preprocessed file

# patent number -> text, initially empty for every selected patent
pats__ = {num:"" for num in patents}
with open(data_folder + 'patents_titles_abstracts_line_sentence_preprocessed.txt',
          'r', encoding = 'utf-8') as f1:
    with open(data_folder + 'line_sentence_patentnumbers.txt', 'r',
              encoding = 'utf-8') as f2:
        # The files are read in lockstep (line k of f2 is the patent number of
        # line k of f1); zip stops at the end of the shorter file. Read or
        # decode errors now propagate instead of silently ending the loop.
        for temp, temp_num in zip(f1, f2):
            temp = temp.strip()
            temp_num = temp_num.strip()
            if not temp_num.isdigit():
                continue
            temp_num = int(temp_num)
            # NOTE(review): because of the == '' test, only the first line
            # found for each patent is stored and later lines of the same
            # patent are ignored -- confirm that this is intended.
            if pats__.get(temp_num) == '':
                pats__[temp_num] += temp + ' '
# drop the selected patents for which no text was found
pats__ = {key:value for key, value in pats__.items() if value}

#remove the patents which do not contain any technical stopword
#we need this to measure the effectiveness of filtering stopwords
#for topic modelling tasks

# Each text is tokenized once and checked for overlap with the stopword set.
to_pop = [key for key, value in pats__.items()
          if tech_stops.isdisjoint(word_tokenize(value))]

for key in to_pop:
    pats__.pop(key)


#creating the final patents list to be randomly selected from
# one list per entry of select_cpc, holding its patents that still have a text
pats_secs = [[pat for pat in random_patents[cpc] if pats__.get(pat)]
             for cpc in select_cpc]

In [None]:
# Concatenate the per-class patent lists in class order ...
pats_secs_flatten = []
for section_patents in pats_secs:
    pats_secs_flatten.extend(section_patents)
# ... and look up the text of every patent in that same order.
texts = [pats__[pat] for pat in pats_secs_flatten]

In [None]:
#preprocessing text for second and third models (removing stopwords from the text)

# Tokenize every text once and reuse the tokens for both filtered variants.
tokenized_texts = [word_tokenize(y) for y in texts]
# Both stopword collections are sets. Sets have no `+` operator, so the
# combined set is built with `|`, once, outside the loops.
all_stops = set_stops | tech_stops

# texts without NLTK+USPTO stopwords
texts_1 = [" ".join([x for x in tokens if x not in set_stops]) for tokens in tokenized_texts]
# texts without NLTK+USPTO+technical stopwords
texts_2 = [" ".join([x for x in tokens if x not in all_stops]) for tokens in tokenized_texts]

In [None]:
# Vocabulary size passed to the Keras Tokenizer and the Embedding layers.
MAX_NB_WORDS = 50_000
# Length every token sequence is padded or truncated to.
MAX_SEQUENCE_LENGTH = 500
# Output dimension of the Embedding layers.
EMBEDDING_DIM = 300

In [None]:
#defining tokenizer
# The tokenizer filters the characters in `punct`, lower-cases the text and is
# fitted on the raw (unfiltered) texts only. The same fitted tokenizer is later
# used to encode the stopword-filtered texts as well.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=punct, lower=True)
tokenizer.fit_on_texts(texts)
# word -> integer index mapping learned from `texts`
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Raw texts -> integer sequences, padded/truncated to MAX_SEQUENCE_LENGTH.
X = pad_sequences(tokenizer.texts_to_sequences(texts),
                  maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

In [None]:
# Texts without NLTK+USPTO stopwords -> integer sequences,
# padded/truncated to MAX_SEQUENCE_LENGTH.
X1 = pad_sequences(tokenizer.texts_to_sequences(texts_1),
                   maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X1.shape)

In [None]:
# Texts without NLTK+USPTO+technical stopwords -> integer sequences,
# padded/truncated to MAX_SEQUENCE_LENGTH.
X2 = pad_sequences(tokenizer.texts_to_sequences(texts_2),
                   maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X2.shape)

In [None]:
#true labels
# One one-hot row of length 8 per patent, following the class order of
# pats_secs: the row for a patent of class k has its 1 in column k.
Y = np.array([
    [int(col == sec_idx) for col in range(8)]
    for sec_idx, members in enumerate(pats_secs)
    for _ in members
])

In [None]:
#preparing train and test samples
#100 randomly selected patents from each CPC class go to the test set,
#all remaining patents of that class go to the training set

# Number of patents per class, in the class order used to build X, X1, X2
# and Y (all of them follow pats_secs).
counts = [len(section) for section in pats_secs]

test_indices = []
train_indices = []
sum_ = 0  # row offset of the first patent of the current class
for x in counts:
    class_rows = range(sum_, sum_ + x)
    # random.sample raises ValueError if the class has fewer than 100 patents.
    class_test = random.sample(class_rows, 100)
    held_out = set(class_test)
    test_indices += class_test
    # Training rows are exactly the rows not drawn for testing, so no test
    # sample can leak into the training data.
    train_indices += [i for i in class_rows if i not in held_out]
    sum_ += x

test_indices = np.array(test_indices, dtype=int)
train_indices = np.array(train_indices, dtype=int)

# The same row indices are applied to the labels and to all three encodings,
# so the three models are trained and tested on the same patents.
Y_test, Y_train = Y[test_indices], Y[train_indices]
X_test, X_train = X[test_indices], X[train_indices]
X1_test, X1_train = X1[test_indices], X1[train_indices]
X2_test, X2_train = X2[test_indices], X2[train_indices]

In [None]:
#Model for the train set with raw texts
# Embedding -> SpatialDropout1D -> LSTM(100) -> 8-way softmax classifier.
model = Sequential([
    # X.shape[1] is the padded sequence length
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(8, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line).
model.summary()

In [None]:
#this model can be loaded using following line instead of training
# Loading replaces the freshly built `model`. If the training cell is run
# afterwards, it continues training the loaded model.
model = keras.models.load_model(data_folder+'model0_lstm.h5')

In [None]:
# training configuration
epochs, batch_size = 5, 64

# Fit on the raw-text sequences with a validation split of 0.1; the
# EarlyStopping callback monitors val_loss with a patience of 3 epochs.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
)

Epoch 1/5
541/541 [==============================] - 946s 2s/step - loss: 1.3021 - accuracy: 0.5392 - val_loss: 0.8420 - val_accuracy: 0.6261

Epoch 2/5
541/541 [==============================] - 975s 2s/step - loss: 0.5993 - accuracy: 0.8224 - val_loss: 0.6938 - val_accuracy: 0.7982

Epoch 3/5
541/541 [==============================] - 854s 2s/step - loss: 0.4694 - accuracy: 0.8681 - val_loss: 0.5783 - val_accuracy: 0.8323

Epoch 4/5
541/541 [==============================] - 946s 2s/step - loss: 0.3172 - accuracy: 0.9107 - val_loss: 0.5560 - val_accuracy: 0.8521

Epoch 5/5
541/541 [==============================] - 920s 2s/step - loss: 0.4282 - accuracy: 0.8794 - val_loss: 0.7425 - val_accuracy: 0.7805

In [None]:
# Class ids: argmax over each predicted score row and over each one-hot label row.
Y_pred = list(map(np.argmax, model.predict(X_test)))
y_test = list(map(np.argmax, Y_test))
print(metrics.classification_report(y_test, Y_pred, digits = 3))

              precision    recall  f1-score   support

           0      0.779     0.950     0.856       100
           1      0.600     0.990     0.747       100
           2      1.000     0.800     0.889       100
           3      0.962     0.750     0.843       100
           4      0.985     0.650     0.783       100
           5      0.952     0.790     0.863       100
           6      0.852     0.980     0.912       100
           7      0.967     0.880     0.921       100
    accuracy                          0.849       800
    macro avg     0.887     0.849     0.852       800
    weighted avg  0.887     0.849     0.852       800

In [None]:
#model for the train set without NLTK+USPTO stopwords
# Embedding -> SpatialDropout1D -> LSTM(100) -> 8-way softmax classifier.
model1 = Sequential([
    # X1.shape[1] is the padded sequence length
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X1.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(8, activation='softmax'),
])
model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line).
model1.summary()

In [None]:
#this model can be loaded using following line instead of training
# Loading replaces the freshly built `model1`. If the training cell is run
# afterwards, it continues training the loaded model.
model1 = keras.models.load_model(data_folder+'model1_lstm.h5')

In [None]:
# training configuration
epochs, batch_size = 5, 64

# Fit on the sequences without NLTK+USPTO stopwords with a validation split
# of 0.1; the EarlyStopping callback monitors val_loss with a patience of 3.
history1 = model1.fit(
    X1_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
)

Epoch 1/5
541/541 [==============================] - 1088s 2s/step - loss: 1.2982 - accuracy: 0.5717 - val_loss: 0.6061 - val_accuracy: 0.8230

Epoch 2/5
541/541 [==============================] - 1101s 2s/step - loss: 0.4025 - accuracy: 0.8811 - val_loss: 0.5473 - val_accuracy: 0.8672

Epoch 3/5
541/541 [==============================] - 1017s 2s/step - loss: 0.2330 - accuracy: 0.9353 - val_loss: 0.3290 - val_accuracy: 0.9180

Epoch 4/5
541/541 [==============================] - 852s 2s/step - loss: 0.1376 - accuracy: 0.9652 - val_loss: 0.5836 - val_accuracy: 0.8464

Epoch 5/5
541/541 [==============================] - 837s 2s/step - loss: 0.0995 - accuracy: 0.9749 - val_loss: 0.7046 - val_accuracy: 0.8391

In [None]:
# Predicted class id per test sample: argmax over each score row of model1.
Y1_pred = list(map(np.argmax, model1.predict(X1_test)))

In [None]:
# Per-class precision/recall/F1 of model1 against the true class ids in
# y_test (computed in the evaluation cell of the first model).
print(metrics.classification_report(y_test, Y1_pred, digits = 3))

              precision    recall  f1-score   support

           0      0.925     0.980     0.951       100
           1      0.908     0.990     0.947       100
           2      1.000     0.880     0.936       100
           3      1.000     0.920     0.958       100
           4      0.971     1.000     0.985       100
           5      0.952     0.990     0.971       100
           6      0.935     1.000     0.966       100
           7      1.000     0.910     0.953       100
    accuracy                          0.959       800
    macro avg     0.961     0.959     0.959       800
    weighted avg  0.961     0.959     0.959       800

In [None]:
#model for the train set without NLTK+USPTO+technical stopwords
# Embedding -> SpatialDropout1D -> LSTM(100) -> 8-way softmax classifier.
model2 = Sequential([
    # X2.shape[1] is the padded sequence length
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X2.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(8, activation='softmax'),
])
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line).
model2.summary()

In [None]:
#this model can be loaded using following line instead of training
# Loading replaces the freshly built `model2`. If the training cell is run
# afterwards, it continues training the loaded model.
model2 = keras.models.load_model(data_folder+'model2_lstm.h5')

In [None]:
# training configuration
epochs, batch_size = 5, 64

# Fit on the sequences without NLTK+USPTO+technical stopwords with a
# validation split of 0.1; the EarlyStopping callback monitors val_loss with
# a patience of 3.
history2 = model2.fit(
    X2_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
)

Epoch 1/5
541/541 [==============================] - 841s 2s/step - loss: 1.0325 - accuracy: 0.6675 - val_loss: 0.3970 - val_accuracy: 0.9190

Epoch 2/5
541/541 [==============================] - 1037s 2s/step - loss: 0.3455 - accuracy: 0.8992 - val_loss: 0.6197 - val_accuracy: 0.8019

Epoch 3/5
541/541 [==============================] - 1451s 3s/step - loss: 0.1996 - accuracy: 0.9468 - val_loss: 0.4184 - val_accuracy: 0.9029

Epoch 4/5
541/541 [==============================] - 1049s 2s/step - loss: 0.1178 - accuracy: 0.9706 - val_loss: 0.3750 - val_accuracy: 0.9013

Epoch 5/5
541/541 [==============================] - 892s 2s/step - loss: 0.1051 - accuracy: 0.9735 - val_loss: 0.4060 - val_accuracy: 0.9063

In [None]:
# Predicted class id per test sample (argmax over each score row of model2),
# reported against the true class ids in y_test.
Y2_pred = list(map(np.argmax, model2.predict(X2_test)))
print(metrics.classification_report(y_test, Y2_pred, digits = 3))

              precision    recall  f1-score   support

           0      0.990     0.960     0.975       100
           1      0.917     0.990     0.952       100
           2      1.000     0.960     0.980       100
           3      0.989     0.900     0.942       100
           4      0.961     0.980     0.970       100
           5      0.952     0.990     0.971       100
           6      0.962     1.000     0.980       100
           7      1.000     0.980     0.990       100

    accuracy                          0.970       800
    macro avg     0.971     0.970     0.970       800
    weighted avg  0.971     0.970     0.970       800