# Code to reproduce the findings in the paper:
S. Sarica & J. Luo, "Stopwords in Technical Language Processing"

All the files can be found in the following Dropbox folder:
https://www.dropbox.com/sh/hsuum451kyhp2km/AAD49aUd3ut_xICj0WRoG2rIa?dl=0

In [None]:
import csv
import pickle
import random
import string
from collections import Counter

import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import normalized_mutual_info_score as nmi

In [2]:
# Punctuation characters treated as noise: every character of
# string.punctuation except '-' and '/' (presumably kept so that hyphenated
# and slash-joined terms survive -- confirm against the preprocessing step).
punct = ''.join(ch for ch in string.punctuation if ch not in '-/')
# Translation table for str.translate() that deletes every character in punct.
translator = str.maketrans('', '', punct)

In [None]:
# NLTK and USPTO Stopwords
nltk_stops = set(stopwords.words('english'))

# The first column of every non-empty CSV row is taken as one stopword.
# csv.reader objects are plain iterators (they have no readlines() method),
# and the csv documentation asks for newline='' on the underlying file.
with open('./data/USPTO_stopwords.csv', newline='') as f:
    USPTO_stops = [row[0] for row in csv.reader(f) if row]

# Union of both lists; used below to exclude known stopwords.
set_stops = nltk_stops.union(USPTO_stops)

## token stats w/o nltk+USPTO stops

In [215]:
def entropy(key, vocab):
    """Return the Shannon entropy (natural log) of a token's per-document counts.

    Args:
        key: Token to look up in ``vocab``.
        vocab: Mapping of token -> stats dict. Only ``vocab[key]['docs']``
            (a sequence of counts, one per document) is read.

    Returns:
        ``-sum(p * ln(p))`` as a float, where ``p`` is each entry's share of
        the summed counts. ``0.0`` when there is no positive count.
    """
    counts = np.asarray(vocab[key]['docs'], dtype=float)
    # By convention 0 * ln(0) = 0; dropping zero counts avoids nan results.
    counts = counts[counts > 0]
    if counts.size == 0:
        return 0.0
    shares = counts / counts.sum()
    # "+ 0.0" turns the -0.0 obtained for single-document tokens into 0.0.
    return float(-np.sum(shares * np.log(shares))) + 0.0

In [None]:
data_folder = './stopwords_data/'

# vocab maps token -> {'docs':   count of the token in each patent containing it,
#                      'count':  total count over all patents,
#                      'tf_doc': per-patent count / number of tokens in that patent}
vocab = {}
# Number of patent-number changes seen while reading, i.e. patents encountered.
patent_num = 0


def _add_patent_tokens(tokens):
    """Fold the tokens of one patent into the module-level ``vocab``."""
    # punct is a string, so this is a substring test (same as the original filter).
    tokens = [tok for tok in tokens if tok not in punct]
    for key, n in Counter(tokens).items():
        stats = vocab.setdefault(key, {'docs': [], 'count': 0, 'tf_doc': []})
        stats['docs'].append(n)
        stats['count'] += n
        stats['tf_doc'].append(n / len(tokens))


# The two files are read in lockstep: line k of the first file is a piece of
# text and line k of the second file is the patent number it belongs to.
with open(data_folder + 'patents_titles_abstracts_line_sentence_preprocessed.txt',
          encoding='utf-8') as f1, \
        open(data_folder + 'line_sentence_patentnumbers.txt',
             encoding='utf-8') as f2:
    pat_no = ''
    temp = []   # tokens collected for the patent currently being read
    count = 0   # number of line pairs processed
    # zip() ends at the shorter file, so no exception is needed to detect EOF
    # and genuine errors (decoding, tokenizer) are no longer swallowed.
    for text_line, number_line in zip(f1, f2):
        # Strip only the newline so a final line without one keeps its last char.
        temp1 = text_line.rstrip('\n')
        temp2 = number_line.rstrip('\n')
        if temp2 != pat_no:
            # A new patent starts: store the previous patent's tokens.
            patent_num += 1
            _add_patent_tokens(temp)
            pat_no = temp2
            temp = []
        temp += word_tokenize(temp1)
        count += 1
    # Store the last patent too; it has no following number change to trigger it.
    _add_patent_tokens(temp)

In [None]:
# Attach idf, modified tf-idf and entropy to every vocabulary entry.
# Entries that already hold a 'tfidf' value are left untouched.
count = 0
for key, stats in vocab.items():
    count += 1
    if stats.get('tfidf', -1) != -1:
        continue
    doc_freq = len(stats['tf_doc'])
    stats['idf'] = np.log(patent_num / doc_freq)
    # mean of the per-patent term frequencies, scaled by patent_num / doc_freq
    stats['tfidf'] = 1 / len(stats['docs']) * sum(stats['tf_doc']) * patent_num / doc_freq
    stats['entropy'] = entropy(key, vocab)

In [236]:
# (token, entropy) pairs, highest entropy first.
# Known stopwords and tokens occurring only once are left out.
sorted_entropy_vocab = [
    (token, stats['entropy'])
    for token, stats in vocab.items()
    if token not in set_stops and stats['count'] > 1
]
sorted_entropy_vocab.sort(key=lambda pair: pair[1], reverse=True)

In [238]:
# (token, modified tf-idf) pairs, lowest value first.
# Known stopwords and tokens occurring only once are left out.
sorted_tfidf_vocab = [
    (token, stats['tfidf'])
    for token, stats in vocab.items()
    if token not in set_stops and stats['count'] > 1
]
sorted_tfidf_vocab.sort(key=lambda pair: pair[1])

In [239]:
# (token, total count) pairs, most frequent first.
# Known stopwords and tokens occurring only once are left out.
sorted_f_vocab = [
    (token, stats['count'])
    for token, stats in vocab.items()
    if token not in set_stops and stats['count'] > 1
]
sorted_f_vocab.sort(key=lambda pair: pair[1], reverse=True)

In [240]:
# (token, idf) pairs, lowest idf first.
# Known stopwords and tokens occurring only once are left out.
sorted_idf_vocab = [
    (token, stats['idf'])
    for token, stats in vocab.items()
    if token not in set_stops and stats['count'] > 1
]
sorted_idf_vocab.sort(key=lambda pair: pair[1])

### technical stopwords list

In [None]:
# Technical stopword list: one entry per line of the file, with surrounding
# whitespace removed and duplicates collapsed into a set.
with open(data_folder + 'technical_stopwords.txt') as stop_file:
    tech_stops = {entry.strip() for entry in stop_file}

## TOPIC MODELLING

In [None]:
# Randomly selected patents from three different CPC subgroups for each CPC
# section, stored as a pickle and indexed below by CPC subgroup code.
# NOTE(review): unpickling can execute arbitrary code -- only load a trusted copy.
pkl_path = data_folder + 'random_patents_topic_modelling.pkl'
with open(pkl_path, 'rb') as pkl_file:
    random_patents = pickle.load(pkl_file)

In [None]:
# Selected CPC subgroups.
select_cpc = [
    "A01K", "B01D", "C06B", "D21F",
    "E01H", "F02B", "G06F", "H04B",
]
# Labels: 800 entries, a run of 100 for each value 0..7 in ascending order.
labs = [section for section in range(8) for _ in range(100)]

In [None]:
# Sorted list of every patent stored under the selected CPC subgroups.
patents = sorted(
    pat
    for subgroup in select_cpc
    for pat in random_patents[subgroup]
)

In [None]:
#reading patent texts from preprocessed file

# pats__ maps patent number -> text; it starts empty for every selected patent.
pats__ = {num: "" for num in patents}
# The two files are read in lockstep: line k of the first file is text and
# line k of the second file is the patent number that text belongs to.
with open(data_folder + 'patents_titles_abstracts_line_sentence_preprocessed.txt',
          'r', encoding='utf-8') as f1, \
        open(data_folder + 'line_sentence_patentnumbers.txt', 'r',
             encoding='utf-8') as f2:
    # zip() stops at the end of the shorter file; this replaces the bare
    # ``except:`` that previously ended the loop (and hid any real error).
    for temp, temp_num in zip(f1, f2):
        temp = temp.strip()
        temp_num = temp_num.strip()
        if not temp_num.isdigit():
            continue
        temp_num = int(temp_num)
        # NOTE(review): because of the == '' test only the FIRST line seen for
        # a patent is stored; later lines of the same patent are ignored.
        # Kept as in the original -- confirm this is intended.
        if pats__.get(temp_num) == '':
            pats__[temp_num] += temp + ' '
# Drop the selected patents for which no text was found.
pats__ = {key: value for key, value in pats__.items() if value}

#remove the patents which do not have any stopwords
#we need this to measure the effectiveness of filtering stopwords
#for topic modelling tasks
# Each text is tokenized once and tested against the whole technical
# stopword set, instead of being re-tokenized for every stopword.
pats__ = {key: value for key, value in pats__.items()
          if not tech_stops.isdisjoint(word_tokenize(value))}


#creating the final patents list to be randomly selected from
# One list per entry of select_cpc, holding that subgroup's patents that
# still have text in pats__.
pats_secs = [[pat for pat in random_patents[cpc] if pats__.get(pat)]
             for cpc in select_cpc]

### Topic modelling with NMF and LDA

In [None]:
nmis = []
nmis_ = []

#nmis saves the normalized mutual information measurements for NMF based topic modelling
#while nmis_ saves the same for LDA based topic modelling
# Each entry is a list of three scores, one per vocabulary:
#   [all tokens, without NLTK+USPTO stopwords, without NLTK+USPTO+technical stopwords]

# NOTE(review): scikit-learn deprecated NMF's ``alpha`` argument in 1.0 and
# removed it in 1.2 (replaced by alpha_W / alpha_H), so this call presumably
# needs an older release -- confirm the pinned version before changing it.
nmf_params = dict(n_components=8, random_state=1, alpha=.1,
                  l1_ratio=.5, init='nndsvd')
lda_params = dict(n_components=8, max_iter=200,
                  learning_method='online', learning_offset=50.,
                  random_state=0)


def _dominant_topics(model, matrix):
    """Return, for each row of ``matrix``, the index of its largest topic weight."""
    return model.transform(matrix).argmax(axis=1)


for i in range(1000):
    # Draw 100 patents from every section, in section order, so that the
    # sample lines up with the labels in ``labs``.
    temp_pats = []
    for sec in pats_secs:
        temp_pats += random.sample(sec, 100)
    # Built once per iteration and shared by every vectorizer below.
    docs = [pats__[x] for x in temp_pats]

    # Three token -> column-index vocabularies over the sampled texts.
    temp_vocab = {}      # every non-punctuation token
    temp_vocab_1 = {}    # ... minus NLTK+USPTO stopwords
    temp_vocab_2 = {}    # ... minus the technical stopwords as well
    for value in docs:
        for sent in sent_tokenize(value.strip()):
            for word in word_tokenize(sent):
                # punct is a string, so this is a substring test.
                if word in punct:
                    continue
                temp_vocab.setdefault(word, len(temp_vocab))
                if word in set_stops:
                    continue
                temp_vocab_1.setdefault(word, len(temp_vocab_1))
                # Checking the two sets separately is equivalent to testing
                # against their union, without rebuilding it for every token.
                if word not in tech_stops:
                    temp_vocab_2.setdefault(word, len(temp_vocab_2))

    # --- NMF on tf-idf features ---
    tfidf_vectorizer_ = TfidfVectorizer(vocabulary=temp_vocab)
    tfidf_ = tfidf_vectorizer_.fit_transform(docs)
    tfidf_vectorizer_1_ = TfidfVectorizer(vocabulary=temp_vocab_1)
    tfidf_1_ = tfidf_vectorizer_1_.fit_transform(docs)
    tfidf_vectorizer_2_ = TfidfVectorizer(vocabulary=temp_vocab_2)
    tfidf_2_ = tfidf_vectorizer_2_.fit_transform(docs)
    nmf_ = NMF(**nmf_params).fit(tfidf_)
    nmf_1_ = NMF(**nmf_params).fit(tfidf_1_)
    nmf_2_ = NMF(**nmf_params).fit(tfidf_2_)

    # The matrices from fit_transform are reused; transforming the same
    # documents again with the fitted vectorizer yields the same features.
    nmf_labels_ = _dominant_topics(nmf_, tfidf_)
    nmf_labels_1_ = _dominant_topics(nmf_1_, tfidf_1_)
    nmf_labels_2_ = _dominant_topics(nmf_2_, tfidf_2_)
    nmis.append([nmi(labs, nmf_labels_),
                 nmi(labs, nmf_labels_1_),
                 nmi(labs, nmf_labels_2_)])
    print(f'NMF:{nmis[-1]}')

    # --- LDA on raw term counts ---
    tf_vectorizer_ = CountVectorizer(vocabulary=temp_vocab)
    tf_ = tf_vectorizer_.fit_transform(docs)
    tf_vectorizer_1_ = CountVectorizer(vocabulary=temp_vocab_1)
    tf_1_ = tf_vectorizer_1_.fit_transform(docs)
    tf_vectorizer_2_ = CountVectorizer(vocabulary=temp_vocab_2)
    tf_2_ = tf_vectorizer_2_.fit_transform(docs)
    lda_ = LatentDirichletAllocation(**lda_params).fit(tf_)
    lda_1_ = LatentDirichletAllocation(**lda_params).fit(tf_1_)
    lda_2_ = LatentDirichletAllocation(**lda_params).fit(tf_2_)

    lda_labels_ = _dominant_topics(lda_, tf_)
    lda_labels_1_ = _dominant_topics(lda_1_, tf_1_)
    lda_labels_2_ = _dominant_topics(lda_2_, tf_2_)
    nmis_.append([nmi(labs, lda_labels_),
                  nmi(labs, lda_labels_1_),
                  nmi(labs, lda_labels_2_)])
    print(f'LDA:{nmis_[-1]}\n')

