In [5]:
# Read uspto dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

usptodata = pd.read_csv('U.S. Patents.csv')
usptodataset=usptodata[["grant_id","claims_text","abstract"]]
usptodataset= usptodataset.dropna()
usptodataset = usptodataset.reset_index(drop=True)

#split data to have little data to run
# divide dataset to train and test
df_train, df_test = train_test_split(usptodataset, test_size=0.8, random_state=25)
df_train=df_train.reset_index(drop=True)
df_train

Unnamed: 0,grant_id,claims_text,abstract
0,US10459019,"1. An electromagnetic sensor comprising:,a fir...",An electromagnetic sensor includes a first mag...
1,US10456083,1. A method for mapping somatosensory and moto...,An apparatus for cortical mapping and method f...
2,US10461549,"1. A method for charging a mobile terminal, th...","The disclosure discloses a mobile terminal, a ..."
3,US10462815,1. A method for a User Equipment (UE) operatin...,The present invention relates to a wireless co...
4,US10458026,1. A method of producing graphene sheets compr...,A method of producing graphene sheets comprisi...
...,...,...,...
1397,US10458777,1. A method of measuring a metrology target el...,"Targets, target elements and target design met..."
1398,US10458022,1. A method for anti-corrosive treatment of me...,"A method for corrosion protection treatment, c..."
1399,US10462550,"1. A storage device comprising:,a first case c...","A storage device includes a first case, a seco..."
1400,US10456037,1. A terminal device configured to be able to ...,A terminal device is provided which is configu...


In [6]:
from nltk.tokenize import word_tokenize
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import string

# NLTK English stopwords
new_stopwords = ["shown", "design", "ornamental", "describe", "described", "described."]
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(new_stopwords)


def cleantext(df): 
    
    df_train['cleaned_text'] = df_train['claims_text'].replace(r'\'|\"|\,|\.|\?|\+|\-|\/|\=|\(|\)|\n|"', '', regex=True)
    df_train['cleaned_text'] = df_train['cleaned_text'].replace("  ", " ")
    
    # convert tweets to lowercase
    df_train['cleaned_text'] = df_train['cleaned_text'].str.lower()
    
    #remove numbers
    df_train['cleaned_text'] =df_train['cleaned_text'].replace(r'\d+', '', regex = True)
        
    #remove_symbols
    df_train['cleaned_text']  = df_train['cleaned_text'].replace(r'[^a-zA-Z0-9]', " ", regex=True)
    
    #remove punctuations 
    df_train['cleaned_text'] = df_train['cleaned_text'].replace(r'[[]!"#$%\'()\*+,-./:;<=>?^_`{|}]+',"", regex = True)
    
    #remove_URL(x):
    df_train['fully_cleaned_text']  = df_train['cleaned_text'].replace(r'https.*$', "", regex = True)
    
    #remove stopwords and words_to_remove
    #df_train['fully_cleaned_text'] = df_train['cleaned_text'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stopwords]))
    
    
    return df_train

all_texts = cleantext(df_train)

  re.compile(obj)


In [7]:
all_texts

Unnamed: 0,grant_id,claims_text,abstract,cleaned_text,fully_cleaned_text
0,US10459019,"1. An electromagnetic sensor comprising:,a fir...",An electromagnetic sensor includes a first mag...,an electromagnetic sensor comprising a first ...,an electromagnetic sensor comprising a first ...
1,US10456083,1. A method for mapping somatosensory and moto...,An apparatus for cortical mapping and method f...,a method for mapping somatosensory and motor ...,a method for mapping somatosensory and motor ...
2,US10461549,"1. A method for charging a mobile terminal, th...","The disclosure discloses a mobile terminal, a ...",a method for charging a mobile terminal the m...,a method for charging a mobile terminal the m...
3,US10462815,1. A method for a User Equipment (UE) operatin...,The present invention relates to a wireless co...,a method for a user equipment ue operating in...,a method for a user equipment ue operating in...
4,US10458026,1. A method of producing graphene sheets compr...,A method of producing graphene sheets comprisi...,a method of producing graphene sheets compris...,a method of producing graphene sheets compris...
...,...,...,...,...,...
1397,US10458777,1. A method of measuring a metrology target el...,"Targets, target elements and target design met...",a method of measuring a metrology target elem...,a method of measuring a metrology target elem...
1398,US10458022,1. A method for anti-corrosive treatment of me...,"A method for corrosion protection treatment, c...",a method for anticorrosive treatment of metal...,a method for anticorrosive treatment of metal...
1399,US10462550,"1. A storage device comprising:,a first case c...","A storage device includes a first case, a seco...",a storage device comprising a first case comp...,a storage device comprising a first case comp...
1400,US10456037,1. A terminal device configured to be able to ...,A terminal device is provided which is configu...,a terminal device configured to be able to co...,a terminal device configured to be able to co...


In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def get_bigrams(myString):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(myString)
    stemmer = PorterStemmer()
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)

    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)

    result = [' '.join([stemmer.stem(w).lower() for w in x.split()]) for x in tokens if x.lower() not in stopwords.words('english') and len(x) > 8]
    return result

In [9]:
for line in all_texts['fully_cleaned_text']:
    bigrams = get_bigrams(line)

In [10]:
def clean_up(ngrams):
    seen = set()
    for ngram in ngrams:
        if ' ' in ngram:
            seen = seen.union(set(ngram.split()))
    return [ngram for ngram in ngrams if ngram not in seen]


bigrams = clean_up(bigrams)

In [64]:
from sklearn.feature_extraction.text import CountVectorizer

n_wd_bigrams = np.empty((len(bigrams), len(all_texts['fully_cleaned_text'])))

for i in range(len(bigrams)):
    for j in range(len(all_texts['fully_cleaned_text'])):
        n_wd_bigrams[i][j] = all_texts['fully_cleaned_text'][j].count(bigrams[i])
        
cv = CountVectorizer(max_features = len(bigrams), stop_words='english')
n_wd = np.array(cv.fit_transform(all_texts['fully_cleaned_text']).todense()).T
vocabulary = cv.get_feature_names_out()

n_wd = np.concatenate((n_wd, n_wd_bigrams))
vocabulary += bigrams

In [66]:
from scipy.sparse import csr_matrix

bv = artm.BatchVectorizer(data_format='bow_n_wd',
                          n_wd=n_wd[:len(vocabulary)],
                          vocabulary=vocabulary)

  from scipy.sparse.base import spmatrix


In [37]:
#model = artm.LDA(num_topics=15, dictionary=bv.dictionary)
#model.fit_offline(bv, num_collection_passes=20)

In [53]:
import artm
topic_names = ['topic_{}'.format(i) for i in range(10)]

model_artm = artm.ARTM(topic_names = topic_names, cache_theta=True, scores=[artm.PerplexityScore(name='PerplexityScore',dictionary=bv.dictionary), 
                                                                    artm.SparsityPhiScore(name='SparsityPhiScore'),
                                                                    artm.SparsityThetaScore(name='SparsityThetaScore'),
                                                                    artm.TopicKernelScore(name='TopicKernelScore',probability_mass_threshold=0.3), 
                                                                    artm.TopTokensScore(name='TopTokensScore', num_tokens=8)],
                       regularizers=[artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.4),
                                     artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=2.5e+5)])
                                     
model_artm.num_document_passes = 4
model_artm.initialize(bv.dictionary)
model_artm.fit_offline(batch_vectorizer=bv, num_collection_passes=20)

In [55]:
topic_names = ['topic_{}'.format(i) for i in range(50)]

model_artm1 = artm.ARTM(topic_names=topic_names,cache_theta=True, scores=[artm.PerplexityScore(name='PerplexityScore',dictionary=bv.dictionary), artm.SparsityPhiScore(name='SparsityPhiScore'),
                                                                  artm.SparsityThetaScore(name='SparsityThetaScore'), 
                                                                  artm.TopicKernelScore(name='TopicKernelScore',probability_mass_threshold=0.3),
                                                                  artm.TopTokensScore(name='TopTokensScore',num_tokens=12)],
                        regularizers=[artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.4),
                        artm.SmoothSparsePhiRegularizer(name='SparsePhi',tau=-0.25),
                        artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=2.5e+5)], seed=243) #seed is required for heirarchy 
                        
model_artm1.num_document_passes = 4
model_artm1.set_parent_model(parent_model = model_artm, parent_model_weight = 0.75)
model_artm1.initialize(bv.dictionary)

model_artm1.fit_offline(batch_vectorizer=bv, num_collection_passes=12)

subt = pd.DataFrame(model_artm1.get_parent_psi())
subt.columns = ['topic_{}'.format(i) for i in range(10)]
subt.index = ['subtopic_{}'.format(i) for i in range(50)]

In [58]:
def subtopics_wrt_topic(topic_number, matrix_dist):
    return matrix_dist.iloc[:, topic_number].sort_values(ascending = False)[:5]
subtopics_wrt_topic(2, subt)

subtopic_0     0.0
subtopic_37    0.0
subtopic_27    0.0
subtopic_28    0.0
subtopic_29    0.0
Name: topic_2, dtype: float32

In [59]:
def get_articles_on_theme(dataset, topic, num_topics):
    theta = np.array(model_artm.get_theta('topic_{}'.format(topic)).iloc[0]).theta[theta <= 0.05] = 0
    idx = np.nonzero(theta)[0]
    articles = zip(idx, theta[idx])
    articles = sorted(articles, key = lambda x: x[1], reverse = True)
    articles = [x[0] for x in articles]
    return dataset.iloc[articles].PaperText[:num_topics]
    
get_articles_on_theme(all_texts['fully_cleaned_text'], 8, 5)

AttributeError: 'numpy.ndarray' object has no attribute 'theta'

In [60]:
#############################################

In [84]:
batch_vectorizer  = artm.BatchVectorizer(data_format='bow_n_wd',
                          n_wd=n_wd[:len(vocabulary)],
                          vocabulary=vocabulary)

lda = artm.LDA(num_topics=15, alpha=0.01, beta=0.001, cache_theta=True, num_document_passes=5, dictionary=batch_vectorizer.dictionary)
lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)

top_tokens = lda.get_top_tokens(num_tokens=10)

for i, token_list in enumerate(top_tokens):

    print('Topic #{0}: {1}'.format(i, token_list))



  from scipy.sparse.base import spmatrix


Topic #0: ['pluralityprocess compris', 'secondgroov by', 'storagea portion', 'locationstrip among', 'vehiclethe cure', 'claimaccord to', 'deviceshare by', 'associatedultraviolet light', 'nodea materi', 'responsesubstrat form']
Topic #1: ['portionelectrod along', 'electrodecorrespond to', 'layerlayer correspond', 'materialare arrang', 'surfaceprocess the', 'metalsame layer', 'accordingdiffer materi', 'formingsubpixel or', 'claimaccord to', 'semiconductorcover the']
Topic #2: ['elementand arrang', 'portionelectrod along', 'pluralityprocess compris', 'claimaccord to', 'elementsare space', 'secondgroov by', 'sensormateri the', 'componentlightemit function', 'structureclaim compris', 'comprisingupper surfac']
Topic #3: ['outputbetween the', 'secondgroov by', 'terminaldevic compris', 'inputcompris form', 'currentsub organ', 'coupledwith each', 'connectedarrang along', 'circuitapart from', 'switchthe second', 'transistorthe ultraviolet']
Topic #4: ['setlayer wherein', 'userportion wherein', '

In [73]:
phi = lda.phi_   # size is number of words in vocab x number of topics
theta = lda.get_theta() # number of rows correspond to the number of topics

print(phi)
print("-----------------------------")
print(theta)

                                topic_0       topic_1       topic_2  \
andaextens direct          2.395767e-03  7.070442e-03  9.343595e-03   
axiswithout be             3.097794e-06  2.025067e-06  4.577391e-04   
claimaccord to             4.947443e-02  3.900435e-02  5.205112e-02   
comprisesupper surfac      2.198097e-03  2.514996e-03  2.772527e-02   
comprisingupper surfac     1.247424e-02  7.995741e-03  3.335467e-02   
...                                 ...           ...           ...   
gatefor cover              1.275919e-07  3.092150e-03  1.568412e-04   
nodea materi               2.955038e-02  6.614570e-08  6.532024e-08   
transistorthe ultraviolet  4.289479e-08  1.051301e-07  7.593392e-08   
devicesare insul           5.030740e-04  4.361320e-07  7.676800e-05   
valvecure the              6.263228e-08  8.307538e-08  5.910277e-08   

                                topic_3       topic_4       topic_5  \
andaextens direct          1.340505e-02  8.949421e-04  2.881870e-03   
axisw

In [77]:
model_artm = artm.ARTM(num_topics=15, cache_theta=True, scores=[artm.PerplexityScore(name='PerplexityScore', dictionary=batch_vectorizer.dictionary)], regularizers=[artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.15)])

#model_plsa.scores.add(artm.TopTokensScore(name='TopTokensScore', num_tokens=6))

model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))

model_artm.scores.add(artm.TopicKernelScore(name='TopicKernelScore', probability_mass_threshold=0.3))

model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore', num_tokens=6))

model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))

model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))

model_artm.num_document_passes = 1

model_artm.initialize(dictionary=batch_vectorizer.dictionary)
model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=15)

model_artm.score_tracker['PerplexityScore'].value

[209.13845825195312,
 172.80743408203125,
 111.44688415527344,
 123.51750183105469,
 72.13981628417969,
 71.07655334472656,
 68.26617431640625,
 67.97388458251953,
 67.94954681396484,
 67.74874877929688,
 67.69065856933594,
 67.67171478271484,
 67.65945434570312,
 67.65250396728516,
 67.64883422851562]

In [78]:
for topic_name in model_artm.topic_names:

    print(topic_name + ': ',model_artm.score_tracker['TopTokensScore'].last_tokens[topic_name])


topic_0:  ['storagea portion', 'pathlayer and', 'servicefirst portion']
topic_1:  ['materialare arrang', 'substrateof subpixel', 'electrodecorrespond to', 'formedare altern', 'uppersubpixel a', 'conductiveinsul with']
topic_2:  ['elementsare space', 'gatefor cover', 'signalselectrod are', 'adjacentdisplay devic', 'functionstrip are', 'distanceprocess andor']
topic_3:  ['coupledwith each', 'currentsub organ', 'sourcelayer form', 'responsesubstrat form', 'voltagelayer the', 'supportsubpixel and']
topic_4:  ['selectedthe pixel', 'transmissionwherein a', 'keyconstitut a', 'rateform an', 'configurationclaim wherein']
topic_5:  ['areaorgan lightemit', 'valuesother the', 'pixelwherein upper', 'coverwhich are', 'blockeach other', 'colorcure process']
topic_6:  ['computernot equal', 'targetwherein form', 'opticalthe extens']
topic_7:  ['lengthmanufactur the', 'generatedgroov andform', 'portadjac first', 'compositionfunction layer', 'chamberinsul portion', 'cameraleast one']
topic_8:  ['havingwh

In [79]:
phi_artm = model_artm.phi_   # size is number of words in vocab x number of topics
theta_artm = model_artm.get_theta() # number of rows correspond to the number of topics

print(phi_artm)
print("-----------------------------")
print(theta_artm)

                           topic_0  topic_1   topic_2   topic_3  topic_4  \
andaextens direct              0.0      0.0  0.000000  0.000000      0.0   
axiswithout be                 0.0      0.0  0.000000  0.000000      0.0   
claimaccord to                 0.0      0.0  0.000000  0.000000      0.0   
comprisesupper surfac          0.0      0.0  0.000000  0.000000      0.0   
comprisingupper surfac         0.0      0.0  0.000000  0.000000      0.0   
...                            ...      ...       ...       ...      ...   
gatefor cover                  0.0      0.0  0.163221  0.000000      0.0   
nodea materi                   0.0      0.0  0.000000  0.075932      0.0   
transistorthe ultraviolet      0.0      0.0  0.000000  0.072956      0.0   
devicesare insul               0.0      0.0  0.000000  0.000000      0.0   
valvecure the                  0.0      0.0  0.000000  0.000000      0.0   

                           topic_5  topic_6  topic_7   topic_8  topic_9  \
andaextens d

In [119]:
model_artm = artm.ARTM(num_topics=15, cache_theta=True, scores=[artm.PerplexityScore(name='PerplexityScore', dictionary=batch_vectorizer.dictionary)], regularizers=[artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.4)])

#model_plsa.scores.add(artm.TopTokensScore(name='TopTokensScore', num_tokens=6))

model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))

model_artm.scores.add(artm.TopicKernelScore(name='TopicKernelScore', probability_mass_threshold=0.3))

model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore', num_tokens=6))

model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.4))

model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=2.5e+5))

model_artm.num_document_passes = 1

model_artm.initialize(dictionary=batch_vectorizer.dictionary)
model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=20)

model_artm.score_tracker['PerplexityScore'].value

[209.06007385253906,
 141.7535400390625,
 95.81549072265625,
 104.95429229736328,
 116.63621520996094,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672,
 116.48223114013672]

In [120]:
# cooc_dict = artm.Dictionary()
# cooc_dict.gather(
#     data_path='batches_folder',
#     cooc_file_path='cooc.txt',
#     vocab_file_path='vocab.txt',
#     symmetric_cooc_values=True)

# coherence_score = artm.TopTokensScore(
#                             name='TopTokensCoherenceScore',
#                             class_id='@default_class',
#                             num_tokens=10,
#                             topic_names=[u'topic_0',u'topic_1'],
#                             dictionary=cooc_dict)

# model_artm.scores.add(coherence_score)
# model.score_tracker['TopTokensCoherenceScore'].average_coherence