ALTA Shared Task 2018

In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
#Preparing dataframe for the training data
train_data=pd.read_csv('train_data.csv')

# Read every patent's full text from patents/<id>.txt. Bytes that are not valid
# UTF-8 are silently dropped (errors='ignore') instead of raising.
description=list()
for patent_id in train_data.id:  # named patent_id so the builtin id() is not shadowed
    with open('patents/'+str(patent_id)+'.txt',encoding='utf8',errors='ignore') as patent_file:
        description.append(patent_file.read().strip())
train_data['description']=description

# Encode the section letter as an integer label: 'A' -> 1, 'B' -> 2, ...
# (ord('A') == 65, hence the offset of 64).
train_data['first_ipc_mark_section']=[ord(section)-64 for section in train_data['first_ipc_mark_section']]

print(train_data.head())


   id  first_ipc_mark_section  \
0   0                       1   
1   1                       7   
2   2                       1   
3   3                       1   
4   4                       4   

                                         description  
0  ABSTRACT\n\n  The disclosure relates to a meth...  
1  ABSTRACT\n\n       A system and method are pro...  
2  ABSTRACT\n\nA media module 10 for use with at ...  
3  ABSTRACT\n\n          A support garment having...  
4  THERMALLY REACTIVE THERMOPLASTIC INTERMEDIATE ...  


In [3]:
#Converting description into tokens
_TOKENIZATION_PATTERN = r'''(?x)          # set flag to allow verbose regexps
        \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''

# Tokenizer, stop-word set and stemmer are identical for every document, so
# they are built once (on first use) instead of once per call to preprocessor.
_preprocessor_resources = {}


def _get_preprocessor_resources():
    """Return the shared (tokenizer, stop-word set, stemmer), building them on first use."""
    if not _preprocessor_resources:
        # Build everything into locals first so a failure (e.g. the stop-word
        # corpus is not available) cannot leave the cache half-filled.
        tokenizer = nltk.tokenize.regexp.RegexpTokenizer(_TOKENIZATION_PATTERN)
        en_stopwords = set(nltk.corpus.stopwords.words('english'))
        stemmer = nltk.stem.snowball.SnowballStemmer("english")
        _preprocessor_resources.update(
            tokenizer=tokenizer, en_stopwords=en_stopwords, stemmer=stemmer)
    return (_preprocessor_resources['tokenizer'],
            _preprocessor_resources['en_stopwords'],
            _preprocessor_resources['stemmer'])


def preprocessor(text):
    """Turn raw text into a list of stemmed, lower-cased content words.

    Steps: lower-case the text, tokenize it with the regular expression above,
    keep only purely alphabetic tokens, drop English stop words, and stem what
    is left with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stems in document order (empty list if nothing survives filtering).
    """
    tokenizer, en_stopwords, stemmer = _get_preprocessor_resources()

    # NOTE: the text is lower-cased before tokenizing, so the upper-case
    # abbreviation branch of the pattern cannot match; such tokens would be
    # discarded by the isalpha() filter below anyway.
    tokens = tokenizer.tokenize(text.lower())
    alphabet_tokens = [token for token in tokens if token.isalpha()]
    non_stopwords = [word for word in alphabet_tokens if word not in en_stopwords]
    stems = [str(stemmer.stem(word)) for word in non_stopwords]

    return stems

train_data['description_tokens'] = train_data['description'].apply(preprocessor)


In [4]:
from sklearn.feature_extraction.text import CountVectorizer


def identity_tokenizer(tokens):
    """Return `tokens` unchanged: documents reach the vectorizer already tokenized.

    A named function is used instead of a lambda so that the vectorizer (and
    any pipeline containing it) can be pickled.
    """
    return tokens


# Bag-of-words counts over unigrams and bigrams of the pre-computed stems.
bow_vectorizer = CountVectorizer(lowercase = False, ## tokens were already lower-cased by preprocessor
                                     tokenizer = identity_tokenizer, # because we already have tokens available
                                     token_pattern = None, ## unused when a tokenizer is supplied
                                     stop_words = None, ## stop words removal already done from NLTK
                                     max_features = 150000, ## keep the 150K most frequent n-grams
                                     ngram_range = (1, 2), ## unigrams and bigrams
                                     binary = False) ## keep raw counts, not binary/boolean features



In [11]:
from sklearn.pipeline import Pipeline
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import neural_network
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the hold-out split, and therefore the score reported below,
# is the same on every Restart & Run All.
SPLIT_SEED = 42
# Approximate share of rows used for fitting; the rest is held out.
TRAIN_FRACTION = 0.75

# Classifier: n-gram counts -> TF-IDF weighting -> logistic regression.
c=Pipeline(steps=[('bow',bow_vectorizer),
                  ('tfidf',TfidfTransformer()),
                  ('lr',LogisticRegression(C=40))])

# Row-wise random mask: True -> training row, False -> held-out row.
# Drawn from a private, seeded generator rather than the global numpy RNG.
msk = np.random.RandomState(SPLIT_SEED).rand(len(train_data)) < TRAIN_FRACTION
train_X = train_data.description_tokens[msk]
test_X = train_data.description_tokens[~msk]
y=train_data['first_ipc_mark_section']
train_y = y[msk]
test_y = y[~msk]

c.fit(train_X,train_y)





Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=150000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        str...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [12]:
# Score the fitted pipeline on the held-out rows using micro-averaged F1.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

held_out_labels = pd.Series(test_y).values
held_out_predictions = c.predict(test_X)
print(f1_score(held_out_labels, held_out_predictions, average='micro'))

0.7018255578093306


In [12]:
# Build the test dataframe the same way as the training one: one row per
# patent id, with the patent's text and its preprocessed tokens.
test_data=pd.read_csv('test_data.csv')

# Read every patent's full text from patents/<id>.txt. Bytes that are not valid
# UTF-8 are silently dropped (errors='ignore') instead of raising.
description=list()
for patent_id in test_data.id:  # named patent_id so the builtin id() is not shadowed
    with open('patents/'+str(patent_id)+'.txt',encoding='utf8',errors='ignore') as patent_file:
        description.append(patent_file.read().strip())
test_data['description']=description

test_data['description_tokens']=test_data['description'].apply(preprocessor)

In [13]:
# Predict an integer section code for every test patent and keep it
# alongside the patent's id and text.
predicted_section_codes = c.predict(test_data.description_tokens)
test_data['first_ipc_mark_section'] = predicted_section_codes

print(test_data.head())

     id                                        description  \
0  3972  MEDIA APPLICATION BACKGROUNDING\n\nABSTRACT\n\...   
1  3973  ABSTRACT\n\n      Embodiments of the present d...   
2  3974                                  NA\nparse failure   
3  3975  ABSTRACT\n\n             Thin, biocompatible, ...   
4  3976  5\n\n         Abstract\n\n         Organic syn...   

                                  description_tokens  first_ipc_mark_section  
0  [media, applic, background, abstract, method, ...                       8  
1  [abstract, embodi, present, disclosur, provid,...                       8  
2                                 [na, pars, failur]                       1  
3  [abstract, thin, biocompat, composit, materi, ...                       1  
4  [abstract, organ, synthesi, raw, materi, valer...                       3  


In [14]:
# Map the integer section codes back to section letters: 1 -> 'A', 2 -> 'B', ...
# (chr(65) == 'A', hence the offset of 64).
# The dtype guard makes this cell safe to re-run: once the column holds
# letters, converting again would raise TypeError on `'A' + 64`.
if pd.api.types.is_numeric_dtype(test_data['first_ipc_mark_section']):
    test_data['first_ipc_mark_section']=[chr(code+64) for code in test_data['first_ipc_mark_section']]
print(test_data.head())

     id                                        description  \
0  3972  MEDIA APPLICATION BACKGROUNDING\n\nABSTRACT\n\...   
1  3973  ABSTRACT\n\n      Embodiments of the present d...   
2  3974                                  NA\nparse failure   
3  3975  ABSTRACT\n\n             Thin, biocompatible, ...   
4  3976  5\n\n         Abstract\n\n         Organic syn...   

                                  description_tokens first_ipc_mark_section  
0  [media, applic, background, abstract, method, ...                      H  
1  [abstract, embodi, present, disclosur, provid,...                      H  
2                                 [na, pars, failur]                      A  
3  [abstract, thin, biocompat, composit, materi, ...                      A  
4  [abstract, organ, synthesi, raw, materi, valer...                      C  


In [15]:
# Write the submission file: one row per test patent with its predicted
# section letter. index=False keeps the dataframe's row index out of the CSV.
test_data[['id','first_ipc_mark_section']].to_csv('output.csv',index=False)