# **ALTA Shared Task 2018**

In [0]:
import numpy as np
import pandas as pd
import nltk

## **Preparing dataframe for the training data**

In [0]:
# Load the training index; it provides the `id` and `first_ipc_mark_section`
# columns used below.
train_data = pd.read_csv('train_data.csv')

# Attach the text of each patent, read from patents/<id>.txt.
# The loop variable is deliberately not called `id`: at notebook scope that
# would rebind the builtin `id` for every later cell.
# errors='ignore' drops undecodable bytes instead of raising.
description = []
for patent_id in train_data.id:
    with open('patents/' + str(patent_id) + '.txt', encoding='utf8', errors='ignore') as patent_file:
        description.append(patent_file.read().strip())
train_data['description'] = description

# Encode the single-character section labels as integers: 'A' -> 1, 'B' -> 2, ...
train_data['first_ipc_mark_section'] = [
    ord(section) - 64 for section in train_data['first_ipc_mark_section']
]

print(train_data.head())


## **Converting description into tokens (Tokenizing Data)**

In [0]:

# The tokenizer, stopword set and stemmer are identical for every document, so
# they are built once here rather than on every call to preprocessor().
_TOKENIZATION_PATTERN = r'''(?x)          # set flag to allow verbose regexps
        \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''
_TOKENIZER = nltk.tokenize.regexp.RegexpTokenizer(_TOKENIZATION_PATTERN)
_EN_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
_STEMMER = nltk.stem.snowball.SnowballStemmer("english")


def preprocessor(text):
    """Turn one raw document into a list of stemmed content words.

    The text is lowercased and split with the regular-expression tokenizer;
    only purely alphabetic tokens are kept, English stopwords are removed, and
    the remaining words are reduced to their Snowball stems.

    Args:
        text: The document as a single string.

    Returns:
        A list of stem strings, in the order the words appear in ``text``.
    """
    tokens = _TOKENIZER.tokenize(text.lower())
    return [
        str(_STEMMER.stem(token))
        for token in tokens
        # isalpha() discards numbers, punctuation and hyphenated tokens.
        if token.isalpha() and token not in _EN_STOPWORDS
    ]

# Tokenize every training description; each cell of the new column holds the
# list of stems returned by preprocessor().
train_data['description_tokens'] = train_data.description.apply(preprocessor)


In [0]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-n-grams counter that consumes documents already split into tokens.
bow_vectorizer = CountVectorizer(
    tokenizer=lambda tokens: tokens,  # documents arrive as token lists; pass them through unchanged
    lowercase=False,                  # inputs are lists, not strings, so no lowercasing here
    stop_words=None,                  # stop words removal already done from NLTK
    ngram_range=(1, 2),               # unigrams and bigrams
    max_features=150000,              # cap the vocabulary at the 150,000 most frequent terms
    binary=False,                     # keep counts rather than binary/boolean features
)



## **Create an ML Pipeline**

In [0]:
from sklearn.pipeline import Pipeline
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import neural_network
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier


# Classification pipeline: n-gram counts -> TF-IDF weighting -> logistic regression.
c = Pipeline([
    ('bow', bow_vectorizer),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression(C=40)),
])


# Seed the generator so the random train/hold-out split, and therefore the
# score reported below, is the same on every Restart & Run All.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

# Each row lands in the training part with probability 0.75, so the split is
# only approximately 75/25.
msk = split_rng.rand(len(train_data)) < 0.75
train_X = train_data.description_tokens[msk]
test_X = train_data.description_tokens[~msk]
y = train_data['first_ipc_mark_section']
train_y = y[msk]
test_y = y[~msk]

# Fit the whole pipeline on the training part only.
c.fit(train_X, train_y)





## **Checking the accuracy of trained model**

In [0]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# Micro-averaged F1 of the fitted pipeline on the held-out rows.
print(
    f1_score(
        y_true=test_y.values,
        y_pred=c.predict(test_X),
        average='micro',
    )
)

0.7018255578093306


## **Using trained model for prediction/inference**

In [0]:
# Load the test index; its `id` column names the patent files to read.
test_data = pd.read_csv('test_data.csv')

# Attach the text of each patent, read from patents/<id>.txt.
# The loop variable is deliberately not called `id`: at notebook scope that
# would rebind the builtin `id` for every later cell.
# errors='ignore' drops undecodable bytes instead of raising.
description = []
for patent_id in test_data.id:
    with open('patents/' + str(patent_id) + '.txt', encoding='utf8', errors='ignore') as patent_file:
        description.append(patent_file.read().strip())
test_data['description'] = description
# No label encoding here: first_ipc_mark_section is filled in later from the
# model's predictions.

# Same tokenization as the training descriptions.
test_data['description_tokens'] = test_data['description'].apply(preprocessor)

In [0]:
# Run the fitted pipeline on the tokenized test descriptions and store the
# predicted (integer-coded) section for each row.
test_data['first_ipc_mark_section'] = c.predict(test_data['description_tokens'])

print(test_data.head())

In [0]:
# Map the integer codes back to letters (1 -> 'A', 2 -> 'B', ...), undoing the
# ord(x)-64 encoding applied to the training labels.
test_data['first_ipc_mark_section'] = [
    chr(code + 64) for code in test_data['first_ipc_mark_section']
]
print(test_data.head())

In [0]:
# Write the id/prediction pairs to output.csv. index=False is the documented
# way to omit the row index (the previous index=None only worked because None
# is falsy).
test_data[['id', 'first_ipc_mark_section']].to_csv('output.csv', index=False)