# Features for the model

There is a tradeoff between number of features and memory. Furthermore, I worked on a machine with not so much memory. Therefore I played around a little bit and kept the best suited features in the model.


In [1]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import brown
from nltk import DefaultTagger as df
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
from nltk import TrigramTagger as tg

from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot


def features(sentence, index):
    return {
        'word': sentence[index],
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'prefix-1': sentence[index][:1],
        'prefix-2': sentence[index][:2],
      # 'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1:],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'suffix-4': sentence[index][-4:],
      # 'suffix-5': sentence[index][-5:],
      # 'is_all_caps': sentence[index] == sentence[index].upper(),
      # 'contains_number': len([ch for ch in sentence[index] if ch.isdigit()])>0,
        'prev_word': '' if index == 0 else sentence[index - 1],
      # 'prev_word_suffix-1': '' if index == 0 else sentence[index - 1][-1:],
        'prev_word_suffix-2': '' if index == 0 else sentence[index - 1][-2:],
      # 'prev_word_suffix-3': '' if index == 0 else sentence[index - 1][-3:],
      # 'prev_words': '' if index < 2 else sentence[index - 2] + ' ' + sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
      # 'next_word_suffix-1': '' if index == len(sentence) - 1 else sentence[index + 1][-1:],
        'next_word_suffix-2': '' if index == len(sentence) - 1 else sentence[index + 1][-2:],
      # 'next_word_suffix-3': '' if index == len(sentence) - 1 else sentence[index + 1][-3:],
      # 'next_words': '' if index >= len(sentence) - 2 else sentence[index + 1] + ' ' + sentence[index + 2]
    }

results={}

# Task 1.1/1.4 - own classifier on treebank/brown

In [2]:
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
 
    return X, y

treebank_tagged = nltk.corpus.treebank.tagged_sents()
brown_tagged = brown.tagged_sents(categories='news')

treebank_size=len(treebank_tagged)
treebank_train=treebank_tagged[:(int)(treebank_size*0.8)]
treebank_test=treebank_tagged[(int)(treebank_size*0.8):]

brown_size=len(brown_tagged)
brown_test=brown_tagged[:(int)(brown_size*0.8)]
brown_train=brown_tagged[(int)(brown_size*0.8):]

performance={}

for train_data, test_data, corpus_name in [(treebank_train, treebank_test, 'Treebank Corpus'), (brown_train, brown_test[:783], 'Brown Corpus')]:

    print('Perfomance of own model on ' + corpus_name + ':')

    X, y = transform_to_dataset(train_data)

    size=5000

    clf = Pipeline([
        ('vectorizer', DictVectorizer(sparse=False)),
        ('classifier', GaussianNB())
    ])



    print('Train the model...')
    clf.fit(X[:size],y[:size])
    print('Training done.')


    print('Classify test data...')
    score=0.0
    for i in range(0,((int)(len(test_data)/100)+1)):
        if (i+1)*100>len(test_data):
            endval=len(test_data)
        else:
            endval=(i+1)*100

        X_test, y_test = transform_to_dataset(test_data[i*100:endval])
        score+=(endval-i*100)*clf.score(X_test, y_test)

    score=score/len(test_data)
    performance[corpus_name]=score
    print('Classification done. Accuracy: ' + str(score) + '.')
    print()
    print()
    
results['own']=performance


Perfomance of own model on Treebank Corpus:
Train the model...
Training done.
Classify test data...
Classification done. Accuracy: 0.8545015684790763.


Perfomance of own model on Brown Corpus:
Train the model...
Training done.
Classify test data...
Classification done. Accuracy: 0.7558431428741575.




# Task 1.2/1.5 - NLTK tagger on treebank/brown

In [3]:
performance={}

for test_data, corpus_name in [(treebank_test, 'Treebank Corpus'), (brown_test, 'Brown Corpus')]:

    print('Perfomance of model on ' + corpus_name + ':')

    print('Classify test data with NLTK tagger...')

    y_nltk, y = [], []
    for tagged in test_data:
        y_nltk+=nltk.pos_tag([w for w, t in tagged])
        y+=[t for w, t in tagged]

    score=0
    for i in range(0, len(y)):
        if y_nltk[i][1]==y[i]:
            score+=1

    score=score/len(y)
    performance[corpus_name]=score

    print('Classification done. Accuracy: ' + str(score) + '.')
    print()
    print()

results['nltk']=performance
        

Perfomance of model on Treebank Corpus:
Classify test data with NLTK tagger...
Classification done. Accuracy: 0.8937072708218973.


Perfomance of model on Brown Corpus:
Classify test data with NLTK tagger...
Classification done. Accuracy: 0.5874052258913466.




# Task 1.3/1.6 - rule-based classifiers on treebank/brown

In [4]:
patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'), (r'.*ould$', 'MD'), (r'.*\'s$', 'NN$'),               
             (r'.*s$', 'NNS'), (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]

performance_def=performance_uni=performance_bi=performance_tri=performance_regex={}
for train_sents, test_sents, corpus_name in [(treebank_train, treebank_test, 'Treebank Corpus'), (brown_train, brown_test, 'Brown Corpus')]:

    print('Perfomance of model on ' + corpus_name + ':')
    print()
    

    def_model = nltk.DefaultTagger('NN')
    uni_model = nltk.UnigramTagger(train_sents)
    bi_model = nltk.BigramTagger(train_sents)
    tri_model = nltk.TrigramTagger(train_sents)
    regexp_model = nltk.RegexpTagger(patterns)

    performance=def_model.evaluate(test_sents)
    print('Performance of Default Tagger:')
    print('on training data: ' + str(def_model.evaluate(train_sents)))
    print('on test data: ' + str(performance))
    performance_def[corpus_name]=performance
    print()
          
    performance=uni_model.evaluate(test_sents)
    print('Performance of Unigram Tagger:')
    print('on training data: ' + str(uni_model.evaluate(train_sents)))
    print('on test data: ' + str(performance))
    performance_uni[corpus_name]=performance
    print()
    
    performance=bi_model.evaluate(test_sents)
    print('Performance of Bigram Tagger:')
    print('on training data: ' + str(bi_model.evaluate(train_sents)))
    print('on test data: ' + str(performance))
    performance_bi[corpus_name]=performance
    print()
    
    performance=tri_model.evaluate(test_sents)
    print('Performance of Trigram Tagger:')
    print('on training data: ' + str(tri_model.evaluate(train_sents)))
    print('on test data: ' + str(performance))
    print()
    performance_tri[corpus_name]=performance
    
    performance=regexp_model.evaluate(test_sents)
    print('Performance of Regex Tagger:')
    print('on training data: ' + str(regexp_model.evaluate(train_sents)))
    print('on test data: ' + str(performance))
    performance_regex[corpus_name]=performance
    print()
    print()
    
results['def']=performance_def
results['uni']=performance_uni
results['bi']=performance_bi
results['tri']=performance_tri
results['regex']=performance_regex
    


Perfomance of model on Treebank Corpus:

Performance of Default Tagger:
on training data: 0.12729888264692388
on test data: 0.1447677029791906

Performance of Unigram Tagger:
on training data: 0.9597455262472562
on test data: 0.8604720794450821

Performance of Bigram Tagger:
on training data: 0.9087143618934236
on test data: 0.11322920305404462

Performance of Trigram Tagger:
on training data: 0.9089747882485708
on test data: 0.06706921503069016

Performance of Regex Tagger:
on training data: 0.2138472413408237
on test data: 0.24232746145017217


Perfomance of model on Brown Corpus:

Performance of Default Tagger:
on training data: 0.1209595837949805
on test data: 0.1334795413246444

Performance of Unigram Tagger:
on training data: 0.9475408256659762
on test data: 0.6978006140735635

Performance of Bigram Tagger:
on training data: 0.7955103810395491
on test data: 0.05351212481985087

Performance of Trigram Tagger:
on training data: 0.8671901343995375
on test data: 0.039187919042546523


# Plotting the results for Task 1

In [5]:
init_notebook_mode(connected=True)

x1='Treebank Corpus'
x2='Brown Corpus'
trace1 = go.Bar(
    x=['own Tagger', 'NLTK tagger', 'Default Tagger', 'Unigram Tagger', 'Bigram Tagger', 'Trigram Tagger', 'Regex Tagger'],
    y=[results['own'][x1], results['nltk'][x1], results['def'][x1], results['uni'][x1], results['bi'][x1], results['tri'][x1], results['regex'][x1]],
    name=x1
)
trace2 = go.Bar(
    x=['own Tagger', 'NLTK tagger', 'Default Tagger', 'Unigram Tagger', 'Bigram Tagger', 'Trigram Tagger', 'Regex Tagger'],
    y=[results['own'][x2], results['nltk'][x2], results['def'][x2], results['uni'][x2], results['bi'][x2], results['tri'][x2], results['regex'][x2]],
    name=x2
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar')

# Task 2 - importing a Spanish corpus 

The IULA Corpus needs to be imported using the ConllCorpusReader. Since the format of the POS tags is different from those assigned by the RDRPOSTagger used in Task 2.2, the POS tags will be adjusted.

In [6]:
corp = nltk.corpus.ConllCorpusReader('RDRPOSTagger-master/data/es/', ['IULA_Spanish_LSP_Treebank.conll'], ['ignore', 'words', 'ignore', 'pos', 'ignore', 'ignore', 'ignore', 'ignore', 'ignore', 'ignore'])

def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

file=open('RDRPOSTagger-master/data/es/IULA_Corpus', 'w')
for sent in corp.sents():
    for word in sent:
        file.write(word + ' ')
    file.write('\n')
iula_sents=corp.tagged_sents()

IULA_tagged_words=[]
tagged_words=corp.tagged_words()
for i in range(0, len(tagged_words)):
    tag=tagged_words[i][1]
    if tag in ['n', 'i', '_']:
        new_tag='NOUN'
    elif tag=='v':
        new_tag='VERB'
    elif tag=='a':
        new_tag='ADJ'
    elif tag=='d':
        new_tag='DET'
    elif tag=='c':
        new_tag='CONJ'
    elif tag=='s':
        new_tag='ADP'
    elif tag=='f':
        new_tag='PUNCT'
    elif tag in ['z', 'w']:
        new_tag='NUM'
    elif tag=='p':
        new_tag='PRON'
    elif tag=='r':
        new_tag='ADV'
    else:
        new_tag=tag
    IULA_tagged_words.append((tagged_words[i][0], new_tag))

iula_size=len(iula_sents)
iula_train=iula_sents[:(int)(0.8*iula_size)]
iula_test=iula_sents[(int)(0.8*iula_size):]

results={}

# Task 2.1 own POS tagger model on IULA Corpus

I played around with common suffixes in the feature model for the spanish corpus. To my surprise the model performed better without the added features. I did not check all different configurations but of those I tried the original feature model performed best.

In [7]:
print('Perfomance of own model on IULA Corpus:')

size=5000

clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', GaussianNB())
])

def feat_span(sentence, index):
    return {
        'word': sentence[index],
        'is_first': index==0,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'stem': '',
        'suffix': '',
        'prefix-1': sentence[index][:1],
        'prefix-2': sentence[index][:2],
      # 'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1:],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'suffix-4': sentence[index][-5:],
      # 'is_all_caps': sentence[index] == sentence[index].upper(),
      # 'contains_number': len([ch for ch in sentence[index] if ch.isdigit()])>0,
        'prev_word': '' if index == 0 else sentence[index - 1],
      # 'prev_stem': '',
      #  'prev_suffix': '',
      # 'prev_word_suffix-1': '' if index == 0 else sentence[index - 1][-1:],
        'prev_word_suffix-2': '' if index == 0 else sentence[index - 1][-2:],
      # 'prev_word_suffix-3': '' if index == 0 else sentence[index - 1][-3:],
      # 'prev_words': '' if index < 2 else sentence[index - 2] + ' ' + sentence[index - 1],
      # 'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
      # 'next_stem': '',
      # 'next_suffix': '',
      # 'next_word_suffix-1': '' if index == len(sentence) - 1 else sentence[index + 1][-1:],
        'next_word_suffix-2': '' if index == len(sentence) - 1 else sentence[index + 1][-2:],
      # 'next_word_suffix-3': '' if index == len(sentence) - 1 else sentence[index + 1][-3:],
      # 'next_words': '' if index >= len(sentence) - 2 else sentence[index + 1] + ' ' + sentence[index + 2]
    }

comm_suffixes={'@', 'a', 'aba', 'abais', 'ábamos', 'aban', 'abas', 'able', 'ácea', 'áceo', 'acha', 'acho', 'ación', 'ada',
               'adgo', 'ado', 'ador', 'adora', 'adura', 'áis', 'aje', 'ajo', 'al', 'algia', ' amento', 'amiento', 'amos',
               'an', 'ana', 'ancia', 'ando', 'ano', 'ante', 'anza', 'ar', 'aran', 'arca', 'areis', 'ario', 'aron', 'aré',
               'as', 'asa', 'astro', 'ata', 'ato', 'avo', 'aza', 'azgo', 'azo', 'bilidad', 'ceta', 'cete', 'cida', 'cidio',
               'cigótico', 'cilla', 'cillo', 'ción', 'cita', 'cito', 'clasa', 'cola', 'cracia', 'crata', 'dad', 'dero',
               'dor', 'dora', 'dura', 'ear', 'ececilla', 'ececillo', 'ececita', 'ececito', 'ecer', 'ecilla', 'ecillo',
               'ecita', 'ecito', 'ectomía', 'eda', 'edad', 'edo', 'edor', 'edora', 'edura', 'éis', 'ejo', 'emos', 'en',
               'eña', 'eno', 'eño', 'ense', 'ente', 'eo', 'er', 'era', 'ería', 'ero', 'eré', 'eréis', 'es', 'és', 'esa',
               'esca', 'esco', 'eta', 'ete', 'ez', 'eza', 'ezna', 'ezno', 'faga', 'fago', 'fila', 'filia', 'filo', 'fito',
               'fobia', 'fobo', 'fono', 'forme', 'geno', 'grafía', 'grafo', 'grama', 'génesis', 'í', 'ia', 'ía', 'íais',
               'íamos', 'ían', 'iano', 'ías', 'iatra', ' iatría', 'ible', 'ichuela', 'ico', 'idad', 'ido', 'idor',
               'idora', 'idura', 'iego', 'iendo', 'iente', 'ieron', 'ificar', 'il', 'illa', 'illo', 'ilo', 'imento',
               'imiento', 'imos', 'ín', 'ina', 'ing', 'ino', 'io', 'ió', 'ío', 'iré', 'iréis', 'ísima', 'ísimas',
               'ísimo', 'ísimos', 'ismo', 'ista', 'iste', 'isteis', 'ita', 'itis', 'ito', 'itud', 'ivo', 'iza',
               'ización', 'izar', 'izo', 'landia', 'latría', 'lita', 'lito', 'loga', 'logía', 'lógico', 'logo',
               'mana', 'mancia', 'mancía', 'manía', 'mano', 'mente', 'mento', 'metría', 'metro', 'miento', 'morfa',
               'morfo', 'nauta', 'nte', 'o', 'ó', 'oico', 'oide', 'oma', 'on', 'ón', 'ona', 'onas', 'ónimo', 'or',
               'osa', 'oso', 'ota', 'ote', 'pata', 'patía', 'plastia', 'podo', 's', 'saurio', 'sca', 'sco', 'scopia',
               'scopía', 'scópico', 'scopio', 'teca', 'tecnia', 'terapia', 'toma', 'tomía', 'tomo', 'trofa', 'trofia',
               'trofo', 'ucha', 'ucho', 'uco', 'udo', 'uela', 'uelo', 'ura', 'uro', 'usco', 'xión', 'yendo', 'zón',
               'zoo', 'zuela'}


def assign_suffixes(X):
    for i in range(0, len(X)):
        for suf in comm_suffixes:
            if X[i]['word'][-len(suf):].lower()==suf:
                X[i]['suffix']=suf
        X[i]['stem']=X[i]['word'][:-len(X[i]['suffix'])].lower()
        if not X[i]['is_first']:
                X[i]['prev_stem']=X[i-1]['stem']
                X[i]['prev_suffix']=X[i-1]['suffix']
                X[i-1]['next_stem']=X[i]['stem']
                X[i-1]['next_suffix']=X[i]['suffix']
        return X
            
def transform_to_dataset_span(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(feat_span(untag(tagged), index))
            y.append(tagged[index][1])
    
    return assign_suffixes(X), y


X, y = transform_to_dataset_span(iula_train)

print('Train the model...')
clf.fit(X[:size],y[:size])
print('Training done.')


print('Classify test data...')
score=0.0
for i in range(0,((int)(len(iula_test)/100)+1)):
    if (i+1)*100>len(iula_test):
        endval=len(iula_test)
    else:
        endval=(i+1)*100

    X_test, y_test = transform_to_dataset_span(iula_test[i*100:endval])

            
    score+=(endval-i*100)*clf.score(X_test, y_test)

score=score/len(iula_test)
print('Classification done. Accuracy: ' + str(score) + '.')

results['own']=score

Perfomance of own model on IULA Corpus:
Train the model...
Training done.
Classify test data...
Classification done. Accuracy: 0.8217944417247102.


# Task 2.2 - RDRPOSTagger on IULA Corpus

The subprocess call for the RDRPOSTagger is commented out since it requires root privileges and therefore should not be executed in Jupyter. It is just there for demonstration purposes and was executed in a separate shell.

In [8]:
print('Perfomance of RDR POS Tagger on IULA Corpus:')


'''
arg1='RDRPOSTagger-master/data/es/es-upos.RDR'
arg2='RDRPOSTagger-master/data/es/es-upos.DICT'
arg3='RDRPOSTagger-master/data/es/IULA_Corpus'
subprocess.call(['RDRPOSTagger-master/pSCRDRtagger/RDRPOSTagger.py', arg1, arg2, arg3])
'''
file=open('RDRPOSTagger-master/data/es/IULA_Corpus.TAGGED', 'r')

RDRPOS_res=[]
for line in file:
    sent=line.split(' ')
    for tagged_word in sent:
        it=tagged_word.rfind('/')
        word=tagged_word[:it]
        tag=tagged_word[it+1:]
        if tag[-1:]=='\n':
            tag=tag[:-1]
        if tag in ['PROPN', 'X']:
            tag='NOUN'
        if tag=='AUX':
            tag='VERB'
        if tag in ['CCONJ', 'SCONJ']:
            tag='CONJ'
        if tag=='SYM':
            tag='ADP'
        RDRPOS_res.append((word, tag))
        
print('Classify test data...')
score=0
for i in range(0,len(RDRPOS_res)):
    if RDRPOS_res[i][1]==IULA_tagged_words[i][1]:
        score+=1
score=score/len(RDRPOS_res)
print('Classification done. Accuracy: ' + str(score) + '.')
results['rdr']=score

Perfomance of RDR POS Tagger on IULA Corpus:
Classify test data...
Classification done. Accuracy: 0.931034424211554.


# Plotting the results for Task 2

In [9]:
data = [go.Bar(
            x=['own Tagger', 'RDR POS Tagger'],
            y=[results['own'], results['rdr']]
    )]

iplot(data, filename='basic-bar')