# NLP Lab Exercise 1
Jing-Llong Wu / Mtr. Nr: 3045999

# Task1

 ### Load English Data 

In [2]:
import nltk
nltk.download('treebank')
nltk.download('brown')
from nltk.corpus import treebank
from nltk.corpus import brown

# Treebank
treebank_tagged_sents = treebank.tagged_sents()[:1000]
train_size = int(0.8 * len(treebank_tagged_sents))
tb_train_sents = treebank_tagged_sents[:train_size]
tb_test_sents = treebank_tagged_sents[train_size:]

# Brown
brown_tagged_sents = brown.tagged_sents()[:1000]
train_size = int(0.8 * len(brown_tagged_sents))
bn_train_sents = brown_tagged_sents[:train_size]
bn_test_sents = brown_tagged_sents[train_size:]


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\JingLong\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\JingLong\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [3]:
def features(sentence, index):
    return {
        'word': sentence[index],
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'prefix-1': sentence[index][0],
        'suffix-1': sentence[index][-1],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1]
    }


def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

 
def transform_to_dataset(tagged_sentences):
    untagrtn, featuresrtn, taggedrtn = [], [], []
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            untagrtn.append(untag(tagged)[index])
            featuresrtn.append(features(untag(tagged), index))
            taggedrtn.append(tagged[index][1])
 
    return untagrtn, featuresrtn, taggedrtn

In [4]:
untagrtn_train_tb, featuresrtn_train_tb, taggedrtn_train_tb = transform_to_dataset(tb_train_sents)
untagrtn_test_tb, featuresrtn_test_tb, taggedrtn_test_tb = transform_to_dataset(tb_test_sents)

untagrtn_train_train_bn, featuresrtn_train_bn, taggedrtn_train_bn = transform_to_dataset(bn_train_sents)
untagrtn_train_test_bn, featuresrtn_test_bn, taggedrtn_test_bn = transform_to_dataset(bn_test_sents)


print(len(featuresrtn_train_tb))
print(len(featuresrtn_test_tb))
print()
print(len(featuresrtn_train_bn))
print(len(featuresrtn_test_bn))

19933
5251

17975
4104


## Module 1.Train model based on treebank dataset

In [5]:
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', LogisticRegression())
])

clf.fit(featuresrtn_train_tb, taggedrtn_train_tb)
#print(clf.score(featuresrtn_test_tb, taggedrtn_test_tb))

y=[clf.score(featuresrtn_test_tb, taggedrtn_test_tb), clf.score(featuresrtn_test_bn, taggedrtn_test_bn)]
print(y)

[0.9099219196343553, 0.5414230019493177]


## Module 2. Pretrained model from NLTK

In [6]:
from itertools import chain
import numpy as np
import nltk
nltk.download('maxent_treebank_pos_tagger')

result_tb = nltk.pos_tag(untagrtn_test_tb)
accuracy_tb = np.mean([x[1] == y for x, y in zip(result_tb, taggedrtn_test_tb)])
print(accuracy_tb)

result_bn = nltk.pos_tag(untagrtn_train_test_bn)
accuracy_bn = np.mean([x[1] == y for x, y in zip(result_bn, taggedrtn_test_bn)])
print(accuracy_bn)

[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     C:\Users\JingLong\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
0.8863073700247572
0.5896686159844055


## Module 3. Rule based classifiers

In [7]:
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk import DefaultTagger as df
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
from nltk import TrigramTagger as tg


patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'), (r'.*ould$', 'MD'), (r'.*\'s$', 'NN$'),               
             (r'.*s$', 'NNS'), (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]



def_model = nltk.DefaultTagger('NN')
uni_model = nltk.UnigramTagger(tb_train_sents)
bi_model = nltk.BigramTagger(tb_train_sents)
tri_model = nltk.TrigramTagger(tb_train_sents)
regexp_model = nltk.RegexpTagger(patterns)

rb_models=[def_model,regexp_model,uni_model,bi_model,tri_model]

# performance of Default Tagger
print(def_model.evaluate(tb_train_sents))
print(def_model.evaluate(tb_test_sents))
print()
# performance of Unigram Tagger
print(uni_model.evaluate(tb_train_sents))
print(uni_model.evaluate(tb_test_sents))
print()
# performance of Bigram Tagger
print(bi_model.evaluate(tb_train_sents))
print(bi_model.evaluate(tb_test_sents))
print()
# performance of Trigram Tagger
print(tri_model.evaluate(tb_train_sents))
print(tri_model.evaluate(tb_test_sents))
print()
# performance of Regex Tagger
print(regexp_model.evaluate(tb_train_sents))
print(regexp_model.evaluate(tb_test_sents))
print()

result_rb_treebank=[m.evaluate(tb_test_sents) for m in rb_models]


# Train model based on Brown dataset





brown_uni_model = nltk.UnigramTagger(bn_train_sents)
brown_bi_model = nltk.BigramTagger(bn_train_sents)
brown_tri_model = nltk.TrigramTagger(bn_train_sents)
brown_regexp_model = nltk.RegexpTagger(patterns)

brown_rb_models=[def_model,brown_regexp_model,brown_uni_model,brown_bi_model,brown_tri_model]
# performance of Default Tagger
print(def_model.evaluate(bn_train_sents))
print(def_model.evaluate(bn_test_sents))
print()
# performance of Unigram Tagger
print(uni_model.evaluate(bn_train_sents))
print(uni_model.evaluate(bn_test_sents))
print()
# performance of Bigram Tagger
print(bi_model.evaluate(bn_train_sents))
print(bi_model.evaluate(bn_test_sents))
print()
# performance of Trigram Tagger
print(tri_model.evaluate(bn_train_sents))
print(tri_model.evaluate(bn_test_sents))
print()
# performance of Regex Tagger
print(regexp_model.evaluate(bn_train_sents))
print(regexp_model.evaluate(bn_test_sents))
print()

result_rb_brown=[m.evaluate(bn_test_sents) for m in brown_rb_models]

print(result_rb_treebank)
print(result_rb_brown)

0.12712587167009481
0.12721386402589982

0.9688456328701149
0.7539516282612836

0.9116038729744644
0.06417825176156923

0.9284603421461898
0.04551513997333841

0.21015401595344405
0.2190059036374024

0.13969401947148818
0.13693957115009747

0.4516272600834492
0.44468810916179335

0.024367176634214185
0.020711500974658868

0.015076495132127955
0.011939571150097465

0.2176356050069541
0.2124756335282651

[0.12721386402589982, 0.2190059036374024, 0.7539516282612836, 0.06417825176156923, 0.04551513997333841]
[0.13693957115009747, 0.2124756335282651, 0.7268518518518519, 0.06115984405458089, 0.04191033138401559]


## Performance Results

In [10]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs
from plotly.graph_objs import *

init_notebook_mode(connected=True)


py.offline.iplot({
    "data": [plotly.graph_objs.Bar( x=['Perfoemance 1.1', 'Perfoemance 1.2', 'Perfoemance 1.4', 'Perfoemance 1.5'], y=[clf.score(featuresrtn_test_tb, taggedrtn_test_tb), accuracy_tb, clf.score(featuresrtn_test_bn, taggedrtn_test_bn), accuracy_bn])],
    "layout": Layout(title="Performance Results")
})


In [11]:
init_notebook_mode(connected=True)


py.offline.iplot({
    "data": [plotly.graph_objs.Bar( x=['Perfoemance 1.3.1', 'Perfoemance 1.3.2', 'Perfoemance 1.3.3', 'Perfoemance 1.3.4','Perfoemance 1.3.5','Perfoemance 1.6.1','Perfoemance 1.6.2','Perfoemance 1.6.3','Perfoemance 1.6.4','Perfoemance 1.6.5'], 
                                   y=result_rb_treebank+result_rb_brown)],
    "layout": Layout(title="Performance Results (Rule Based)")
})

# Task 2

## Chinese Post Tagger model

In [38]:
with open('train.txt','r', encoding = 'utf8') as f:
    tagged = []
    for line in f:
        tagged.append([nltk.tag.str2tuple(t) for t in line.split() if nltk.tag.str2tuple(t)[1] is not None])
    
cn_tagged_sents=tagged[:1000]
train_size = int(0.8 * len(cn_tagged_sents))

cn_train_sents = cn_tagged_sents[:train_size]
cn_test_sents = cn_tagged_sents[train_size:]

# To features
untag_train_cn, feature_train_cn, tag_train_cn = transform_to_dataset(cn_train_sents)
untag_test_cn, feature_test_cn, tag_test_cn = transform_to_dataset(cn_test_sents)

In [39]:
clf_cn = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', LogisticRegression())
])

clf_cn.fit(feature_train_cn, tag_train_cn)
score=clf_id.score(feature_test_cn, tag_test_cn)
print(score)

0.8817931909494607


### Since RDRPOSTagger and TreeTageer don't support the pre-trained model for Chinese , here I use the Standford POS tagger https://nlp.stanford.edu/software/tagger.shtml#About .
### Using gui-command "java -mx200m -cp "stanford-postagger.jar;" edu.stanford.nlp.tagger.maxent.MaxentTaggerGUI models/chinese-nodistsim.tagger" to tag the test data, collect output result to text file "standford_tag.txt"


In [40]:
thefile = open('test.txt', 'w',encoding = 'utf8')
for item in untag_test_cn:
  thefile.write("%s\t" % item)


In [44]:
with open('standford_tag.txt','r', encoding = 'utf8') as f:
    standford_result = []
    for line in f:
        for t in line.split():
            standford_result.append(t.split('#'))
            
#print(standford_result)
acc_cn = np.mean([x[1] == y for x, y in zip(standford_result, tag_test_cn)])
print(acc_cn)

0.031962449709432274


In [48]:
init_notebook_mode(connected=True)


py.offline.iplot({
    "data": [plotly.graph_objs.Bar( x=['Performance 2.1 ', 'Perfoemance 2.2'], y=[score,acc_cn])],
    "layout": Layout(title="Performance Results")
})
