In [78]:
from nltk.tree import Tree
from nlp_id import tokenizer
import pickle
import os
import nltk
import wget
# default classifier
from sklearn import ensemble
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline


class PosTag:
    """Part-of-speech tagger and phrase chunker built on a scikit-learn pipeline.

    A pickled classifier is loaded at construction time and used by
    ``get_pos_tag`` / ``get_phrase_tag``. The same instance can also read the
    tab-separated training corpus, turn it into feature dicts, train a fresh
    pipeline (``train``) and persist it (``save_model``).
    """

    # Tokens that are always tagged 'SYM', regardless of the classifier output.
    # NOTE(review): only the opening curly quote is listed; confirm whether the
    # closing one should be treated as a symbol as well.
    _SYMBOLS = frozenset([
        '!', '&', '(', ')', '*', '?', ',', '.', '<', '>', '/', ':', ';',
        '[', ']', '\\', '^', '`', '{', '}', '|', '~', '"', '“', "'",
    ])

    def __init__(self, model_path="postagger.pkl", method="random_forest"):
        """Load the pickled model and set up the tokenizer.

        Args:
            model_path: path of the pickled classifier to load.
            method: classifier built by ``train``; ``"random_forest"`` or
                ``"svc"``. It is only validated when ``train`` is called.
        """
        # The original expression took the dirname of realpath("__filename__"),
        # a string literal that resolves against the working directory; the
        # same value is spelled out explicitly here.
        self.current_dir = os.path.realpath(os.getcwd())
        # A pretrained model is published at
        # https://storage.googleapis.com/kumparan-public-bucket/nlp-id/postagger_v6.pkl
        # and has to be downloaded to `model_path` beforehand.
        self.clf = self.load_model(model_path)
        self.tokenizer = tokenizer.Tokenizer()
        self.method = method

    def load_model(self, model_path):
        """Return the object unpickled from ``model_path``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load model
        files that come from a trusted source.
        """
        with open(model_path, "rb") as pickle_in:
            return pickle.load(pickle_in)

    def features(self, sentence, index):
        """Build the feature dict for one token.

        Args:
            sentence: list of tokens ``[w1, w2, ...]``.
            index: position in ``sentence`` of the token to describe.

        Returns:
            dict mapping feature names to values (the token itself, casing
            flags, prefixes/suffixes up to length 3, and the neighbouring
            tokens, with ``''`` used at the sentence boundaries).
        """
        word = sentence[index]
        lowered = word.lower()
        # Slices instead of word[0] / word[-1]: identical for non-empty tokens
        # and yield '' rather than raising IndexError for an empty token.
        first_char = word[:1]
        last_char = word[-1:]
        is_last = index == len(sentence) - 1
        return {
            'word': word,
            'is_first': index == 0,
            'is_last': is_last,
            'is_capitalized': first_char.upper() == first_char,
            'is_all_caps': word.upper() == word,
            'is_all_lower': lowered == word,
            'has_hyphen': '-' in word,
            'is_numeric': word.isdigit(),
            'capitals_inside': word[1:].lower() != word[1:],
            'prefix-1': first_char,
            'prefix-1-lower': first_char.lower(),
            'prefix-2': word[:2],
            'prefix-2-lower': word[:2].lower(),
            'prefix-3': word[:3],
            'prefix-3-lower': word[:3].lower(),
            'suffix-1': last_char,
            'suffix-1-lower': last_char.lower(),
            'suffix-2': word[-2:],
            'suffix-2-lower': word[-2:].lower(),
            'suffix-3': word[-3:],
            'suffix-3-lower': word[-3:].lower(),
            'lowercase_word': lowered,
            'prev_word': '' if index == 0 else sentence[index - 1],
            'next_word': '' if is_last else sentence[index + 1],
        }

    def get_pos_tag(self, text):
        """Tag every token of ``text``.

        The text is split into sentences with ``nltk.sent_tokenize`` and each
        sentence is tokenized and classified separately.

        Returns:
            list of ``(token, tag)`` tuples in reading order; tokens listed in
            ``_SYMBOLS`` are always tagged ``'SYM'``.
        """
        result = []
        for sent in nltk.sent_tokenize(text):
            tokens = self.tokenizer.tokenize(sent)
            # Skip sentences that produce no tokens so the classifier is never
            # asked to predict on an empty batch.
            if len(tokens) == 0:
                continue
            tags = self.clf.predict(
                [self.features(tokens, index) for index in range(len(tokens))]
            )
            for token, tag in zip(tokens, tags):
                result.append((token, 'SYM' if token in self._SYMBOLS else tag))
        return result

    def tree_to_list(self, tree_data):
        """Flatten a chunk tree into a list of ``(text, label)`` tuples.

        Subtrees become ``(space-joined leaf tokens, chunk label)``; tokens
        that were not chunked are passed through as ``(token, tag)``.
        """
        result = []
        for subtree in tree_data:
            if isinstance(subtree, Tree):
                phrase = " ".join(token for token, _pos in subtree.leaves())
                result.append((phrase, subtree.label()))
            else:
                result.append((subtree[0], subtree[1]))
        return result

    def chunk_tag(self, tag):
        """Group a list of ``(token, tag)`` tuples into phrases.

        Args:
            tag: tagged tokens as returned by ``get_pos_tag``.

        Returns:
            list of ``(text, label)`` tuples, see ``tree_to_list``.
        """
        # Grammar for nltk.RegexpParser; one "LABEL: {pattern}" rule per line.
        chunk_rule = '''
            DP: {<NUM><NNP><NUM>}
            NP: {<NNP><NNP>+}
            NP: {<NN>+<JJ>}
            NP: {<FW><FW>+}
            NP: {<NP><NP>+}
            ADJP: {<JJ><ADV>}
            ADJP: {<ADV><JJ>}
            ADJP: {<JJ>+}
            ADJP: {<NEG>*<ADJP>}
            VP: {<NEG>*<VB>}
            NUMP: {<NUM><NUM>+}
            '''
        chunk_parser = nltk.RegexpParser(chunk_rule)
        tree = chunk_parser.parse(tag)
        return self.tree_to_list(tree)

    def get_phrase_tag(self, text):
        """Return the phrase-level tags of ``text`` (``[]`` for empty input)."""
        if not text:
            return []
        tag = self.get_pos_tag(text)
        if not tag:
            # Nothing was tokenized (e.g. whitespace-only text): nothing to chunk.
            return []
        return self.chunk_tag(tag)

    def read_dataset(self, dataset_path=None):
        """Read a tab-separated training corpus.

        Each non-empty line holds ``token<TAB>tag`` (extra columns are
        ignored); an empty line ends the current sentence.

        Args:
            dataset_path: file to read. Defaults to
                ``<current_dir>/nlp_id/data/dataset_postag.txt``.

        Returns:
            ``(sentences, tags)``: two parallel lists of lists.

        Raises:
            IndexError: if a non-empty line contains no tab character.
        """
        if not dataset_path:
            dataset_path = os.path.join(self.current_dir, 'nlp_id', 'data', 'dataset_postag.txt')

        # Explicit encoding so the corpus is decoded the same way on every
        # platform instead of with the locale default.
        with open(dataset_path, encoding="utf-8") as f:
            lines = f.read().split("\n")

        sentences, tags = [], []
        words, word_tags = [], []
        for line in lines:
            if line:
                columns = line.split("\t")
                words.append(columns[0])
                word_tags.append(columns[1])
                continue
            # Empty line: close the sentence collected so far, if any.
            if words:
                sentences.append(words)
                tags.append(word_tags)
            words, word_tags = [], []

        # Flush the last sentence when the file does not end with an empty
        # line; it used to be dropped silently.
        if words:
            sentences.append(words)
            tags.append(word_tags)
        return sentences, tags

    def transform_to_dataset(self, sentences, tags):
        """Flatten sentences into per-token training samples.

        Args:
            sentences: list of token lists.
            tags: list of tag lists, parallel to ``sentences``.

        Returns:
            ``(X, y)`` where ``X`` is the list of feature dicts (one per
            token) and ``y`` the list of the corresponding tags.
        """
        X, y = [], []
        for sentence_idx, sentence in enumerate(sentences):
            sentence_tags = tags[sentence_idx]
            for index in range(len(sentence)):
                X.append(self.features(sentence, index))
                y.append(sentence_tags[index])
        return X, y

    def train(self, sentences, tags):
        """Fit a fresh pipeline and store it in ``self.clf``.

        Args:
            sentences: per-token feature dicts (``X`` from
                ``transform_to_dataset``).
            tags: the matching labels (``y`` from ``transform_to_dataset``).

        Raises:
            ValueError: if ``self.method`` is neither ``"svc"`` nor
                ``"random_forest"``.
        """
        if self.method == "svc":
            # Imported here because the file's top-level imports do not
            # provide LinearSVC.
            from sklearn.svm import LinearSVC
            classifier = LinearSVC(C=4, dual=False, random_state=2020)
        elif self.method == "random_forest":
            classifier = ensemble.RandomForestClassifier(
                criterion='gini', n_estimators=15, random_state=2020
            )
        else:
            raise ValueError(
                "unknown method %r; expected 'svc' or 'random_forest'" % (self.method,)
            )

        self.clf = Pipeline([
            ('vectorizer', DictVectorizer(sparse=True)),
            ('classifier', classifier),
        ])
        self.clf.fit(sentences, tags)

    def save_model(self, model_path):
        """Pickle the current classifier (``self.clf``) to ``model_path``."""
        with open(model_path, "wb") as pickle_out:
            pickle.dump(self.clf, pickle_out)


In [18]:
# url = "https://storage.googleapis.com/kumparan-public-bucket/nlp-id/postagger_v6.pkl"
# wget.download(url, "postagger.pkl")

# Existing model (Random Forest)

In [19]:
# Build a tagger with the default arguments; the constructor unpickles the
# model stored at "postagger.pkl".
postag = PosTag()

In [20]:
s, t = postag.read_dataset()

In [21]:
s2, t2 = postag.transform_to_dataset(s,t)

In [22]:
from sklearn.model_selection import cross_val_score, KFold

In [23]:
vect = DictVectorizer(sparse=True)

In [24]:
# Turn the per-token feature dicts into a sparse feature matrix.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# cross-validation split, so every fold sees the vocabulary of its test part.
s3 = vect.fit_transform(s2)

In [27]:
# Random forest with the same hyper-parameters as the "random_forest" branch
# of PosTag.train.
clf = ensemble.RandomForestClassifier(criterion='gini', n_estimators=15, random_state=2020)

In [28]:
# from sklearn.svm import LinearSVC

# clf = LinearSVC(C=4, dual=False, random_state=2020)

In [29]:
# 5-fold split with shuffling; the fixed seed keeps the folds identical for
# every classifier evaluated with this splitter.
cv = KFold(n_splits=5, shuffle=True, random_state=2020)

In [30]:
# Per-fold macro-averaged F1 of the random forest, evaluated with 4 parallel jobs.
check = cross_val_score(clf, s3, t2, cv=cv, scoring="f1_macro", verbose=True, n_jobs=4)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed: 16.8min finished


In [31]:
check

array([0.99566112, 0.99609849, 0.93895801, 0.9936406 , 0.99021064])

In [90]:
# Mean of the per-fold macro-F1 scores of the random forest.
f1_score = sum(check)/len(check)
f1_score

0.9829137717212463

# Linear SVC

In [35]:
from sklearn.svm import LinearSVC

# Linear SVM with the same hyper-parameters as the "svc" branch of PosTag.train.
svc_clf = LinearSVC(C=4, dual=False, random_state=2020)

In [36]:
# Per-fold macro-averaged F1 of the linear SVM, on the same matrix, labels and
# splitter as the random-forest run.
svc_check = cross_val_score(svc_clf, s3, t2, cv=cv, scoring="f1_macro", verbose=True, n_jobs=4)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed: 14.2min finished


In [91]:
# Mean of the per-fold macro-F1 scores of the linear SVM (rebinds f1_score).
f1_score = sum(svc_check)/len(svc_check)
f1_score

0.9954005524803555

# Test

In [56]:
postag_rf = PosTag()

In [57]:
# Load the default corpus as parallel token/tag lists, then flatten it into
# per-token feature dicts (s2) and labels (t2).
s, t = postag_rf.read_dataset()
s2, t2 = postag_rf.transform_to_dataset(s,t)

In [58]:
# Fit a new pipeline on the full corpus; this replaces the classifier that
# was loaded from disk when postag_rf was constructed.
postag_rf.train(s2, t2)

In [66]:
modelpath = "postagger_research_rf"

In [67]:
# Pickle the freshly trained pipeline to `modelpath`.
postag_rf.save_model(modelpath)

In [68]:
postag_rf.get_phrase_tag("Dilansir dari Antara Foto, Ketua Asosiasi Pengelola Pusat Belanja Indonesia (APPBI) DKI Ellen Hidayat mengatakan sebagai upaya pencegahan penyebaran virus COVID-19 di ruang publik, pusat perbelanjaan Plaza Indonesia akan tutup sementara mulai 25 Maret hingga 3 April 2020.")

[('Dilansir', 'VP'),
 ('dari', 'IN'),
 ('Antara', 'NNP'),
 ('Foto', 'NN'),
 (',', 'SYM'),
 ('Ketua Asosiasi Pengelola Pusat Belanja Indonesia', 'NP'),
 ('(', 'SYM'),
 ('APPBI', 'NNP'),
 (')', 'SYM'),
 ('DKI Ellen Hidayat', 'NP'),
 ('mengatakan', 'VP'),
 ('sebagai', 'IN'),
 ('upaya', 'NN'),
 ('pencegahan', 'NN'),
 ('penyebaran', 'NN'),
 ('virus', 'NN'),
 ('COVID-19', 'NNP'),
 ('di', 'IN'),
 ('ruang', 'NN'),
 ('publik', 'NN'),
 (',', 'SYM'),
 ('pusat', 'NN'),
 ('perbelanjaan', 'NN'),
 ('Plaza Indonesia', 'NP'),
 ('akan', 'ADV'),
 ('tutup', 'VP'),
 ('sementara', 'CC'),
 ('mulai', 'VP'),
 ('25', 'NUM'),
 ('Maret', 'NNP'),
 ('hingga', 'IN'),
 ('3 April 2020', 'DP'),
 ('.', 'SYM')]

In [48]:
# import nltk
# nltk.download("punkt")

In [69]:
postag_rf.get_phrase_tag("Menyusul Pandemi Covid-19, sejumlah objek wisata di Kecamatan Ciracap, Kabupaten Sukabumi ditutup untuk sementara waktu.")

[('Menyusul', 'VP'),
 ('Pandemi Covid-19', 'NP'),
 (',', 'SYM'),
 ('sejumlah', 'NUM'),
 ('objek', 'NN'),
 ('wisata', 'NN'),
 ('di', 'IN'),
 ('Kecamatan Ciracap', 'NP'),
 (',', 'SYM'),
 ('Kabupaten Sukabumi', 'NP'),
 ('ditutup', 'VP'),
 ('untuk', 'SC'),
 ('sementara', 'CC'),
 ('waktu', 'NN'),
 ('.', 'SYM')]

In [70]:
postag_rf.get_phrase_tag("Informasi terakhir dari Jubir Penanganan Corona Achmad Yurianto beberapa hari lalu, kondisi Budi Karya membaik.")

[('Informasi', 'NN'),
 ('terakhir', 'NUM'),
 ('dari', 'IN'),
 ('Jubir Penanganan Corona Achmad Yurianto', 'NP'),
 ('beberapa', 'NUM'),
 ('hari', 'NN'),
 ('lalu', 'CC'),
 (',', 'SYM'),
 ('kondisi', 'NN'),
 ('Budi Karya', 'NP'),
 ('membaik', 'VP'),
 ('.', 'SYM')]

In [71]:
postag_rf.get_phrase_tag("Juga Nationale Dodenherdenking (Hari Nasional Mengenang Mereka yang Gugur, red) di alun-alun Dam, Amsterdam, akan diselenggarakan dengan format tidak seperti biasa.")

[('Juga', 'ADV'),
 ('Nationale Dodenherdenking', 'NP'),
 ('(', 'SYM'),
 ('Hari Nasional Mengenang', 'NP'),
 ('Mereka', 'PR'),
 ('yang', 'SC'),
 ('Gugur', 'NNP'),
 (',', 'SYM'),
 ('red', 'FW'),
 (')', 'SYM'),
 ('di', 'IN'),
 ('alun-alun', 'NN'),
 ('Dam', 'NNP'),
 (',', 'SYM'),
 ('Amsterdam', 'NNP'),
 (',', 'SYM'),
 ('akan', 'ADV'),
 ('diselenggarakan', 'VP'),
 ('dengan', 'IN'),
 ('format', 'NN'),
 ('tidak', 'NEG'),
 ('seperti', 'IN'),
 ('biasa', 'ADJP'),
 ('.', 'SYM')]

In [72]:
postag_rf.get_phrase_tag("Salah satu pasien positif corona di Yogyakarta Iwan Dwiprahasto meninggal dunia pada Selasa (24/3/2020)")

[('Salah', 'ADJP'),
 ('satu', 'NUM'),
 ('pasien positif', 'NP'),
 ('corona', 'NN'),
 ('di', 'IN'),
 ('Yogyakarta Iwan Dwiprahasto', 'NP'),
 ('meninggal', 'VP'),
 ('dunia', 'NN'),
 ('pada', 'IN'),
 ('Selasa', 'NNP'),
 ('(', 'SYM'),
 ('24/3/2020', 'NUM'),
 (')', 'SYM')]

In [73]:
postag_rf.get_phrase_tag("Pegawai Terindikasi Corona, Bank Mandiri Setop Operasional Cabang Kyai Tapa")

[('Pegawai', 'NNP'),
 ('Terindikasi', 'VP'),
 ('Corona', 'NNP'),
 (',', 'SYM'),
 ('Bank Mandiri Setop Operasional Cabang Kyai Tapa', 'NP')]

In [74]:
postag_rf.get_phrase_tag("Pekan lalu, Otoritas Jasa Keuangan atau OJK menerbitkan Peraturan OJK (POJK) No.11/POJK.03/2020 tentang Stimulus Perekonomian Nasional Sebagai Kebijakan Countercyclical Dampak Penyebaran Coronavirus Disease 2019")

[('Pekan', 'NNP'),
 ('lalu', 'CC'),
 (',', 'SYM'),
 ('Otoritas Jasa Keuangan', 'NP'),
 ('atau', 'CC'),
 ('OJK', 'NNP'),
 ('menerbitkan', 'VP'),
 ('Peraturan OJK', 'NP'),
 ('(', 'SYM'),
 ('POJK', 'NNP'),
 (')', 'SYM'),
 ('No', 'NNP'),
 ('.', 'SYM'),
 ('11/POJK', 'NUM'),
 ('.', 'SYM'),
 ('03/2020', 'NUM'),
 ('tentang', 'VP'),
 ('Stimulus Perekonomian Nasional', 'NP'),
 ('Sebagai', 'IN'),
 ('Kebijakan', 'NN'),
 ('Countercyclical Dampak', 'NP'),
 ('Penyebaran', 'NN'),
 ('Coronavirus Disease', 'NP'),
 ('2019', 'NUM')]

In [79]:
# method="svc" is the value PosTag.train checks to build the LinearSVC pipeline.
postag_svc = PosTag(method="svc")

In [80]:
s, t = postag_svc.read_dataset()
s2, t2 = postag_svc.transform_to_dataset(s,t)

In [81]:
postag_svc.train(s2, t2)



In [96]:
postag_svc.save_model("postagger_research_svm.pkl")

In [84]:
postag_svc.get_phrase_tag("Dilansir dari Antara Foto, Ketua Asosiasi Pengelola Pusat Belanja Indonesia (APPBI) DKI Ellen Hidayat mengatakan sebagai upaya pencegahan penyebaran virus COVID-19 di ruang publik, pusat perbelanjaan Plaza Indonesia akan tutup sementara mulai 25 Maret hingga 3 April 2020.")

[('Dilansir', 'VP'),
 ('dari', 'IN'),
 ('Antara Foto', 'NP'),
 (',', 'SYM'),
 ('Ketua Asosiasi Pengelola Pusat Belanja Indonesia', 'NP'),
 ('(', 'SYM'),
 ('APPBI', 'NNP'),
 (')', 'SYM'),
 ('DKI Ellen Hidayat', 'NP'),
 ('mengatakan', 'VP'),
 ('sebagai', 'IN'),
 ('upaya', 'NN'),
 ('pencegahan', 'NN'),
 ('penyebaran', 'NN'),
 ('virus', 'NN'),
 ('COVID-19', 'NNP'),
 ('di', 'IN'),
 ('ruang', 'NN'),
 ('publik', 'NN'),
 (',', 'SYM'),
 ('pusat', 'NN'),
 ('perbelanjaan', 'NN'),
 ('Plaza Indonesia', 'NP'),
 ('akan', 'ADV'),
 ('tutup', 'VP'),
 ('sementara', 'CC'),
 ('mulai', 'VP'),
 ('25', 'NUM'),
 ('Maret', 'NNP'),
 ('hingga', 'IN'),
 ('3 April 2020', 'DP'),
 ('.', 'SYM')]

In [85]:
postag_svc.get_phrase_tag("Menyusul Pandemi Covid-19, sejumlah objek wisata di Kecamatan Ciracap, Kabupaten Sukabumi ditutup untuk sementara waktu.")

[('Menyusul', 'VP'),
 ('Pandemi Covid-19', 'NP'),
 (',', 'SYM'),
 ('sejumlah', 'NUM'),
 ('objek', 'NN'),
 ('wisata', 'NN'),
 ('di', 'IN'),
 ('Kecamatan Ciracap', 'NP'),
 (',', 'SYM'),
 ('Kabupaten Sukabumi', 'NP'),
 ('ditutup', 'VP'),
 ('untuk', 'SC'),
 ('sementara', 'CC'),
 ('waktu', 'NN'),
 ('.', 'SYM')]

In [86]:
postag_svc.get_phrase_tag("Informasi terakhir dari Jubir Penanganan Corona Achmad Yurianto beberapa hari lalu, kondisi Budi Karya membaik.")

[('Informasi', 'NN'),
 ('terakhir', 'NUM'),
 ('dari', 'IN'),
 ('Jubir Penanganan Corona Achmad Yurianto', 'NP'),
 ('beberapa', 'NUM'),
 ('hari', 'NN'),
 ('lalu', 'CC'),
 (',', 'SYM'),
 ('kondisi', 'NN'),
 ('Budi Karya', 'NP'),
 ('membaik', 'VP'),
 ('.', 'SYM')]

In [87]:
postag_svc.get_phrase_tag("Juga Nationale Dodenherdenking (Hari Nasional Mengenang Mereka yang Gugur, red) di alun-alun Dam, Amsterdam, akan diselenggarakan dengan format tidak seperti biasa.")

[('Juga', 'ADV'),
 ('Nationale Dodenherdenking', 'NP'),
 ('(', 'SYM'),
 ('Hari Nasional Mengenang', 'NP'),
 ('Mereka', 'PR'),
 ('yang', 'SC'),
 ('Gugur', 'NNP'),
 (',', 'SYM'),
 ('red', 'FW'),
 (')', 'SYM'),
 ('di', 'IN'),
 ('alun-alun', 'NN'),
 ('Dam', 'NNP'),
 (',', 'SYM'),
 ('Amsterdam', 'NNP'),
 (',', 'SYM'),
 ('akan', 'ADV'),
 ('diselenggarakan', 'VP'),
 ('dengan', 'IN'),
 ('format', 'NN'),
 ('tidak', 'NEG'),
 ('seperti', 'IN'),
 ('biasa', 'ADJP'),
 ('.', 'SYM')]

In [95]:
postag_svc.get_phrase_tag("Salah satu pasien positif corona di Yogyakarta Iwan Dwiprahasto meninggal dunia pada Selasa (24/3/2020)")

[('Salah', 'ADJP'),
 ('satu', 'NUM'),
 ('pasien positif', 'NP'),
 ('corona', 'FW'),
 ('di', 'IN'),
 ('Yogyakarta Iwan Dwiprahasto', 'NP'),
 ('meninggal', 'VP'),
 ('dunia', 'NN'),
 ('pada', 'IN'),
 ('Selasa', 'NNP'),
 ('(', 'SYM'),
 ('24/3/2020', 'NUM'),
 (')', 'SYM')]

In [89]:
postag_svc.get_phrase_tag("Pegawai Terindikasi Corona, Bank Mandiri Setop Operasional Cabang Kyai Tapa")

[('Pegawai Terindikasi Corona', 'NP'),
 (',', 'SYM'),
 ('Bank Mandiri Setop Operasional Cabang Kyai Tapa', 'NP')]