In [1]:
# Mount Google Drive into the Colab runtime; the dataset paths read further
# down all live under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 3.8MB/s 
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [3]:
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [4]:
# Training split. Despite the .conllu extension the file is parsed as CSV;
# the sentence-building cell reads its ID, WORD and POS_TAG columns.
# Rows with any missing value are discarded, and the index is renumbered
# with drop=True so the old index is not kept around as an extra column.
data = (
    pd.read_csv('/content/drive/My Drive/Assignment 4/hi-ud-train.conllu')
    .dropna()
    .reset_index(drop=True)
)

In [5]:
# Group the flat token table into sentences. Each sentence is a list of
# (ID-as-string, WORD, POS_TAG) tuples and is closed by a '.' token.
dataF = list()
data_in = list()
for tok_id, word, pos_tag in zip(data['ID'], data['WORD'], data['POS_TAG']):
  data_in.append((str(tok_id), word, pos_tag))
  if word == '.':
    dataF.append(data_in)
    data_in = []
# Tokens following the last '.' would otherwise be lost silently; keep them
# as a final sentence.
if data_in:
  dataF.append(data_in)
  data_in = []

In [6]:
#Test conllu file
# Tab-separated rows; blank lines are skipped and a sentence is closed by a
# row whose second field is '.'.
# NOTE(review): the space before '.conllu' is part of the original path -
# confirm it matches the file name on Drive.
with open('/content/drive/My Drive/Assignment 4/hi-ud-test .conllu', 'r', encoding='utf-8') as file1:
  # The context manager guarantees the handle is closed after reading.
  Lines = file1.readlines()
dataG = list()
data_in = list()
for line in Lines:
  a = tuple((line.strip()).split('\t'))
  if a[0] == '':
    continue
  data_in.append(a)
  if a[1] == '.':
    dataG.append(data_in)
    data_in = []
# Tokens following the last '.' would otherwise be lost silently; keep them
# as a final sentence.
if data_in:
  dataG.append(data_in)
  data_in = []

In [7]:
# Descriptive aliases for the sentence lists built from the train and test files.
train_sents, test_sents = dataF, dataG

In [8]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of rows whose element at index 1 is the word form.
    The result holds features of the word itself plus features of the words
    directly before and after it. ``'BOS'`` is set when ``i`` is not positive
    and ``'EOS'`` is set when ``i`` is the last index (or beyond).
    """
    token = sent[i][1]

    # Features of the current word.
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-4:]'] = token[-4:]
    feats['word[-3:]'] = token[-3:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()

    # Left context.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1][1]
        feats['-1:word.lower()'] = left.lower()
        feats['-1:word.istitle()'] = left.istitle()
        feats['-1:word.isupper()'] = left.isupper()
        feats['-1:word.len()'] = len(left)

    # Right context.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1][1]
        feats['+1:word.lower()'] = right.lower()
        feats['+1:word.istitle()'] = right.istitle()
        feats['+1:word.isupper()'] = right.isupper()
        feats['+1:word.len()'] = len(right)

    return feats


def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third field) of every (id, token, label) row of ``sent``."""
    collected = []
    for _tok_id, _token, label in sent:
        collected.append(label)
    return collected

def sent2tokens(sent):
    """Return the token (second field) of every (id, token, label) row of ``sent``."""
    collected = []
    for _tok_id, token, _label in sent:
        collected.append(token)
    return collected

In [9]:
# Per-sentence feature sequences (X_*) and gold label sequences (y_*) for
# the train and test sentence lists.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

In [10]:
%%time
# Constructor settings for the CRF, kept together so they are easy to tune.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_params)
# Fit on the training features/labels; left as the cell's last expression.
crf.fit(X_train, y_train)

CPU times: user 1.55 s, sys: 1.52 ms, total: 1.55 s
Wall time: 1.56 s


In [11]:
# Tag set exposed by the fitted CRF via `classes_`; reused below as the
# `labels` argument of the F1 and classification-report calls.
labels = list(crf.classes_)

In [12]:
# Tag the test sentences and show the weighted F1 restricted to `labels`.
# NOTE: `y_pred` is re-bound to the train-set predictions further down
# (crf.predict(X_train)), so re-run this cell before re-running the test
# report / accuracy cells.
y_pred = crf.predict(X_test)
test_f1 = metrics.flat_f1_score(
    y_test, y_pred, labels=labels, average='weighted'
)
test_f1

  average, "true nor predicted", 'F-score is', len(true_sum)


0.8372947209173379

In [13]:
# Report rows in alphabetical tag order. A key such as (name[1:], name[0])
# only makes sense for prefixed tags like "B-ORG"/"I-ORG"; the tags here are
# bare POS tags, where that key yields an arbitrary-looking order.
sorted_labels = sorted(labels)

# Per-tag precision / recall / F1 on the test sentences.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

           X      0.000     0.000     0.000         0
        PART      0.970     0.970     0.970        33
       CCONJ      1.000     1.000     1.000        25
       SCONJ      0.667     0.667     0.667         3
         ADJ      0.612     0.755     0.676        94
         ADP      0.962     0.977     0.970       309
         ADV      0.667     0.476     0.556        21
        VERB      0.818     0.818     0.818        99
         DET      0.800     0.889     0.842        36
        NOUN      0.765     0.842     0.802       329
        PRON      0.841     0.815     0.828        65
       PROPN      0.638     0.510     0.567       145
         NUM      1.000     0.880     0.936        25
       PUNCT      1.000     0.830     0.907       135
         AUX      0.935     0.935     0.935       139

   micro avg      0.838     0.839     0.839      1458
   macro avg      0.778     0.758     0.765      1458
weighted avg      0.841   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Token-level accuracy on the test set: predicted tags equal to the gold tag
# at the same sentence/position, divided by the number of predicted tags.
correct_pred = sum(
    tag == y_test[sent_idx][tok_idx]
    for sent_idx, pred_seq in enumerate(y_pred)
    for tok_idx, tag in enumerate(pred_seq)
)
allPred = sum(len(pred_seq) for pred_seq in y_pred)
print("Overall test accuracy : ",correct_pred/allPred)

Overall test accuracy :  0.8382453735435229


In [15]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` line per transition.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs.
    """
    for pair, weight in trans_features:
        label_from, label_to = pair
        line = "%-6s -> %-7s %0.6f" % (label_from, label_to, weight)
        print(line)

print("Top 10 likely transitions:")
print()
# Rank every learned transition once by weight (highest first); the two
# listings are the head and the tail of that ranking.
ranked_transitions = Counter(crf.transition_features_).most_common()
print_transitions(ranked_transitions[:10])
print('____________________________')

print("\nTop 10 unlikely transitions:")
print()
print_transitions(ranked_transitions[-10:])

Top 10 likely transitions:

VERB   -> AUX     3.870462
PROPN  -> PROPN   2.877369
AUX    -> AUX     2.106064
ADJ    -> NOUN    1.987040
PROPN  -> ADP     1.962522
VERB   -> SCONJ   1.812537
AUX    -> SCONJ   1.806367
NOUN   -> ADP     1.500518
NUM    -> NOUN    1.500236
PRON   -> ADP     1.495635
____________________________

Top 10 unlikely transitions:

PROPN  -> DET     -1.248055
NUM    -> PRON    -1.295098
ADP    -> AUX     -1.321371
ADP    -> CCONJ   -1.333775
ADV    -> AUX     -1.337072
AUX    -> ADP     -1.664858
CCONJ  -> AUX     -1.680747
ADJ    -> PRON    -1.859629
ADJ    -> ADP     -2.248199
DET    -> ADP     -2.360762


Next, we take the train accuracy to be the accuracy of the same CRF (the one already fitted on the train set) when it predicts on the train set itself.


In [16]:
# Predict on the sentences the model was fitted on and print the weighted F1
# restricted to `labels`. This re-binds `y_pred` to the train-set predictions.
y_pred = crf.predict(X_train)
train_f1 = metrics.flat_f1_score(
    y_train, y_pred, labels=labels, average='weighted'
)
print("F1 Score", train_f1)

F1 Score 0.996529919353909


In [17]:
# Report rows in alphabetical tag order. A key such as (name[1:], name[0])
# only makes sense for prefixed tags like "B-ORG"/"I-ORG"; the tags here are
# bare POS tags, where that key yields an arbitrary-looking order.
sorted_labels = sorted(labels)

# Per-tag precision / recall / F1 on the training sentences.
print(metrics.flat_classification_report(
    y_train, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

           X      1.000     1.000     1.000         2
        PART      1.000     1.000     1.000       163
       CCONJ      0.993     1.000     0.997       150
       SCONJ      0.984     1.000     0.992        61
         ADJ      0.998     1.000     0.999       570
         ADP      0.998     0.999     0.998      1387
         ADV      0.982     0.982     0.982       111
        VERB      0.998     0.980     0.989       640
         DET      0.996     0.996     0.996       231
        NOUN      0.998     0.998     0.998      1597
        PRON      0.998     0.995     0.997       431
       PROPN      1.000     0.996     0.998       708
         NUM      1.000     1.000     1.000       152
       PUNCT      1.000     1.000     1.000       564
         AUX      0.984     1.000     0.992       730

    accuracy                          0.997      7497
   macro avg      0.995     0.996     0.996      7497
weighted avg      0.997   

In [18]:
# Token-level accuracy on the train set: predicted tags equal to the gold tag
# at the same sentence/position, divided by the number of predicted tags.
correct_pred = sum(
    tag == y_train[sent_idx][tok_idx]
    for sent_idx, pred_seq in enumerate(y_pred)
    for tok_idx, tag in enumerate(pred_seq)
)
allPred = sum(len(pred_seq) for pred_seq in y_pred)
print("Overall train accuracy : ",correct_pred/allPred)

Overall train accuracy :  0.9965319461117781
