## Implementation of Conditional Random Field

## Resources



https://github.com/lancifollia/crf
https://homepages.inf.ed.ac.uk/csutton/publications/crftut-fnt.pdf

In [1]:
import pandas as pd
from collections import Counter

## Loading the Corpus

### The corpus comes from [CoNLL 2000 Chunking Data] (https://www.clips.uantwerpen.be/conll2000/chunking/). 

First line example: 

Confidence NN B-NP

The first column contains the current word, the second its part-of-speech (POS) tag as derived by the Brill tagger and the third its chunk tag 

For each paragraph there is an empty line seperating the paragraphs

Now we will write a function to seperate the words (word and POS) from the labels/y (chunk tag)


In [2]:
def read_corpus(f1):
    """ Read the corpus file and append to lists.
    
    INPUT: corpus file
    Output:
    X - list of paragraphs of all words besides label
    Y - list of labels for each line in paragraph
    d - list of tuples corresponding to X,Y from each paragraph
         first paragraph X (word/pos) is \n " , d[0][0])
         first paragraph Y is \n " , d[0][1])
    
    """
    X = list()
    Y = list()
    d = list()
    
    with open(f1,"r") as infile:
        for line in infile:
            line = line.strip().split()
            if len(line) > 0:
                X.append(line[:-1])
                Y.append(line[-1])
            if len(line) == 0:  
                if len(X) > 0:
                    d.append((X,Y)) # Append the X and Y to data if you go to new paragraph
                    X = list()
                    Y = list()      # Re-initialize list 
    d.append((X,Y)) # Append the last X and Y to data if you go to new paragraph
    
    return d

In [3]:
d = read_corpus("data/chunking_small/smallest_train.data")
print("list of first paragraph X (word/pos) is \n " , d[0][0])
print("list of first paragraph Y is \n " , d[0][1])

print("list of 2nd paragraph X (word/pos) is \n " , d[1][0])
print("list of 2nd paragraph Y is \n " , d[1][1])

list of first paragraph X (word/pos) is 
  [['Confidence', 'NN'], ['in', 'IN'], ['the', 'DT'], ['pound', 'NN'], ['is', 'VBZ'], ['widely', 'RB'], ['expected', 'VBN'], ['to', 'TO'], ['take', 'VB'], ['another', 'DT'], ['sharp', 'JJ'], ['dive', 'NN'], ['if', 'IN'], ['trade', 'NN'], ['figures', 'NNS'], ['for', 'IN'], ['September', 'NNP'], [',', ','], ['due', 'JJ'], ['for', 'IN'], ['release', 'NN'], ['tomorrow', 'NN'], [',', ','], ['fail', 'VB'], ['to', 'TO'], ['show', 'VB'], ['a', 'DT'], ['substantial', 'JJ'], ['improvement', 'NN'], ['from', 'IN'], ['July', 'NNP'], ['and', 'CC'], ['August', 'NNP'], ["'s", 'POS'], ['near-record', 'JJ'], ['deficits', 'NNS'], ['.', '.']]
list of first paragraph Y is 
  ['B-NP', 'B-PP', 'B-NP', 'I-NP', 'B-VP', 'I-VP', 'I-VP', 'I-VP', 'I-VP', 'B-NP', 'I-NP', 'I-NP', 'B-SBAR', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'O', 'B-ADJP', 'B-PP', 'B-NP', 'B-NP', 'O', 'B-VP', 'I-VP', 'I-VP', 'B-NP', 'I-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'B-NP', 'I-NP', 'I-NP', 'O']
list

## Get features

### initialize model

In [4]:
class FeatureSet():
    
    feature_dic = dict()
    observation_set = set()
    empirical_counts = Counter()
    num_features = 0

    label_dic = {"*": 0} # Initialize dictionaries with filler starting values
    label_array = ["*"]
    prev_y = 0         # Initialize starting index as 0
    features = []
    training_feature_data = []
    
    d = read_corpus("data/chunking_small/smallest_train.data")
    
    
    def __init__(self):
        pass
    
    def get_labels(self, Y, t):     
        """
        INPUT - 
            X = list of paragraphs X (word/pos)
            Y = list of paragraphs Y (word/pos)
            t = time in paragraph
            label_dic = label dictionary with label then index    {'*': 0, 'B-NP': 1}
            label_array = label
        OUTPUT - 
        """            
        try:
            y = self.label_dic[Y[t]]   # Check to see if label is seen before
        except KeyError:
            y = len(self.label_dic)
            self.label_dic[Y[t]] = y    # put new number in for label
            self.label_array.append(Y[t]) # Append label name to array
        self.y = y
        
    def get_features(self, X,t):
        """
        INPUT - 
            X = list of paragraphs X (word/pos)
            t = time in paragraph
        OUTPUT - 
            features = list of features
                feature name: word or pos (space seperated)
        """   
        features = []

        length = len(X)
        #For current line
        features.append('U[0]:%s' % X[t][0])
        features.append('POS_U[0]:%s' % X[t][1])
    
        if t < length-1:
            features.append('U[+1]:%s' % (X[t+1][0]))
            features.append('B[0]:%s %s' % (X[t][0], X[t+1][0]))
            features.append('POS_U[+1]:%s' % X[t+1][1])
            features.append('POS_B[0]:%s %s' % (X[t][1], X[t+1][1]))
            # print('POS_B[0]:%s %s' % (X[t][1], X[t+1][1]))
            if t < length-2:
                features.append('U[+2]:%s' % (X[t+2][0]))
                features.append('POS_U[+2]:%s' % (X[t+2][1]))
                features.append('POS_B[+1]:%s %s' % (X[t+1][1], X[t+2][1]))
                features.append('POS_T[0]:%s %s %s' % (X[t][1], X[t+1][1], X[t+2][1]))
        if t > 0:
            features.append('U[-1]:%s' % (X[t-1][0]))
            features.append('B[-1]:%s %s' % (X[t-1][0], X[t][0]))
            features.append('POS_U[-1]:%s' % (X[t-1][1]))
            features.append('POS_B[-1]:%s %s' % (X[t-1][1], X[t][1]))
            if t < length-1:
                features.append('POS_T[-1]:%s %s %s' % (X[t-1][1], X[t][1], X[t+1][1]))
            if t > 1:
                features.append('U[-2]:%s' % (X[t-2][0]))
                features.append('POS_U[-2]:%s' % (X[t-2][1]))
                features.append('POS_B[-2]:%s %s' % (X[t-2][1], X[t-1][1]))
                features.append('POS_T[-2]:%s %s %s' % (X[t-2][1], X[t-1][1], X[t][1]))    
        return features
        

    def add_features(self, features):
        """
        INPUT - 
            features = list of features from current line
                feature name: word or pos (space seperated)
                       'U[0]:%s'
        OUTPUT -
            updated dictionaries
                feature_dic[f][(prev_y, y)]
                empirical_counts[ feature_dic[f][(prev_y, y)] ]
                
            total amount of features
                num_features
        """
        y = self.y
        prev_y = self.prev_y

        
        for f in features:

            if f in self.feature_dic.keys():
                
                #For prev_y y
                if (prev_y,y) in self.feature_dic[f].keys():
                    self.empirical_counts[ self.feature_dic[f][(prev_y, y)] ] += 1
                else:
                    self.feature_dic[f][(prev_y, y)] = self.num_features
                    self.empirical_counts[self.num_features] += 1
                    self.num_features += 1

                #For current y only  (-1,y)
                if (-1,y) in  self.feature_dic[f].keys():
                    self.empirical_counts[ self.feature_dic[f][(-1, y)] ] += 1
                else:
                    self.feature_dic[f][(-1, y)] = self.num_features
                    self.empirical_counts[self.num_features] += 1
                    self.num_features += 1

            # If you havent seen feature before 
            else:               
                self.feature_dic[f] = dict()                   # Create new dic for that feature

                #For prev y and y
                self.feature_dic[f][(prev_y, y)] = self.num_features
                self.empirical_counts[self.num_features] += 1
                self.num_features += 1

                # For current y only
                self.feature_dic[f][(-1, y)] = self.num_features
                self.empirical_counts[self.num_features] += 1
                self.num_features += 1
                      
        self.prev_y = y # when done adding set previous y   
        
        
    def get_feature_data(self, X, t, features):
        """
        INPUT - 
                feature_dic[f][(prev_y, y)]     Now filled in from previous loop
        OUTPUT -
                feature_list_dic[(prev_y, y)] = feature_id
                
                training_feature_data
                
                    [paragraph][sentence][word] dic of [prev y, y] or current[-1,y] with a index of where it is in features
                    Example from first timepoint
                    [((0, 1), {0, 2, 4, 6, 8, 10, 12, 14, 16, 18}), ((-1, 1), {1, 3, 5, 7, 9, 11, 13, 15, 17, 19}), 
                    ((1, 3), {1705, 90, 506, 1709}), ((-1, 3), {1706, 397, 399, 1429, 91}), ((3, 3), 
                    {1428, 396, 398, 391}), ((6, 1), {466}), ((2, 1), {1760, 1761, 1762, 719, 1757, 1758, 1759}),
                    ((1, 1), {749}), ((7, 8), {652}), ((-1, 8), {653}), ((7, 7), {1210, 1212, 1206, 1214}), 
                    ((-1, 7), {1215, 1211, 1213, 1207}), ((5, 5), {254}), ((-1, 5), {255}), ((4, 5), {864})]

        """
        feature_list_dic = dict()

        for f in features:
            # feature_dic[f][(prev_y, y)]
            for (prev_y, y), feature_id in self.feature_dic[f].items():  # get label and feature id
                if (prev_y, y) in feature_list_dic.keys():
                    feature_list_dic[(prev_y, y)].add(feature_id)        # 
                else:
                    feature_list_dic[(prev_y, y)] = {feature_id}

        l = [ ((prev_y, y), feature_ids) for (prev_y, y), feature_ids in feature_list_dic.items() ]
        return l
#         print(l)
#         print("*********")

        
        


In [5]:
F = FeatureSet()
for X, Y in F.d:
    for t in range(len(X)):
        F.get_labels(Y, t)
        features = F.get_features(X,t)
        F.add_features(features)

        


# # Get 
for X, Y in F.d:
    for t in range(len(X)):
        features = F.get_features(X,t)
        l = F.get_feature_data(X,t, features)
    F.training_feature_data.append(l)
        

In [16]:
print("Number of features are", F.num_features)
print("length of feature dic is", len(F.feature_dic))
print("Labels are :", F.label_dic)
print("Training feature data is .....")
#Training feature data is [paragraph][sentence][word] dic of [prev y, y] or current[-1,y] with a index of where it is in features
for l in F.training_feature_data[0][:][:]:
    print(l)  

Number of features are 1856
length of feature dic is 750
Labels are : {'*': 0, 'B-NP': 1, 'B-PP': 2, 'I-NP': 3, 'B-VP': 4, 'I-VP': 5, 'B-SBAR': 6, 'O': 7, 'B-ADJP': 8}
Training feature data is .....
((3, 7), {1184, 1186, 1188, 1190, 1192, 1194, 1196, 1178, 1180, 1182})
((-1, 7), {1185, 1187, 1189, 1191, 1193, 1195, 1197, 1179, 1181, 1183})
((3, 2), {971, 556})
((-1, 2), {972, 557})
((3, 6), {458})
((-1, 6), {459})
((2, 1), {741})
((-1, 1), {742})
((3, 4), {1622})
((-1, 4), {1623})


In [None]:
#Now create feature data

def get_feature_data(self, X, t, features):
    """
    INPUT - 
            feature_dic[f][(prev_y, y)]     Now filled in from previous loop
    OUTPUT -
            feature_list_dic[(prev_y, y)] = feature_id

    """
    feature_list_dic = dict()
    
    for f in features:
        # feature_dic[f][(prev_y, y)]

        for (prev_y, y), feature_id in self.feature_dic[f].items():  # get label and feature id
            labels = (prev_y, y)
            if labels in feature_list_dic.keys():
                feature_list_dic[labels].add(feature_id)        
            else:
                feature_list_dic[labels] = {feature_id}
                           
    l = [ (labels, feature_ids) for labels, feature_ids in feature_list_dic.items() ]
    return l
        
    def _get_training_feature_data(self):
        return [ [ self.get_feature_list(X, t) for t in range(len(X))  ] for X, _ in self.d]
        
        
        
        
def get_feature_list(self, X, t):
    #                 feature_dic[f][(prev_y, y)]
    feature_list_dic = dict()
    for feature_string in self.feature_func(X, t):
        for (prev_y, y), feature_id in self.feature_dic[feature_string].items():
            if (prev_y, y) in feature_list_dic.keys():
                feature_list_dic[(prev_y, y)].add(feature_id)
            else:
                feature_list_dic[(prev_y, y)] = {feature_id}

    return [ ((prev_y, y), feature_ids) for (prev_y, y), feature_ids in feature_list_dic.items() ]


    
    training_feature_data = self._get_training_feature_data()

## Train the model

### Initialize the model with the feature data