<a href="https://colab.research.google.com/github/Sourja99/MyProjects/blob/main/sequence_labelling_codeonly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from collections import namedtuple
import sklearn.svm
from sklearn.feature_extraction import DictVectorizer


# One token line of the data: behaves like an ordinary 4-tuple, but each
# column can also be read by name (.word, .pos_label, .chunk_label, .entity_label)
OneWord = namedtuple("OneWord", "word pos_label chunk_label entity_label")

def read_conll2003(f_name):
    """Yield the sentences of a CoNLL-2003 style file, one at a time.

    Every non-blank line is split on whitespace into four columns and
    stored as a ``OneWord``. Blank lines end the current sentence and
    lines starting with ``-DOCSTART-`` are skipped.

    Args:
        f_name: Path of the file to read.

    Yields:
        A list of ``OneWord`` tuples for each sentence. Empty sentences
        (e.g. from consecutive blank lines) are never yielded.

    Raises:
        ValueError: If a word line does not have exactly four columns.
    """
    current_sentence = []  # words gathered so far for the sentence being read
    # Explicit encoding so the result does not depend on the platform's locale
    with open(f_name, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()  # drop surrounding whitespace, incl. the newline
            if line.startswith("-DOCSTART-"):  # document marker, not a word
                continue
            if not line:  # sentence break
                if current_sentence:
                    yield current_sentence
                    current_sentence = []  # ...and start a new one
                continue
            # If we made it here, we are on a normal word line
            columns = line.split()
            if len(columns) != 4:
                # A real exception instead of an assert: asserts vanish under
                # "python -O" and give no hint about where the bad line is
                raise ValueError(
                    f"{f_name}:{line_no}: expected 4 columns, "
                    f"got {len(columns)}: {line!r}"
                )
            current_sentence.append(OneWord(*columns))
    # The file may end without a trailing blank line; yield the last sentence too
    if current_sentence:
        yield current_sentence
                
def generate_sentence_features(sent):
    """Build one feature dictionary per word of a sentence.

    Args:
        sent: Indexable sequence of items that expose a ``.word``
            attribute (labels, if present, are not used here).

    Returns:
        A list with the same length as ``sent``. The dictionary at each
        position maps feature names to 1 and holds:
          * ``word_<w>``       - the word itself,
          * ``left_word_<w>``  - the previous word (absent at the first position),
          * ``right_word_<w>`` - the next word (absent at the last position).
    """
    last_idx = len(sent) - 1
    per_word = []
    for idx, token in enumerate(sent):
        feats = {"word_" + token.word: 1}
        if idx > 0:  # there is a neighbour on the left
            feats["left_word_" + sent[idx - 1].word] = 1
        if idx < last_idx:  # there is a neighbour on the right
            feats["right_word_" + sent[idx + 1].word] = 1
        per_word.append(feats)
    return per_word

def prep_data(sentences):
    """Flatten sentences into parallel per-word label and feature lists.

    Args:
        sentences: Iterable of sentences; each sentence is a sized sequence
            of items with ``.word`` and ``.pos_label`` attributes.

    Returns:
        A ``(labels, features)`` pair of equally long lists: ``labels[i]``
        is the ``pos_label`` of the i-th word over all sentences and
        ``features[i]`` is the feature dictionary generated for that word.
    """
    labels = []
    features = []
    for sentence in sentences:
        word_feats = generate_sentence_features(sentence)
        assert len(word_feats) == len(sentence)
        # Pair every word with the features generated for its position
        paired = list(zip(sentence, word_feats))
        labels.extend(token.pos_label for token, _ in paired)
        features.extend(feats for _, feats in paired)
    return labels, features


#1) Read data in
# The paths are relative to the current working directory; open() raises
# FileNotFoundError if the CONLL2003/ files are not there.
print("Read the data in")
sentences_train=list(read_conll2003("CONLL2003/train.txt"))
sentences_dev=list(read_conll2003("CONLL2003/valid.txt"))

#2) Generate the features and labels
print("Generate features")
train_labels,train_features=prep_data(sentences_train)
dev_labels,dev_features=prep_data(sentences_dev)

#3) Prepare the feature vectors for the classifier
# The vectorizer learns its feature names from the training data only and
# the dev data is then transformed with that same fitted vectorizer.
print("Generate feature vectors")
vectorizer=DictVectorizer()
feature_vectors_train=vectorizer.fit_transform(train_features)
feature_vectors_dev=vectorizer.transform(dev_features)

#4) Train the classifier
print("Train the classifier (takes a while for C=0.5 and higher)")
classifier=sklearn.svm.LinearSVC(C=0.5,verbose=1)
classifier.fit(feature_vectors_train, train_labels)

#5) Evaluate
# Keep and print the score: a bare expression is only displayed by a notebook
# when it is the last statement of a cell, and is lost when run as a script.
print("Evaluate")
dev_accuracy=classifier.score(feature_vectors_dev,dev_labels)
print("Dev accuracy:",dev_accuracy)


Read the data in


FileNotFoundError: ignored