In [8]:
import spacy
from sklearn.feature_extraction import DictVectorizer
import sklearn.svm

# Load the `en_core_web_sm` spaCy pipeline; it is used further down to
# tokenize the raw test sentence before feature extraction.
# The model package must already be installed
# (e.g. via `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

def read_conll2003(f_name):
    """Lazily read a CoNLL-2003 style file, one sentence at a time.

    Every non-blank data line must consist of exactly four
    whitespace-separated columns; only the first column (the token) and the
    fourth column (the NER tag) are kept. A blank line ends the current
    sentence, and lines starting with ``-DOCSTART-`` are skipped.

    Args:
        f_name: Path of the file to read.

    Yields:
        list[tuple[str, str]]: One non-empty list of ``(token, ner_tag)``
        pairs per sentence, in file order.

    Raises:
        AssertionError: If a data line does not have exactly four columns.
    """
    current_sentence = []
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default text encoding.
    with open(f_name, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if line.startswith("-DOCSTART-"):
                continue
            if not line:
                # Blank line: sentence boundary. Consecutive blank lines must
                # not produce empty sentences.
                if current_sentence:
                    yield current_sentence
                    current_sentence = []
                continue
            columns = line.split()
            assert len(columns) == 4, (
                f"{f_name}:{line_no}: expected 4 columns, got {len(columns)}"
            )
            current_sentence.append((columns[0], columns[3]))
    # The file may end without a trailing blank line; flush the last sentence.
    if current_sentence:
        yield current_sentence

def generate_sentence_features(sent):
    """Build one indicator-feature dict per token of a sentence.

    Args:
        sent: Indexable sequence of ``(word, ner_label)`` pairs. The label is
            unpacked but never used for the features.

    Returns:
        list[dict[str, int]]: One dict per token, in order. Each dict maps
        ``"word_<token>"`` to 1, plus ``"left_word_<previous token>"`` when
        the token is not first and ``"right_word_<next token>"`` when it is
        not last.
    """
    token_features = []
    for position, (token, _label) in enumerate(sent):
        features = {"word_" + token: 1}
        # Left neighbour exists for every token except the first.
        if position > 0:
            previous_token = sent[position - 1][0]
            features["left_word_" + previous_token] = 1
        # Right neighbour exists for every token except the last.
        if position + 1 < len(sent):
            next_token = sent[position + 1][0]
            features["right_word_" + next_token] = 1
        token_features.append(features)
    return token_features

def prep_data(sentences):
    """Flatten sentences into parallel per-token label and feature lists.

    Args:
        sentences: Iterable of sentences, each a sequence of
            ``(word, ner_label)`` pairs.

    Returns:
        tuple[list, list]: ``(all_labels, all_features)``. Entry ``i`` of
        each list belongs to the same token; features come from
        ``generate_sentence_features``.
    """
    labels, feature_dicts = [], []
    for sentence in sentences:
        per_token_features = generate_sentence_features(sentence)
        # Exactly one feature dict per token keeps the two lists aligned.
        assert len(per_token_features) == len(sentence)
        labels.extend(tag for _token, tag in sentence)
        feature_dicts.extend(per_token_features)
    return labels, feature_dicts


# --- Configuration ---------------------------------------------------------
# Training and validation splits in CoNLL-2003 format.
# NOTE(review): absolute paths under /content, presumably a Colab runtime;
# adjust them when running elsewhere.
train_data_path = "/content/train.txt"

valid_data_path = "/content/valid.txt"

# Fixed seed for LinearSVC so that repeated runs train the same model.
RANDOM_SEED = 0


# --- Load data ---------------------------------------------------------------
sentences_train = list(read_conll2003(train_data_path))
sentences_dev = list(read_conll2003(valid_data_path))


# Flatten the sentences into per-token labels and feature dicts.
train_labels, train_features = prep_data(sentences_train)
dev_labels, dev_features = prep_data(sentences_dev)


# --- Vectorize ---------------------------------------------------------------
# The feature vocabulary is learned from the training data only; the
# validation data is merely transformed with that fitted vocabulary.
vectorizer = DictVectorizer()
feature_vectors_train = vectorizer.fit_transform(train_features)
feature_vectors_dev = vectorizer.transform(dev_features)


# --- Train -------------------------------------------------------------------
classifier = sklearn.svm.LinearSVC(C=1, verbose=1, random_state=RANDOM_SEED)
classifier.fit(feature_vectors_train, train_labels)


# --- Evaluate ----------------------------------------------------------------
# `score` is the mean accuracy over all validation tokens (every label in
# `dev_labels` counts equally).
accuracy = classifier.score(feature_vectors_dev, dev_labels)
print("Classifier Accuracy:", accuracy)


# --- Predict on a raw sentence -------------------------------------------------
test_sentence = "Apple was founded by Steve Jobs in California ."
test_doc = nlp(test_sentence)
# "XXX" is a placeholder label: generate_sentence_features expects
# (word, label) pairs but does not use the label.
test_features = generate_sentence_features([(token.text, "XXX") for token in test_doc])
test_vector = vectorizer.transform(test_features)
test_predictions = classifier.predict(test_vector)

print("Test Sentence Predictions:")
for token, label in zip(test_doc, test_predictions):
    print(token.text, label)


[LibLinear]Classifier Accuracy: 0.9578482146333865
Test Sentence Predictions:
Apple B-ORG
was O
founded O
by O
Steve B-PER
Jobs I-PER
in O
California B-LOC
. O


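# Setup: install spaCy and download the `en_core_web_sm` model

The cells below were executed before the NER cell above (see their execution counts); on a fresh kernel, run them first.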

In [4]:
# Download the `en_core_web_sm` model that `spacy.load("en_core_web_sm")` loads.
!python -m spacy download en_core_web_sm




2023-11-10 09:54:24.298364: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-10 09:54:24.298462: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-10 09:54:24.298501: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-10 09:54:24.309336: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading htt

In [1]:
pip install spacy


