In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from datasets import load_dataset
import nltk

# Download necessary NLTK data
# NOTE(review): none of the code visible in this cell uses the tagger; the
# download is kept in case a later cell relies on it.
nltk.download('averaged_perceptron_tagger')

# Subsampling settings, named once so the train and test splits cannot drift
# apart and the comments cannot disagree with the code.
SAMPLE_FRACTION = 0.50  # share of each split that is kept
SHUFFLE_SEED = 42       # fixed seed so the same examples are selected on every run

# Load the CoNLL 2003 dataset.
# The loader otherwise stops at an interactive "run the custom code? [y/N]"
# prompt; passing trust_remote_code=True answers it up front so the cell runs
# unattended. Inspect https://hf.co/datasets/conll2003 before trusting it.
dataset = load_dataset('conll2003', trust_remote_code=True)

# Reduce the dataset size: shuffle each split, then keep the first
# SAMPLE_FRACTION (50%) of its examples.
train_data = dataset['train'].shuffle(seed=SHUFFLE_SEED).select(
    range(int(len(dataset['train']) * SAMPLE_FRACTION))
)
test_data = dataset['test'].shuffle(seed=SHUFFLE_SEED).select(
    range(int(len(dataset['test']) * SAMPLE_FRACTION))
)

# Get NER tag mappings from integers to label strings
label_names = dataset['train'].features['ner_tags'].feature.names

# Convert NER tags into readable labels
def convert_tags_to_labels(tag_list):
    """Map each integer tag in ``tag_list`` to its entry in the module-level ``label_names``.

    Returns a new list with one label per input tag, in the same order.
    """
    labels = []
    for tag_id in tag_list:
        labels.append(label_names[tag_id])
    return labels

# Feature extraction function for each word
def extract_features(sentence, index):
    """Build the feature dictionary for the token at ``sentence[index]``.

    Args:
        sentence: Sequence of token strings making up one sentence.
        index: Position of the token to describe.

    Returns:
        A dict of string and boolean features: the token itself, position
        flags, casing flags, prefixes/suffixes of up to three characters,
        the neighbouring tokens ('' at the sentence boundaries), whether the
        token is all digits, and whether any character after the first
        changes when lower-cased.

    Raises:
        IndexError: If ``index`` is out of range for ``sentence``.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    # One-character slices (word[:1], word[-1:]) are used instead of word[0]
    # and word[-1] so an empty token yields '' / False rather than raising
    # IndexError; for non-empty tokens the results are identical.
    features = {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        'is_capitalized': word[:1].isupper(),
        'is_all_caps': word.isupper(),
        'is_all_lower': word.islower(),
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'is_numeric': word.isdigit(),
        'capitals_inside': word[1:].lower() != word[1:]
    }
    return features

# Prepare dataset for feature extraction
def transform_to_dataset(dataset):
    """Flatten a dataset of sentences into per-token features and labels.

    Args:
        dataset: Indexable collection supporting ``len()`` whose items are
            mappings with a 'tokens' entry (token strings) and a 'ner_tags'
            entry (integer tag ids).

    Returns:
        A tuple ``(X, y)``: ``X`` is a list with one feature dict per token
        (from ``extract_features``) and ``y`` is the parallel list of label
        strings (from ``convert_tags_to_labels``).
    """
    X, y = [], []
    for i in range(len(dataset)):
        # Fetch the row once and reuse it, instead of indexing the dataset
        # separately for 'tokens' and for 'ner_tags'.
        example = dataset[i]
        tokens = example['tokens']
        tags = convert_tags_to_labels(example['ner_tags'])
        for index in range(len(tokens)):
            X.append(extract_features(tokens, index))
            y.append(tags[index])
    return X, y

# Build per-token feature dicts and label lists for the train and test splits
(X_train, y_train), (X_test, y_test) = (
    transform_to_dataset(split) for split in (train_data, test_data)
)

# Turn the feature dicts into sparse numeric matrices; the vectorizer is
# fitted on the training features only and then reused for the test features
vec = DictVectorizer(sparse=True)
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

# Fit a k-NN classifier with k=2 (change n_neighbors to tune)
clf = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)

# Predict labels for the test tokens and print per-class metrics
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

       B-LOC       0.57      0.87      0.69       404
      B-MISC       0.48      0.72      0.57       162
       B-ORG       0.49      0.50      0.50       418
       B-PER       0.62      0.54      0.58       424
       I-LOC       0.44      0.48      0.46        64
      I-MISC       0.50      0.63      0.56        67
       I-ORG       0.27      0.39      0.32       184
       I-PER       0.66      0.30      0.41       298
           O       0.99      0.97      0.98      9438

    accuracy                           0.90     11459
   macro avg       0.56      0.60      0.56     11459
weighted avg       0.91      0.90      0.90     11459

