In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [2]:
import nltk
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from datasets import load_dataset
from nltk.corpus import stopwords

# Download necessary NLTK data
# NOTE(review): the perceptron tagger is not used by any cell visible here;
# the download is kept in case a later cell relies on it.
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Load the stop words for English (a set gives O(1) membership checks)
stop_words = set(stopwords.words('english'))

# Load the CoNLL 2003 dataset.
# The dataset repository ships a loading script; without trust_remote_code=True
# load_dataset() stops at an interactive [y/N] prompt, which breaks
# "Restart & Run All". Passing the flag runs that script without asking, so
# only keep it for repositories you trust.
dataset = load_dataset('conll2003', trust_remote_code=True)

# Sample dataset structure
# Each entry in dataset['train'] or dataset['test'] looks like:
# {
#   'id': '0',
#   'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
#   'ner_tags': [3, 0, 3, 0, 0, 0, 3, 0, 0],  # NER tag integers
#   'pos_tags': [22, 23, 20, 23, 21, 23, 20, 23, 23],  # POS tag integers
#   'chunk_tags': [11, 10, 11, 10, 10, 10, 11, 10, 10]
# }

# Take a shuffled subset of the train/test splits for faster execution
SUBSET_FRACTION = 0.50  # share of each split that is kept
SHUFFLE_SEED = 42       # fixed seed so the same subset is selected on every run
train_data = dataset['train'].shuffle(seed=SHUFFLE_SEED).select(
    range(int(len(dataset['train']) * SUBSET_FRACTION))
)
test_data = dataset['test'].shuffle(seed=SHUFFLE_SEED).select(
    range(int(len(dataset['test']) * SUBSET_FRACTION))
)

# Get NER tag mappings from integers to label strings
label_names = dataset['train'].features['ner_tags'].feature.names

# Convert NER tags into readable labels
def convert_tags_to_labels(tag_list):
    """Translate integer NER tag ids into their label strings.

    Each id in ``tag_list`` is used as an index into the module-level
    ``label_names`` sequence.

    Args:
        tag_list: Iterable of integer tag ids.

    Returns:
        A new list with one label per id, in the same order.
    """
    readable = []
    for tag_id in tag_list:
        readable.append(label_names[tag_id])
    return readable

# Feature extraction function for each word
def extract_features(sentence, index):
    """Build the feature dict describing the token at ``sentence[index]``.

    Args:
        sentence: Sequence of token strings making up one sentence.
        index: Position of the token to describe.

    Returns:
        ``None`` when the lower-cased token is in the module-level
        ``stop_words`` collection (callers use this to drop the token);
        otherwise a dict of string/bool features for the token and its
        immediate neighbours.
    """
    word = sentence[index]

    # Stop words produce no features at all.
    # NOTE(review): this also drops stop words that carry an entity tag, so
    # such tokens never reach training or evaluation - confirm this is intended.
    if word.lower() in stop_words:
        return None

    last_index = len(sentence) - 1
    tail = word[1:]

    features = {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Slices (word[:1], word[-1:]) instead of word[0] / word[-1] so that an
        # empty token yields '' / False rather than raising IndexError.
        'is_capitalized': word[:1].isupper(),
        'is_all_caps': word.isupper(),
        'is_all_lower': word.islower(),
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        # Neighbouring tokens; '' marks the sentence boundary.
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'is_numeric': word.isdigit(),
        # True when any character after the first is upper-case.
        'capitals_inside': tail.lower() != tail
    }
    return features

# Prepare dataset for feature extraction
def transform_to_dataset(dataset):
    """Flatten sentence-level examples into per-token features and labels.

    Args:
        dataset: Iterable of examples; each example is a mapping with a
            'tokens' entry (token strings) and an 'ner_tags' entry (integer
            tag ids, one per token).

    Returns:
        A tuple ``(X, y)`` where ``X`` is a list of feature dicts and ``y``
        the list of label strings at the same positions. Tokens for which
        ``extract_features`` returns ``None`` (stop words) appear in neither.
    """
    X, y = [], []
    # Iterate the examples directly: each one is fetched once instead of
    # indexing dataset[i] twice per row, and any iterable is accepted.
    for example in dataset:
        tokens = example['tokens']
        tags = convert_tags_to_labels(example['ner_tags'])
        for index in range(len(tokens)):
            features = extract_features(tokens, index)
            if features is not None:  # None marks a skipped stop word
                X.append(features)
                y.append(tags[index])
    return X, y

# Transform the train and test datasets into per-token feature dicts / labels
X_train, y_train = transform_to_dataset(train_data)
X_test, y_test = transform_to_dataset(test_data)

# Convert features to numerical vectors.
# The vectorizer is fitted on the training features only; the test features
# are mapped with the same fitted vocabulary.
vec = DictVectorizer(sparse=True)
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

# One matrix row per label, otherwise fit/predict would be misaligned.
assert X_train.shape[0] == len(y_train)
assert X_test.shape[0] == len(y_test)

# Initialize and train the SVM.
# random_state is fixed so repeated runs of this cell give the same model.
clf = LinearSVC(random_state=42)
clf.fit(X_train, y_train)

# Predict the NER labels for the test set
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

       B-LOC       0.78      0.82      0.80       808
      B-MISC       0.73      0.79      0.76       341
       B-ORG       0.74      0.70      0.72       844
       B-PER       0.78      0.78      0.78       808
       I-LOC       0.72      0.65      0.68       125
      I-MISC       0.61      0.70      0.65        98
       I-ORG       0.69      0.67      0.68       383
       I-PER       0.81      0.85      0.83       580
           O       0.99      0.98      0.98     13497

    accuracy                           0.93     17484
   macro avg       0.76      0.77      0.76     17484
weighted avg       0.93      0.93      0.93     17484



In [3]:
import nltk
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from datasets import load_dataset
from nltk.corpus import stopwords

# Download necessary NLTK data
# NOTE(review): the perceptron tagger is not used by any cell visible here;
# the download is kept in case a later cell relies on it.
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Load the stop words for English (a set gives O(1) membership checks)
stop_words = set(stopwords.words('english'))

# Load the CoNLL 2003 dataset.
# The dataset repository ships a loading script; without trust_remote_code=True
# load_dataset() stops at an interactive [y/N] prompt, which breaks
# "Restart & Run All". Passing the flag runs that script without asking, so
# only keep it for repositories you trust.
dataset = load_dataset('conll2003', trust_remote_code=True)

# Load a shuffled subset of the train/test sets for faster execution
SUBSET_FRACTION = 0.50  # share of each split that is kept
SHUFFLE_SEED = 42       # fixed seed so the same subset is selected on every run
train_data = dataset['train'].shuffle(seed=SHUFFLE_SEED).select(
    range(int(len(dataset['train']) * SUBSET_FRACTION))
)
test_data = dataset['test'].shuffle(seed=SHUFFLE_SEED).select(
    range(int(len(dataset['test']) * SUBSET_FRACTION))
)

# Get NER tag mappings from integers to label strings
label_names = dataset['train'].features['ner_tags'].feature.names

# Convert NER tags into readable labels
def convert_tags_to_labels(tag_list):
    """Translate integer NER tag ids into their label strings.

    Each id in ``tag_list`` is used as an index into the module-level
    ``label_names`` sequence.

    Args:
        tag_list: Iterable of integer tag ids.

    Returns:
        A new list with one label per id, in the same order.
    """
    readable = []
    for tag_id in tag_list:
        readable.append(label_names[tag_id])
    return readable

# Feature extraction function for each word
def extract_features(sentence, index):
    """Build the feature dict describing the token at ``sentence[index]``.

    Args:
        sentence: Sequence of token strings making up one sentence.
        index: Position of the token to describe.

    Returns:
        ``None`` when the lower-cased token is in the module-level
        ``stop_words`` collection (callers use this to drop the token);
        otherwise a dict of string/bool features for the token and its
        immediate neighbours.
    """
    word = sentence[index]

    # Stop words produce no features at all.
    # NOTE(review): this also drops stop words that carry an entity tag, so
    # such tokens never reach training or evaluation - confirm this is intended.
    if word.lower() in stop_words:
        return None

    last_index = len(sentence) - 1
    tail = word[1:]

    features = {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Slices (word[:1], word[-1:]) instead of word[0] / word[-1] so that an
        # empty token yields '' / False rather than raising IndexError.
        'is_capitalized': word[:1].isupper(),
        'is_all_caps': word.isupper(),
        'is_all_lower': word.islower(),
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        # Neighbouring tokens; '' marks the sentence boundary.
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'is_numeric': word.isdigit(),
        # True when any character after the first is upper-case.
        'capitals_inside': tail.lower() != tail
    }
    return features

# Prepare dataset for feature extraction
def transform_to_dataset(dataset):
    """Flatten sentence-level examples into per-token features and labels.

    Args:
        dataset: Iterable of examples; each example is a mapping with a
            'tokens' entry (token strings) and an 'ner_tags' entry (integer
            tag ids, one per token).

    Returns:
        A tuple ``(X, y)`` where ``X`` is a list of feature dicts and ``y``
        the list of label strings at the same positions. Tokens for which
        ``extract_features`` returns ``None`` (stop words) appear in neither.
    """
    X, y = [], []
    # Iterate the examples directly: each one is fetched once instead of
    # indexing dataset[i] twice per row, and any iterable is accepted.
    for example in dataset:
        tokens = example['tokens']
        tags = convert_tags_to_labels(example['ner_tags'])
        for index in range(len(tokens)):
            features = extract_features(tokens, index)
            if features is not None:  # None marks a skipped stop word
                X.append(features)
                y.append(tags[index])
    return X, y

# Transform the train and test datasets into per-token feature dicts / labels
X_train, y_train = transform_to_dataset(train_data)
X_test, y_test = transform_to_dataset(test_data)

# Convert features to numerical vectors (vocabulary fitted on train only)
vec = DictVectorizer(sparse=True)
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)


def fit_and_report(clf, header):
    """Fit ``clf`` on the training matrix and print its test-set report.

    Reads the module-level ``X_train``/``y_train``/``X_test``/``y_test``.

    Args:
        clf: Unfitted estimator exposing ``fit`` and ``predict``.
        header: Line printed immediately before the classification report.

    Returns:
        The predictions of the fitted estimator for ``X_test``.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(header)
    print(classification_report(y_test, predictions))
    return predictions


# SVM with polynomial kernel (degree 3)
clf_poly = SVC(kernel='poly', degree=3)
y_pred_poly = fit_and_report(
    clf_poly, "Classification report for SVM with Polynomial Kernel:"
)

# SVM with RBF kernel
clf_rbf = SVC(kernel='rbf')
y_pred_rbf = fit_and_report(
    clf_rbf, "\nClassification report for SVM with RBF Kernel:"
)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification report for SVM with Polynomial Kernel:
              precision    recall  f1-score   support

       B-LOC       0.82      0.83      0.83       808
      B-MISC       0.79      0.71      0.75       341
       B-ORG       0.74      0.63      0.68       844
       B-PER       0.64      0.82      0.72       808
       I-LOC       0.80      0.42      0.55       125
      I-MISC       0.72      0.63      0.67        98
       I-ORG       0.64      0.58      0.60       383
       I-PER       0.75      0.68      0.72       580
           O       0.98      0.99      0.98     13497

    accuracy                           0.92     17484
   macro avg       0.77      0.70      0.72     17484
weighted avg       0.92      0.92      0.92     17484


Classification report for SVM with RBF Kernel:
              precision    recall  f1-score   support

       B-LOC       0.77      0.84      0.80       808
      B-MISC       0.76      0.68      0.72       341
       B-ORG       0.70      0

In [None]:
from sklearn.metrics import classification_report, precision_recall_fscore_support
import pandas as pd

# Helper function that labels each token prediction as a true/false positive or true/false negative (applied below to the linear SVM predictions)
def get_detailed_results(tokens, gold_labels, pred_labels):
    """Label every token prediction with its outcome category.

    The three inputs are walked in parallel with ``zip``, so iteration stops
    at the shortest of them.

    Categories:
        * 'True Positive'  - prediction equals gold and gold is an entity tag.
        * 'True Negative'  - prediction equals gold and gold is 'O'.
        * 'False Negative' - gold is an entity tag but 'O' was predicted.
        * 'False Positive' - prediction differs from gold and is an entity
          tag; this covers both an 'O' token tagged as an entity and an
          entity tagged with the wrong entity label.

    Args:
        tokens: Iterable of token strings.
        gold_labels: Iterable of gold label strings.
        pred_labels: Iterable of predicted label strings.

    Returns:
        List of ``(token, gold_label, predicted_label, category)`` tuples.
    """
    results = []

    for token, true_label, predicted_label in zip(tokens, gold_labels, pred_labels):
        if true_label == predicted_label:
            # 'O' marks non-entity tokens.
            outcome = 'True Positive' if true_label != 'O' else 'True Negative'
        elif predicted_label == 'O':
            # Labels differ and the prediction is 'O', so gold is an entity
            # that was missed.
            outcome = 'False Negative'
        else:
            # Labels differ and an entity tag was predicted. (Predicted 'O'
            # with gold 'O' cannot reach this point, so no further case exists.)
            outcome = 'False Positive'
        results.append((token, true_label, predicted_label, outcome))

    return results

# Prepare tokens and labels from test set for analysis, taking stop words into account
test_tokens = []
y_test_flat = []
for sample in test_data:
    tokens = sample['tokens']
    tags = convert_tags_to_labels(sample['ner_tags'])
    for index in range(len(tokens)):
        features = extract_features(tokens, index)
        if features:  # Only include tokens that are not skipped as stop words
            test_tokens.append(tokens[index])
            y_test_flat.append(tags[index])

# y_pred holds one prediction per non-stop-word test token, so it is already
# flat. It is produced by an earlier cell and must come from the same
# test_data and stop-word filter used above.
y_pred_flat = list(y_pred)

# get_detailed_results pairs the lists with zip, which would silently truncate
# on a mismatch, so fail loudly if kernel state is stale.
assert len(y_test_flat) == len(y_pred_flat), f"Length mismatch: y_test_flat ({len(y_test_flat)}) vs y_pred_flat ({len(y_pred_flat)})"


# Get detailed results
detailed_results = get_detailed_results(test_tokens, y_test_flat, y_pred_flat)

# Convert to DataFrame for easier viewing
df_results = pd.DataFrame(detailed_results, columns=['Token', 'Gold Label', 'Predicted Label', 'Result'])
print(df_results)

# Display a summary of true positives, false positives, and false negatives
print("Summary:")
print(df_results['Result'].value_counts())

# Optionally, you can save the detailed result to a CSV file
# df_results.to_csv('ner_detailed_results.csv', index=False)

          Token Gold Label Predicted Label         Result
0      Hartford      B-ORG           B-ORG  True Positive
1             4          O               O  True Negative
2        BOSTON      B-ORG           B-ORG  True Positive
3             2          O               O  True Negative
4            S.      B-PER           B-PER  True Positive
...         ...        ...             ...            ...
17479         (          O               O  True Negative
17480     NYMEX      B-ORG           B-ORG  True Positive
17481   premium          O               O  True Negative
17482         )          O               O  True Negative
17483         .          O               O  True Negative

[17484 rows x 4 columns]
Summary:
Result
True Negative     13265
True Positive      3053
False Positive      981
False Negative      185
Name: count, dtype: int64
