In [3]:
# Install the Hugging Face `datasets` package. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
%pip install datasets



In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from datasets import load_dataset
import nltk

# Download necessary NLTK data
# NOTE(review): no NLTK tagger is used in the cells visible here — confirm this
# download is still needed.
nltk.download('averaged_perceptron_tagger')

# Load the CoNLL 2003 dataset
dataset = load_dataset('conll2003')

# Fraction of each split to keep. Held in one constant so the train and test
# subsampling stay in step and the description below cannot drift from the code.
SAMPLE_FRACTION = 0.50

# Reduce the dataset size: shuffle each split with a fixed seed, then keep the
# first SAMPLE_FRACTION (50%) of the shuffled examples
train_data = dataset['train'].shuffle(seed=42).select(range(int(len(dataset['train']) * SAMPLE_FRACTION)))
test_data = dataset['test'].shuffle(seed=42).select(range(int(len(dataset['test']) * SAMPLE_FRACTION)))

# Get NER tag mappings from integers to label strings
label_names = dataset['train'].features['ner_tags'].feature.names

# Convert NER tags into readable labels
def convert_tags_to_labels(tag_list):
    """Translate a sequence of integer NER tag ids into their label strings.

    Each id is used as an index into the module-level ``label_names`` list.

    Args:
        tag_list: Iterable of integer tag ids.

    Returns:
        A new list with one label string per input id, in the same order.
    """
    labels = []
    for tag_id in tag_list:
        labels.append(label_names[tag_id])
    return labels

# Feature extraction function for each word
def extract_features(sentence, index):
    """Build the feature dictionary for the token at ``sentence[index]``.

    The features describe the token itself (surface form, casing, prefixes and
    suffixes of up to three characters, whether it is all digits), its position
    in the sentence, and its immediate left and right neighbours.

    Args:
        sentence: Sequence of token strings.
        index: Position of the token to describe.

    Returns:
        A dict mapping feature names to string or boolean values. Neighbour
        features are the empty string at the sentence boundaries.

    Raises:
        IndexError: If ``index`` is outside ``sentence``.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    features = {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Slices (word[:1], word[-1:]) are used instead of word[0] / word[-1]
        # so that an empty token produces False / '' rather than IndexError.
        # For non-empty tokens the values are unchanged.
        'is_capitalized': word[:1].isupper(),
        'is_all_caps': word.isupper(),
        'is_all_lower': word.islower(),
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'is_numeric': word.isdigit(),
        # True when any character after the first is changed by lower-casing.
        'capitals_inside': word[1:].lower() != word[1:]
    }
    return features

# Prepare dataset for feature extraction
def transform_to_dataset(dataset):
    """Flatten a sentence-level dataset into per-token features and labels.

    Args:
        dataset: Container supporting ``len()`` and integer indexing, where
            each row is a mapping with a ``'tokens'`` sequence and a parallel
            ``'ner_tags'`` sequence of integer tag ids.

    Returns:
        A tuple ``(X, y)``: ``X`` is a list of feature dicts produced by
        ``extract_features`` (one per token, in reading order) and ``y`` is the
        list of matching label strings from ``convert_tags_to_labels``.
    """
    X, y = [], []
    for i in range(len(dataset)):
        # Fetch the row once and reuse it, instead of indexing the dataset
        # separately for the tokens and for the tags.
        sample = dataset[i]
        tokens = sample['tokens']
        tags = convert_tags_to_labels(sample['ner_tags'])
        for index in range(len(tokens)):
            X.append(extract_features(tokens, index))
            y.append(tags[index])
    return X, y

# Transform the train and test datasets into per-token feature dicts (X) and
# label strings (y)
X_train, y_train = transform_to_dataset(train_data)
X_test, y_test = transform_to_dataset(test_data)

# Convert features to numerical vectors with sparse matrix.
# The vectorizer is fitted on the training features only; the test features are
# transformed with that same fitted vectorizer.
# Note: X_train / X_test are rebound here from lists of feature dicts to the
# vectorized output, so re-running this part alone needs the lines above re-run first.
vec = DictVectorizer(sparse=True)
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

# Initialize and train the Decision Tree classifier (fixed random_state)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test tokens and print a per-label report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


              precision    recall  f1-score   support

       B-LOC       0.71      0.69      0.70       809
      B-MISC       0.63      0.69      0.66       341
       B-ORG       0.60      0.52      0.56       848
       B-PER       0.60      0.62      0.61       810
       I-LOC       0.56      0.44      0.50       126
      I-MISC       0.60      0.64      0.62       102
       I-ORG       0.52      0.52      0.52       414
       I-PER       0.66      0.72      0.68       581
           O       0.98      0.99      0.98     19021

    accuracy                           0.92     23052
   macro avg       0.65      0.65      0.65     23052
weighted avg       0.92      0.92      0.92     23052



In [5]:
from sklearn.metrics import classification_report, precision_recall_fscore_support
import pandas as pd

# Helper function to get true positives, false positives, and false negatives
def get_detailed_results(tokens, gold_labels, pred_labels):
    """Classify every token's prediction as TP, TN, FP or FN.

    The three inputs are walked in parallel. ``'O'`` is treated as the
    non-entity label. Outcomes are assigned as follows:

    * gold == predicted and not ``'O'``  -> ``'True Positive'``
    * gold == predicted == ``'O'``       -> ``'True Negative'``
    * predicted is ``'O'``, gold is not  -> ``'False Negative'`` (missed entity)
    * predicted is an entity label that differs from gold
      -> ``'False Positive'`` (covers both a spurious entity on an ``'O'``
      token and an entity tagged with the wrong type)

    Note that a wrong-type prediction is reported only as 'False Positive';
    it is not additionally counted as a miss for the gold label.

    Args:
        tokens: Iterable of token strings.
        gold_labels: Iterable of gold label strings, parallel to ``tokens``.
        pred_labels: Iterable of predicted label strings, parallel to ``tokens``.

    Returns:
        A list of ``(token, gold_label, predicted_label, outcome)`` tuples.
        Because the inputs are combined with ``zip``, the result is as long as
        the shortest input.
    """
    results = []

    for token, true_label, predicted_label in zip(tokens, gold_labels, pred_labels):
        if true_label == predicted_label:
            outcome = 'True Positive' if true_label != 'O' else 'True Negative'
        elif predicted_label == 'O':
            # Labels differ and the prediction is 'O', so gold must be an entity.
            outcome = 'False Negative'
        else:
            # Labels differ and the prediction is an entity label.
            outcome = 'False Positive'
        results.append((token, true_label, predicted_label, outcome))

    return results

# Prepare tokens from test set for analysis (flattened in sentence order)
test_tokens = [token for sample in test_data for token in sample['tokens']]

# Convert the predicted and true labels into human-readable format
y_test_readable = [convert_tags_to_labels(sample['ner_tags']) for sample in test_data]
y_test_flat = [label for sublist in y_test_readable for label in sublist]
y_pred_flat = list(y_pred)

# Ensure all three sequences line up. get_detailed_results zips them together,
# and zip would silently drop trailing items if any one of them were shorter.
assert len(y_test_flat) == len(y_pred_flat), "gold and predicted label counts differ"
assert len(test_tokens) == len(y_test_flat), "token and gold label counts differ"

# Get detailed results
detailed_results = get_detailed_results(test_tokens, y_test_flat, y_pred_flat)

# Convert to DataFrame for easier viewing
df_results = pd.DataFrame(detailed_results, columns=['Token', 'Gold Label', 'Predicted Label', 'Result'])
print(df_results)

# Display a summary of true positives, false positives, and false negatives
print("Summary:")
print(df_results['Result'].value_counts())

# Optionally, you can save the detailed result to a CSV file
# df_results.to_csv('ner_detailed_results.csv', index=False)


          Token Gold Label Predicted Label          Result
0      Hartford      B-ORG           B-ORG   True Positive
1             4          O               O   True Negative
2        BOSTON      B-ORG           B-ORG   True Positive
3             2          O               O   True Negative
4            S.      B-PER           B-PER   True Positive
...         ...        ...             ...             ...
23047         (          O               O   True Negative
23048     NYMEX      B-ORG               O  False Negative
23049   premium          O               O   True Negative
23050         )          O               O   True Negative
23051         .          O               O   True Negative

[23052 rows x 4 columns]
Summary:
Result
True Negative     18744
True Positive      2491
False Positive     1503
False Negative      314
Name: count, dtype: int64
