In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.tokenize import word_tokenize
from datasets import load_dataset

# Download the NLTK Punkt tokenizer data used by word_tokenize.
# NLTK >= 3.9 reads the 'punkt_tab' package instead of the pickled 'punkt'
# models, so fetch both to stay compatible with old and new NLTK releases.
nltk.download('punkt')
nltk.download('punkt_tab')

def load_data_from_huggingface():
    """Fetch CoNLL-2003 via Hugging Face ``datasets`` and pool all its splits.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists holding, for every
        example in train, validation, test order, its ``tokens`` field and
        its ``ner_tags`` field respectively.
    """
    corpus = load_dataset("conll2003", trust_remote_code=True)
    # Walk the splits once, in a fixed order, so both lists stay aligned.
    examples = [
        example
        for split_name in ('train', 'validation', 'test')
        for example in corpus[split_name]
    ]
    sentences = [example['tokens'] for example in examples]
    labels = [example['ner_tags'] for example in examples]
    return sentences, labels

def preprocess_data(sentences, labels):
    """Flatten parallel sentence/label sequences into ``(word, tag)`` pairs.

    Sentences are paired with label sequences by position, and words with
    tags by position inside each pair. ``zip`` stops at the shorter side at
    both levels, so unmatched trailing items are dropped.

    Returns:
        A flat list of ``(word, tag)`` tuples in corpus order.
    """
    return [
        (token, tag)
        for tokens, tags in zip(sentences, labels)
        for token, tag in zip(tokens, tags)
    ]

def extract_features(sentences):
    """Build one feature dict per token across all sentences.

    Each dict holds the lower-cased token, whether the token is title-cased,
    and its last two characters. The result is flat: sentence boundaries are
    not preserved, tokens simply appear in corpus order.
    """
    return [
        {
            'word.lower()': token.lower(),
            'word.istitle()': token.istitle(),
            'suffix-2': token[-2:],
        }
        for tokens in sentences
        for token in tokens
    ]

def extract_features_for_sentence(sentence):
    """Build one feature dict per token of a single tokenized sentence.

    Args:
        sentence: An iterable of token strings.

    Returns:
        A list with one dict per token, in order, holding the lower-cased
        token, whether the token is title-cased, and its last two characters.
    """
    # The position of a token is not part of the feature set, so iterate the
    # tokens directly rather than enumerating them.
    return [
        {
            'word.lower()': word.lower(),
            'word.istitle()': word.istitle(),
            'suffix-2': word[-2:],
        }
        for word in sentence
    ]

def map_labels(labels, outside_tag=0):
    """Collapse NER tag ids into binary entity / non-entity labels.

    Every B-/I- tag maps to 1 and the outside ("O") tag maps to 0. In the
    Hugging Face CoNLL-2003 encoding, id 0 is "O" and ids 1-8 are the B-/I-
    tags for PER, ORG, LOC and MISC. Matching only ids 1 and 2 would count
    persons alone and label every other entity as non-entity.

    Args:
        labels: An iterable of per-sentence tag-id sequences.
        outside_tag: The tag id that denotes "not an entity" (default 0).

    Returns:
        A list of lists of 0/1 labels with the same nesting as ``labels``.
    """
    return [
        [0 if tag == outside_tag else 1 for tag in label_list]
        for label_list in labels
    ]

# Load the pooled CoNLL-2003 sentences and their tag-id sequences
sentences, labels = load_data_from_huggingface()

# Reduce the dataset size for demonstration purposes (optional)
sentences = sentences[:5000]
labels = labels[:5000]

# Extract per-token features and map the tags with map_labels
feature_dicts = extract_features(sentences)
labels = map_labels(labels)

# extract_features already returns one dict per token, so only the labels
# need flattening to line up with the features
flattened_features = feature_dicts
flattened_labels = [item for sublist in labels for item in sublist]
y = np.array(flattened_labels)

# Split BEFORE fitting any transformer: fitting the vectorizer or the SVD on
# the full dataset would let held-out tokens shape the feature space and
# make the evaluation below optimistic (data leakage).
train_features, test_features, y_train, y_test = train_test_split(
    flattened_features, y, test_size=0.2, random_state=42
)

# Vectorize features with sparse representation (vocabulary from train only;
# features unseen during fit are ignored by transform)
vectorizer = DictVectorizer(sparse=True)
X_train_sparse = vectorizer.fit_transform(train_features)
X_test_sparse = vectorizer.transform(test_features)

# Reduce dimensionality using Truncated SVD (components learned on train only)
svd = TruncatedSVD(n_components=100, random_state=42)
X_train = svd.fit_transform(X_train_sparse)
X_test = svd.transform(X_test_sparse)

# Full-dataset matrices, kept under their original names for any later code
# that refers to them; they are produced with the train-fitted transformers.
X = vectorizer.transform(flattened_features)
X_reduced = svd.transform(X)

# Train SVM model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions
y_pred = svm_model.predict(X_test)

# Evaluate model
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Interactive NER Prediction
print("\nModel trained. You can now input sentences for NER tagging.")
while True:
    user_input = input("\nEnter a sentence for NER tagging (or type 'exit' to quit): ")
    # Ignore surrounding whitespace so that e.g. "exit " still quits
    if user_input.strip().lower() == 'exit':
        break

    # Tokenize and extract features for the input sentence
    tokens = word_tokenize(user_input)
    if not tokens:
        # Blank input yields no tokens; scikit-learn raises ValueError when
        # asked to transform zero samples, so prompt again instead.
        continue
    features = extract_features_for_sentence(tokens)

    # Vectorize features and reduce dimensions with the already-fitted
    # vectorizer and SVD
    X_user = vectorizer.transform(features)
    X_user_reduced = svd.transform(X_user)

    # Predict NER tags
    predictions = svm_model.predict(X_user_reduced)

    # Map predictions to readable tags
    readable_tags = ['B/I' if tag == 1 else 'O' for tag in predictions]

    # Display results
    print("\nNER Predictions:")
    for token, tag in zip(tokens, readable_tags):
        print(f"{token}: {tag}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.97     12616
           1       0.57      0.17      0.26       911

    accuracy                           0.94     13527
   macro avg       0.76      0.58      0.61     13527
weighted avg       0.92      0.94      0.92     13527

Confusion Matrix:
 [[12504   112]
 [  760   151]]

Model trained. You can now input sentences for NER tagging.

Enter a sentence for NER tagging (or type 'exit' to quit): I am Nitesh Singh

NER Predictions:
I: O
am: O
Nitesh: O
Singh: O
