In [26]:
# Import necessary libraries
import nltk
from tqdm.notebook import tqdm
from datasets import load_dataset
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from nltk.tokenize import word_tokenize
from string import punctuation
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle

# Download the NLTK resources used by this notebook. The stopwords corpus backs
# `SW` below. NOTE(review): no visible cell calls `word_tokenize`, so the 'punkt'
# download may be unnecessary — confirm before removing.
nltk.download('punkt')
nltk.download("stopwords")

# Seed used as the default `random_state` when training the SVM.
SEED = 0
# Number of features produced per token by `vectorize_word`.
FEATURES_COUNT = 6
# English stopword list from NLTK, used for the "is stopword" feature.
SW = stopwords.words("english")
# Individual punctuation characters from `string.punctuation`, used for the
# "is punctuation" feature.
PUNCT = list(punctuation)


In [27]:
# Function to create feature vector for a word
def vectorize_word(w, scaled_position):
    """Build the hand-crafted feature vector for a single token.

    Args:
        w: The token text. An empty string is accepted; both capitalisation
            features are then 0.
        scaled_position: The token's position within its sentence, already
            normalised by the caller; returned unchanged as the last feature.

    Returns:
        A list of six values, in this order:
        ``[title, allcaps, length, is_stopword, is_punct, scaled_position]``,
        where the four flags are 0/1 integers and ``length`` is ``len(w)``.
    """
    # Slice instead of indexing so an empty token yields False rather than
    # raising IndexError.
    title = int(w[:1].isupper())
    allcaps = int(w.isupper())
    # Stopword membership is checked case-insensitively.
    sw = int(w.lower() in SW)
    # PUNCT holds single characters, so only a token that is exactly one
    # punctuation character matches.
    punct = int(w in PUNCT)
    return [title, allcaps, len(w), sw, punct, scaled_position]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/harshvive14/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/harshvive14/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
# Function to process and create dataset
def create_data_set(data):
    """Flatten a sentence-level dataset into per-token arrays.

    Args:
        data: An iterable of records, each exposing ``"tokens"`` and
            ``"ner_tags"`` sequences that are indexed in parallel.

    Returns:
        A ``(words, features, labels)`` tuple of NumPy arrays:
        ``words`` is an object array of every token in order, ``features``
        is a float32 array with one ``vectorize_word`` row per token, and
        ``labels`` is a float32 array holding 1.0 where the token's tag is
        greater than 0 and 0.0 otherwise.
    """
    all_tokens, feature_rows, binary_labels = [], [], []

    for sentence in tqdm(data):
        sentence_tags = sentence["ner_tags"]
        sentence_tokens = sentence["tokens"]
        n_tokens = len(sentence_tokens)

        for idx, token in enumerate(sentence_tokens):
            # Position is expressed as a fraction of the sentence length.
            feature_rows.append(vectorize_word(token, idx / n_tokens))
            # Collapse the tag to a binary "tag > 0" target.
            binary_labels.append(int(sentence_tags[idx] > 0))

        all_tokens.extend(sentence_tokens)

    return (
        np.asarray(all_tokens, dtype="object"),
        np.asarray(feature_rows, dtype=np.float32),
        np.asarray(binary_labels, dtype=np.float32),
    )


In [29]:
# Function to train SVM model
def train_svm_model(X_train, y_train, scaler, C=1.0, kernel='linear', class_weight='balanced', random_state=SEED):
    """Scale the training features and fit an SVC on them.

    Args:
        X_train: Unscaled training features.
        y_train: Training labels aligned with ``X_train``.
        scaler: An already-fitted transformer; only ``transform`` is called
            on it here.
        C, kernel, class_weight, random_state: Forwarded unchanged to ``SVC``.

    Returns:
        The fitted ``SVC`` instance.
    """
    scaled_features = scaler.transform(X_train)
    svm_params = dict(
        C=C,
        kernel=kernel,
        class_weight=class_weight,
        random_state=random_state,
        verbose=True,  # the solver's progress output is always enabled
    )
    # ``fit`` returns the estimator itself, so the fitted model is returned.
    return SVC(**svm_params).fit(scaled_features, y_train)


Found cached dataset conll2003 (/Users/harshvive14/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/14041 [00:00<?, ?it/s]

  0%|          | 0/3250 [00:00<?, ?it/s]

  0%|          | 0/3453 [00:00<?, ?it/s]

[LibSVM]..................
*.........
*
optimization finished, #iter = 27236
obj = -15443.285139, rho = -1.088682
nSV = 17088, nBSV = 9558
Total nSV = 17088
              precision    recall  f1-score   support

         0.0       0.99      0.96      0.98     42759
         1.0       0.82      0.97      0.89      8603

    accuracy                           0.96     51362
   macro avg       0.91      0.96      0.93     51362
weighted avg       0.96      0.96      0.96     51362



In [None]:
# Function to save trained model
def save_model(model, model_name):
    """Serialize ``model`` to the file at ``model_name`` using pickle.

    Args:
        model: Any picklable object (used in this notebook for the trained
            classifier and the fitted scaler).
        model_name: Destination path; the file is created or overwritten.

    Returns:
        None.
    """
    # The context manager guarantees the handle is flushed and closed, even
    # if pickling raises part-way through.
    with open(model_name, 'wb') as model_file:
        pickle.dump(model, model_file)


In [None]:
# Main execution cell
# End-to-end pipeline: load CoNLL-2003, build per-token features, fit the
# scaler and SVM on the training split, persist both, and report metrics on
# the validation split.
if __name__ == "__main__":
    # Load dataset (only the "train" and "validation" splits are used in this cell)
    data = load_dataset("conll2003")
    data_train = data["train"] 
    data_val = data["validation"]
    
    # Process training and validation data into (tokens, features, binary labels)
    words_train, X_train, y_train = create_data_set(data_train)
    words_val, X_val, y_val = create_data_set(data_val)

    # Initialize and fit StandardScaler on the training features only, so the
    # validation data does not influence the scaling statistics
    scaler = StandardScaler()
    scaler.fit(X_train)

    # Train SVM model; train_svm_model applies scaler.transform to X_train
    # itself, so the unscaled features are passed here
    model = train_svm_model(X_train, y_train, scaler)

    # Save trained model and scaler; both are needed at inference time because
    # the model was fitted on scaled features. The files are pickles, so only
    # load them back from a trusted location.
    nei_model_name = 'nei_model.sav'
    scaler_model_name = 'scaler_model.sav'
    save_model(model, nei_model_name)
    save_model(scaler, scaler_model_name)

    # Transform validation data with the training-fitted scaler and make predictions
    X_val_scaled = scaler.transform(X_val)
    y_pred_val = model.predict(X_val_scaled)

    # Print classification report (per-class precision/recall/F1 on validation)
    print(classification_report(y_true=y_val, y_pred=y_pred_val))
