In [10]:
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from datasets import load_dataset
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:

# Fetch the POS-tagging corpus and keep handles on its two splits
dataset = load_dataset('batterydata/pos_tagging')
train_split = dataset['train']
test_split = dataset['test']

# Supervised training data: only the first 1000 sentences of the train split
sentences = train_split['words'][:1000]
tags = train_split['labels'][:1000]

# Held-out evaluation data: the complete test split
testingSentences = test_split['words']
testingTags = test_split['labels']

# Lower-cased token lists for every sentence of both splits (train first, then test)
wordDataset = [
    [token.lower() for token in sentence]
    for sentence in train_split['words'] + test_split['words']
]

# Fit Word2Vec embeddings on that combined corpus.
# Only the raw words are used here (no POS labels); the tags are learned later by the SVM.
# min_count=1 keeps every word that occurs in the corpus in the vocabulary.
w2v_model = Word2Vec(
    wordDataset,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=50,
)




In [3]:
def wordToVec(word, prevW, nextW, model=None):
    """Return the embedding of ``word``, falling back to its neighbours.

    The lookup is case-insensitive (every token is lower-cased first). When
    ``word`` is not in the vocabulary, the result is the element-wise mean of
    the embeddings of ``prevW`` and ``nextW``; a neighbour that is itself
    unknown (including the empty string used for sentence boundaries)
    contributes an all-zero vector.

    Args:
        word: token to embed.
        prevW: token to the left of ``word`` ('' when there is none).
        nextW: token to the right of ``word`` ('' when there is none).
        model: optional object exposing a ``wv`` mapping with a ``vector_size``
            attribute; defaults to the notebook-level ``w2v_model``.

    Returns:
        1-D numpy array of length ``wv.vector_size``.
    """
    vectors = (w2v_model if model is None else model).wv

    key = word.lower()
    if key in vectors:
        return vectors[key]

    # Size the zero fallback from the model instead of hard-coding 100, so the
    # function keeps working if the embedding dimension is ever changed.
    dim = vectors.vector_size

    def neighbour_vector(token):
        token = token.lower()
        return vectors[token] if token in vectors else np.zeros(dim)

    return np.mean([neighbour_vector(prevW), neighbour_vector(nextW)], axis=0)

In [9]:
# Build the per-token feature matrix for a single sentence
def sentence_to_features(sentence):
    """Turn a tokenised sentence into one feature row per token.

    Each row is the concatenation, in this order, of:
      * the embedding of the token itself,
      * four surface features: all-uppercase, title-case, all-digits, length,
      * the embedding of the previous token,
      * the embedding of the next token.

    Positions that fall outside the sentence are represented by the empty
    string '', and every embedding is obtained through ``wordToVec`` (which
    receives the token together with its own left/right neighbours).

    Args:
        sentence: sequence of token strings.

    Returns:
        numpy array with one row per token; ``np.array([])`` for an empty sentence.
    """
    last = len(sentence) - 1

    def token_at(pos):
        # '' stands in for any position before the start or past the end
        return sentence[pos] if 0 <= pos <= last else ''

    rows = []
    for pos, token in enumerate(sentence):
        left, right = token_at(pos - 1), token_at(pos + 1)
        left2, right2 = token_at(pos - 2), token_at(pos + 2)

        # Orthographic cues; booleans and the length end up in one numeric array
        surface = np.array([
            token.isupper(),
            token.istitle(),
            token.isdigit(),
            len(token),
        ])

        rows.append(np.concatenate((
            wordToVec(token, left, right),
            surface,
            wordToVec(left, left2, token),
            wordToVec(right, token, right2),
        )))

    return np.array(rows)

In [4]:
# Featurise every sentence and stack the per-sentence matrices, giving one row
# per token for the training subset and for the held-out test split
X = np.vstack([sentence_to_features(tokens) for tokens in sentences])
XTesting = np.vstack([sentence_to_features(tokens) for tokens in testingSentences])

# Flatten the per-sentence tag lists in the same order, one label per token
y = np.array([label for sentence_tags in tags for label in sentence_tags])
yTesting = np.array([label for sentence_tags in testingTags for label in sentence_tags])

# Carve a 20% validation set out of the training tokens (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Train a linear-kernel SVM on the token-level training features
svm_clf = SVC(kernel='linear', C=1)
svm_clf.fit(X_train, y_train)

# Predict tags for the validation tokens held out by train_test_split
y_pred = svm_clf.predict(X_test)

# Per-tag precision / recall / F1 on the validation tokens.
# Some rare tags are never predicted, which makes their precision undefined.
# zero_division=0 reports those scores as 0 (the same value the default
# "warn" setting uses) without emitting an UndefinedMetricWarning each time.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           #       1.00      1.00      1.00         2
           $       1.00      1.00      1.00        33
          ''       1.00      1.00      1.00        32
           (       1.00      1.00      1.00         3
           )       1.00      1.00      1.00         3
           ,       1.00      1.00      1.00       246
           .       1.00      1.00      1.00       187
           :       1.00      1.00      1.00        14
          CC       0.99      1.00      1.00       110
          CD       0.88      0.96      0.92       181
          DT       1.00      0.99      0.99       436
          EX       1.00      1.00      1.00         2
          FW       0.00      0.00      0.00         1
          IN       0.97      0.99      0.98       510
          JJ       0.70      0.76      0.73       319
         JJR       0.86      0.82      0.84        22
         JJS       0.42      0.83      0.56         6
          MD       1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# Score the trained SVM on the dataset's own test split (token-level accuracy)
yTestingPred = svm_clf.predict(XTesting)
test_accuracy = accuracy_score(yTesting, yTestingPred)
print(test_accuracy)
