In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import gensim
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Read the token-level corpus and regroup it so that each row of `final_df`
# holds one sentence: column 'x' is its word list, column 'y' its tag list.
df = pd.read_csv('final_POS_data.csv')


def function(s):
    """Collapse the rows of one sentence into ``[words, tags]``."""
    words = s["word"].values.tolist()
    tags = s["tag"].values.tolist()
    return [words, tags]


grouped = df.groupby("sentence_id").apply(function)
# Iterating the resulting Series yields one [words, tags] pair per sentence.
items = list(grouped)
final_df = pd.DataFrame(items, columns=['x', 'y'])

In [None]:
# Context window size: the padding step and getFeatures both use
# WINDOW // 2 tokens on each side of the target word.
WINDOW = 4

In [None]:
# Pull the per-sentence word lists ('x') and tag lists ('y') out of the frame
# as plain Python lists.
sentences, poss = final_df['x'].tolist(), final_df['y'].tolist()

In [None]:
# Pad every sentence with WINDOW // 2 empty tokens (and matching "PAD" tags)
# on both sides so that a full context window exists for the first and last
# real words.
_half_window = int(WINDOW) // 2
_word_pad = [""] * _half_window
_tag_pad = ["PAD"] * _half_window

# Rebuild from `final_df` instead of padding `sentences` / `poss` in place:
# the in-place version added another layer of padding each time the cell was
# re-run, silently shifting every feature window.
sentences = [_word_pad + words + _word_pad for words in final_df['x']]
poss = [_tag_pad + tags + _tag_pad for tags in final_df['y']]
assert all(len(s) == len(p) for s, p in zip(sentences, poss))

In [None]:
# Order-preserving 70 % / 20 % / 10 % split (no shuffling) of the sentences ...
n_sents = len(sentences)
sent_train_end = int(n_sents * 0.7)
sent_val_end = int(n_sents * 0.9)
train_sents = sentences[:sent_train_end]
val_sents = sentences[sent_train_end:sent_val_end]
test_sents = sentences[sent_val_end:]
assert len(train_sents)+len(val_sents)+len(test_sents) == len(sentences)

# ... and the same cut points applied to the tag sequences.
n_pos = len(poss)
pos_train_end = int(n_pos * 0.7)
pos_val_end = int(n_pos * 0.9)
train_pos = poss[:pos_train_end]
val_pos = poss[pos_train_end:pos_val_end]
test_pos = poss[pos_val_end:]
assert len(train_pos)+len(val_pos)+len(test_pos) == len(poss)

In [None]:
# Train and validation portions concatenated (as new lists); these are what
# the tag inventory, vocabulary and per-word tag statistics are computed from.
sents_train_val = train_sents + val_sents
pos_train_val = train_pos + val_pos

In [None]:
# Distinct tags of train+val in first-seen order. The position of a tag in
# this list is the index used for its one-hot encoding.
tags_list = list(dict.fromkeys(tag for tags in pos_train_val for tag in tags))

In [None]:
# Distinct words of train+val in first-seen order.
# A dict is used as an ordered set: the previous `word not in words_list`
# check scanned the whole list for every token, which is quadratic in the
# vocabulary size; dict membership is constant time and keeps the same order.
words_list = list(dict.fromkeys(word for words in sents_train_val for word in words))

In [None]:
# word_freq[w]    -> number of occurrences of w in train+val
# word_tags[w][t] -> number of times w was labelled t (every known tag is
#                    present as a key, starting at 0)
word_freq = {word: 0 for word in words_list}
word_tags = {word: dict.fromkeys(tags_list, 0) for word in words_list}

for sent_idx, sentence in enumerate(sents_train_val):
    sentence_tags = pos_train_val[sent_idx]
    for tok_idx, token in enumerate(sentence):
        word_freq[token] += 1
        word_tags[token][sentence_tags[tok_idx]] += 1

In [None]:
# ambiguities[w] -> list of tags that w was observed with at least once.
ambiguities = {}
for word in words_list:
    tag_counts = word_tags[word]
    ambiguities[word] = [tag for tag, count in tag_counts.items() if count != 0]
    # Sanity check: the per-tag counts must add up to the word's total frequency.
    assert word_freq[word] == sum(tag_counts.values())

In [None]:
# Load the pretrained gensim Word2Vec model that supplies the word-embedding
# part of every feature vector.
# NOTE(review): Word2Vec.load unpickles the file, so only load model files
# that come from a trusted source.
embedding_model = gensim.models.Word2Vec.load('full_grams_cbow_100_twitter.mdl')

In [None]:
def OneHotEncoder(number, lenght):
    """Return a float vector of size ``lenght`` with 1 at position(s) ``number``.

    ``number`` is used directly as a NumPy index, so it can be a single
    integer (one-hot) or a list of integers (multi-hot). An empty list leaves
    the vector all zeros.
    """
    encoded = np.zeros(lenght)
    encoded[number] = 1
    return encoded

def getFeatures(wordIdx, sentence, pos, w2v, tags, ambiguities, train=True, window=None):
    """Build the flat feature vector for the token at ``sentence[wordIdx]``.

    With ``half = window // 2`` the vector is the concatenation of:

    1. ``2 * half + 1`` word embeddings, for the tokens at offsets
       ``-half .. +half`` around the target (a zero vector is used for any
       token that is not in the embedding vocabulary);
    2. ``half`` one-hot vectors for the tags of the ``half`` preceding tokens;
    3. one multi-hot vector marking every tag listed for the target word in
       ``ambiguities`` (all zeros if the word has no entry);
    4. the character length of the target word.

    Parameters
    ----------
    wordIdx : int
        Index of the target token inside ``sentence``.
    sentence : list of str
        Tokens of the sentence. It must already carry ``half`` padding tokens
        on each side; otherwise left look-ups wrap around through negative
        indexing and right look-ups raise ``IndexError``.
    pos : list of str
        Tag sequence aligned with ``sentence``.
    w2v : gensim Word2Vec-like model
        Must expose ``wv.key_to_index``, ``wv.get_vector(word, norm=True)``
        and ``vector_size``.
    tags : list of str
        Tag inventory; a tag's position in this list is its one-hot index.
    ambiguities : dict
        Maps a word to the list of tags it may take.
    train : bool, optional
        Kept for backward compatibility only. Both branches on this flag were
        identical, so it has no effect on the result.
    window : int or None, optional
        Total context width. ``None`` (the default) uses the module-level
        ``WINDOW`` constant, which is the original behaviour.

    Returns
    -------
    list
        Flat list of feature values.

    Raises
    ------
    ValueError
        If a preceding tag or an ambiguity tag is not contained in ``tags``.

    NOTE(review): the preceding tags are always read from ``pos``. When this
    is called on validation/test data with the gold tag sequence, the features
    contain gold labels of the previous tokens, which a real tagger would not
    have at prediction time. Confirm whether predicted tags should be used.
    """
    half = int(WINDOW if window is None else window) // 2
    vocab = w2v.wv.key_to_index
    n_tags = len(tags)

    def _embed(token):
        # Unit-normalised embedding, or a zero vector for out-of-vocabulary
        # tokens (this includes the "" padding token).
        if token in vocab:
            return w2v.wv.get_vector(token, norm=True)
        return np.zeros(w2v.vector_size)

    # Embeddings of the left context (farthest first), the target word and
    # the right context (nearest first).
    features = [_embed(sentence[wordIdx + offset]) for offset in range(-half, half + 1)]

    # One-hot tags of the preceding tokens, farthest first.
    for offset in range(-half, 0):
        features.append(OneHotEncoder(tags.index(pos[wordIdx + offset]), n_tags))

    # Multi-hot vector of the tags the target word is known to take.
    word = sentence[wordIdx]
    candidate_tags = ambiguities.get(word, ())
    features.append(OneHotEncoder([tags.index(tag) for tag in candidate_tags], n_tags))

    features.append([len(word)])

    return [value for part in features for value in part]

In [None]:
def _build_examples(sents, tag_seqs):
    """Turn padded sentences into a feature matrix and a label array.

    One example is produced for every non-padding token, i.e. for positions
    ``WINDOW // 2 .. len(sentence) - WINDOW // 2 - 1`` of each sentence.

    Parameters
    ----------
    sents : list of list of str
        Padded sentences.
    tag_seqs : list of list of str
        Tag sequences aligned with ``sents``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Feature matrix (one row per token) and the matching array of tags.

    NOTE(review): getFeatures reads the tags of the preceding tokens from
    ``tag_seqs``, so validation and test features are built from gold tags
    as well. Confirm this is intended.
    """
    half = int(WINDOW) // 2
    feats, labels = [], []
    for i in tqdm(range(len(sents))):
        sent, sent_tags = sents[i], tag_seqs[i]
        for j in range(half, len(sent) - half):
            feats.append(getFeatures(j, sent, sent_tags, embedding_model, tags_list, ambiguities))
            labels.append(sent_tags[j])
    return np.array(feats), np.array(labels)


# The label encoder is fitted on the training tags only; `transform` on the
# validation / test tags raises if they contain a tag never seen in training.
encoder = LabelEncoder()

x_train, y_train = _build_examples(train_sents, train_pos)
y_train = encoder.fit_transform(y_train)

x_val, y_val = _build_examples(val_sents, val_pos)
y_val = encoder.transform(y_val)

x_test, y_test = _build_examples(test_sents, test_pos)
y_test = encoder.transform(y_test)

In [None]:
# Report the (features, labels) shapes of each split.
for caption, features, labels in (
    ("Training data shape (x y) ", x_train, y_train),
    ("Validation data shape (x y) ", x_val, y_val),
    ("Test data shape (x y) ", x_test, y_test),
):
    print(caption, features.shape, labels.shape)

In [None]:
import h5py

# Cache the computed features and labels on disk so the expensive feature
# extraction does not have to be repeated. Features are stored with dtype
# 'f' and labels with dtype 'i'.
# The context manager guarantees the file is flushed and closed even if one
# of the writes fails (the previous version leaked the handle in that case).
with h5py.File("data.hdf5", "w") as f1:
    for name, array, dtype in (
        ("x_train", x_train, 'f'),
        ("y_train", y_train, 'i'),
        ("x_val", x_val, 'f'),
        ("y_val", y_val, 'i'),
        ("x_test", x_test, 'f'),
        ("y_test", y_test, 'i'),
    ):
        f1.create_dataset(name, array.shape, dtype=dtype, data=array)

In [3]:
import h5py

# Reload the cached features and labels. `[:]` copies each dataset into an
# in-memory NumPy array, so the arrays stay usable after the file is closed.
# The context manager closes the file; previously the handle was never closed.
with h5py.File('data.hdf5', 'r') as f2:
    x_train = f2['x_train'][:]
    x_val = f2['x_val'][:]
    x_test = f2['x_test'][:]
    y_train = f2['y_train'][:]
    y_val = f2['y_val'][:]
    y_test = f2['y_test'][:]

In [4]:
# Report the (features, labels) shapes of each split after reloading.
for caption, features, labels in (
    ("Training data shape (x y) ", x_train, y_train),
    ("Validation data shape (x y) ", x_val, y_val),
    ("Test data shape (x y) ", x_test, y_test),
):
    print(caption, features.shape, labels.shape)

Training data shape (x y)  (832429, 606) (832429,)
Validation data shape (x y)  (142833, 606) (142833,)
Test data shape (x y)  (42477, 606) (42477,)


In [5]:
from sklearn.svm import SVC

In [6]:
# Fit a support-vector classifier on the training features. Only C is
# overridden; every other hyper-parameter keeps its scikit-learn default.
# NOTE(review): kernel SVC training on this many rows is presumably very
# slow — confirm the runtime is acceptable before re-running this cell.
clf1 = SVC(C=100)
clf1.fit(x_train, y_train)

SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# Accuracy and macro-averaged F1 on the training split.
train_pred = clf1.predict(x_train)
# Vectorised mean of the boolean match array; the builtin sum() iterated the
# NumPy array element by element in Python. The resulting value is the same.
train_acc = (train_pred == y_train).mean()
print('Train Acc',train_acc*100)
train_f1 = f1_score(y_train, train_pred, average='macro')
print('Train F1',train_f1*100)

In [None]:
# Accuracy and macro-averaged F1 on the validation split.
val_pred = clf1.predict(x_val)
# Vectorised mean of the boolean match array; the builtin sum() iterated the
# NumPy array element by element in Python. The resulting value is the same.
val_acc = (val_pred == y_val).mean()
print('Val Acc',val_acc*100)
val_f1 = f1_score(y_val, val_pred, average='macro')
print('Val F1',val_f1*100)

In [11]:
# Accuracy and macro-averaged F1 on the held-out test split.
test_pred = clf1.predict(x_test)
# Vectorised mean of the boolean match array; the builtin sum() iterated the
# NumPy array element by element in Python. The resulting value is the same.
test_acc = (test_pred == y_test).mean()
print('Test Acc',test_acc*100)
test_f1 = f1_score(y_test, test_pred, average='macro')
print('Test F1',test_f1*100)

Test Acc 93.76603809120229
Test F1 76.64003553665913


In [9]:
import pickle

# Persist the trained classifier. The `with` block closes the file handle
# deterministically; the previous bare open() was never closed.
# NOTE: unpickling runs arbitrary code, so only load this file back from a
# trusted location.
filename = 'svm_pos_tag.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(clf1, model_file)