# SVM - EMBEDDINGS Gensim300 Test_Accuracy=93.61 F1=81.21

In [None]:
!pip install --upgrade numpy

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import gensim
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV

In [2]:
def process_csv(csv):
    df = pd.read_csv(csv)
    #df=df.drop(columns=['Case','lemma', 'Gender','Number','State','Gloss','Proclitic 1','Proclitic 2','Proclitic 0','Aspect','Mood','Person','Voice','Enclitic 0'])
    df=df.drop_duplicates()
    df["POS"] = df["POS"].apply(lambda x: x.replace("Proper Noun", "ProperNoun"))
    df1 = pd.DataFrame(columns = ['x','y'])
    for i in df['sentence_id'].unique():
        df1 = df1.append({'x':df[df['sentence_id'] == i]['word'].tolist() , 'y':df[df['sentence_id'] == i]['POS'].tolist()}, ignore_index=True)
    return df1

In [3]:
csvs = os.listdir('Dataset/')
final_df = pd.DataFrame()
for csv in csvs:
    df = process_csv('Dataset/'+csv)
    final_df = pd.concat([final_df,df])
final_df = final_df.sample(frac = 1)
final_df.reset_index(drop=True, inplace=True)

  if (await self.run_code(code, result,  async_=asy)):


In [4]:
sentences = final_df['x'].tolist()
poss = final_df['y'].tolist()

In [5]:
WINDOW = 5

In [6]:
for i in range(len(sentences)):
    sentences[i] = ["" for i in range(int(WINDOW)//2)] + sentences[i] + ["" for i in range(int(WINDOW)//2)]
    poss[i] = ["PAD" for i in range(int(WINDOW)//2)] + poss[i] + ["PAD" for i in range(int(WINDOW)//2)]    

In [7]:
train_sents = sentences[:int(len(sentences)*0.7)]
val_sents = sentences[int(len(sentences)*0.7):int(len(sentences)*0.9)]
test_sents = sentences[int(len(sentences)*0.9):]
assert len(train_sents)+len(val_sents)+len(test_sents) == len(sentences)
train_pos = poss[:int(len(poss)*0.7)]
val_pos = poss[int(len(poss)*0.7):int(len(poss)*0.9)]
test_pos = poss[int(len(poss)*0.9):]
assert len(train_pos)+len(val_pos)+len(test_pos) == len(poss)

In [8]:
sents_train_val = train_sents.copy()
sents_train_val.extend(val_sents)
pos_train_val = train_pos.copy()
pos_train_val.extend(val_pos)

In [9]:
tags_list = []
for tags in pos_train_val:
    for tag in tags:
        if tag not in tags_list:
            tags_list.append(tag)

In [10]:
words_list = []
for words in sents_train_val:
    for word in words:
        if word not in words_list:
            words_list.append(word)

In [11]:
word_freq = {}
word_tags = {}
for i in words_list:
    tags = {}
    for j in tags_list:
        tags[j] = 0
    word_tags[i] = tags
    word_freq[i] = 0
for i in range(len(sents_train_val)):
    for j in range(len(sents_train_val[i])):
        word_freq[sents_train_val[i][j]] += 1
        word_tags[sents_train_val[i][j]][pos_train_val[i][j]] += 1

In [12]:
ambiguities = {}
for word in words_list:
    ambiguities[word] = [j for j in word_tags[word] if word_tags[word][j] !=0]
    assert word_freq[word] == sum(word_tags[word].values())

In [13]:
#import requests, zipfile
#from io import BytesIO
#response = requests.get("https://bakrianoo.ewr1.vultrobjects.com/aravec/full_grams_cbow_300_twitter.zip")
#zipDocument = zipfile.ZipFile(BytesIO(response.content))
#zipDocument.extractall()

In [14]:
embedding_model = gensim.models.Word2Vec.load('full_grams_cbow_300_twitter.mdl')

In [15]:
def OneHotEncoder(number, lenght):
    zero = np.zeros(lenght)
    zero[number] = 1
    return zero

def getFeatures(wordIdx, sentence, pos, w2v, tags, ambiguities, train=True):
    features = []
    
    keys = w2v.wv.key_to_index.keys()
    for i in reversed(range(1,int(WINDOW)//2+1)):
        if sentence[wordIdx-i] not in keys:
            features.append(np.zeros(w2v.vector_size))
        else:
            features.append(w2v.wv.get_vector(sentence[wordIdx-i], norm=True))
    
    if sentence[wordIdx] not in keys:
        features.append(np.zeros(w2v.vector_size))
    else:
        features.append(w2v.wv.get_vector(sentence[wordIdx], norm=True))
    
    for i in range(1,int(WINDOW)//2+1):
        if sentence[wordIdx+i] not in keys:
            features.append(np.zeros(w2v.vector_size))
        else:
            features.append(w2v.wv.get_vector(sentence[wordIdx+i], norm=True))
    if train:
        for i in reversed(range(1,int(WINDOW)//2+1)):
            tag = pos[wordIdx-i]
            features.append(OneHotEncoder(tags.index(tag),len(tags)))
            
        if sentence[wordIdx] in ambiguities:
            features.append(OneHotEncoder([tags.index(i) for i in ambiguities[sentence[wordIdx]]],len(tags)))
        else:
            features.append(OneHotEncoder([],len(tags)))

    else:
        for i in reversed(range(1,int(WINDOW)//2+1)):
            tag = pos[wordIdx-i]
            features.append(OneHotEncoder(tags.index(tag),len(tags)))
            
        if sentence[wordIdx] in ambiguities:
            features.append(OneHotEncoder([tags.index(i) for i in ambiguities[sentence[wordIdx]]],len(tags)))
        else:
            features.append(OneHotEncoder([],len(tags)))
    
    features.append([len(sentence[wordIdx])])
    
    flat_list = []
    for i in features:
        flat_list.extend(i)
    return flat_list

In [16]:
x_train = []
y_train = []
for i in tqdm(range(len(train_sents))):
    for j in range(int(WINDOW)//2,len(train_sents[i]) - int(WINDOW)//2):
        x_train.append(getFeatures(j , train_sents[i], train_pos[i], embedding_model, tags_list, ambiguities))
        y_train.append(train_pos[i][j])
x_train = np.array(x_train)
y_train = np.array(y_train)

x_val = []
y_val = []
for i in tqdm(range(len(val_sents))):
    for j in range(int(WINDOW)//2,len(val_sents[i]) - int(WINDOW)//2):
        x_val.append(getFeatures(j , val_sents[i], val_pos[i], embedding_model, tags_list, ambiguities))
        y_val.append(val_pos[i][j])
x_val = np.array(x_val)
y_val = np.array(y_val)

x_test = []
y_test = []
for i in tqdm(range(len(test_sents))):
    for j in range(int(WINDOW)//2,len(test_sents[i]) - int(WINDOW)//2):
        x_test.append(getFeatures(j , test_sents[i], test_pos[i], embedding_model, tags_list, ambiguities))
        y_test.append(test_pos[i][j])
x_test = np.array(x_test)
y_test = np.array(y_test)

100%|███████████████████████████████████████████████████████████████████████████| 10045/10045 [00:43<00:00, 231.62it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2870/2870 [00:08<00:00, 331.46it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1435/1435 [00:03<00:00, 420.80it/s]


In [17]:
print("Training data shape (x y) ", x_train.shape,y_train.shape)
print("Validation data shape (x y) ", x_val.shape,y_val.shape)
print("Test data shape (x y) ", x_test.shape,y_test.shape)

Training data shape (x y)  (181182, 1600) (181182,)
Validation data shape (x y)  (52531, 1600) (52531,)
Test data shape (x y)  (26013, 1600) (26013,)


## Training

In [18]:
#C = [1,10,100,1000]
#degree = np.arange(1, 7)

#params = {'C' : C,'degree' : degree}

#random_search = RandomizedSearchCV(estimator = SVC(),param_distributions = params,n_iter = 50,n_jobs = 6,verbose=1).fit(x_train, y_train)

#clf = random_search.best_estimator_

### SVM  C=1.0 Kernel=rbf

In [19]:
clf1 = SVC(C=1)
clf1.fit(x_train, y_train)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [20]:
train_pred = clf1.predict(x_train)
train_acc = sum(train_pred == y_train) *1.0 / len(train_pred)
print('Train Acc',train_acc*100)
train_f1 = f1_score(y_train, train_pred, average='macro')
print('Train F1',train_f1*100)

Train Acc 98.64445695488514
Train F1 88.45780515174259


In [21]:
val_pred = clf1.predict(x_val)
val_acc = sum(val_pred == y_val) *1.0 / len(val_pred)
print('Val Acc',val_acc*100)
val_f1 = f1_score(y_val, val_pred, average='macro')
print('Val F1',val_f1*100)

Val Acc 98.53610249186194
Val F1 88.71133609733224


In [22]:
test_pred = clf1.predict(x_test)
test_acc = sum(test_pred == y_test) *1.0 / len(test_pred)
print('Test Acc',test_acc*100)
test_f1 = f1_score(y_test, test_pred, average='macro')
print('Test F1',test_f1*100)

Test Acc 93.61088686426018
Test F1 81.21378798536703


### SVM  C=10.0 Kernel=rbf

In [23]:
clf2 = SVC(C=10)
clf2.fit(x_train, y_train)

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [24]:
train_pred = clf2.predict(x_train)
train_acc = sum(train_pred == y_train) *1.0 / len(train_pred)
print('Train Acc',train_acc*100)
train_f1 = f1_score(y_train, train_pred, average='macro')
print('Train F1',train_f1*100)

Train Acc 99.44365334304732
Train F1 97.71368176903825


In [25]:
val_pred = clf2.predict(x_val)
val_acc = sum(val_pred == y_val) *1.0 / len(val_pred)
print('Val Acc',val_acc*100)
val_f1 = f1_score(y_val, val_pred, average='macro')
print('Val F1',val_f1*100)

Val Acc 98.8482990995793
Val F1 92.3807055667022


In [26]:
test_pred = clf2.predict(x_test)
test_acc = sum(test_pred == y_test) *1.0 / len(test_pred)
print('Test Acc',test_acc*100)
test_f1 = f1_score(y_test, test_pred, average='macro')
print('Test F1',test_f1*100)

Test Acc 93.9068927074924
Test F1 80.33619100661515


### SVM  C=100.0 Kernel=rbf

In [27]:
clf3 = SVC(C=100)
clf3.fit(x_train, y_train)

SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
train_pred = clf3.predict(x_train)
train_acc = sum(train_pred == y_train) *1.0 / len(train_pred)
print('Train Acc',train_acc*100)
train_f1 = f1_score(y_train, train_pred, average='macro')
print('Train F1',train_f1*100)

Train Acc 99.91941804373504
Train F1 99.6846514498401


In [29]:
val_pred = clf3.predict(x_val)
val_acc = sum(val_pred == y_val) *1.0 / len(val_pred)
print('Val Acc',val_acc*100)
val_f1 = f1_score(y_val, val_pred, average='macro')
print('Val F1',val_f1*100)

Val Acc 98.69981534712836
Val F1 92.41075503881393


In [30]:
test_pred = clf3.predict(x_test)
test_acc = sum(test_pred == y_test) *1.0 / len(test_pred)
print('Test Acc',test_acc*100)
test_f1 = f1_score(y_test, test_pred, average='macro')
print('Test F1',test_f1*100)

Test Acc 93.54169069311497
Test F1 80.79178399458982


### Best SVM C=1 kernel=rbf Test_accuracy=93.61 F1=81.21

In [31]:
clf = clf1

## Predict

In [32]:
def pos_tagger(w, sent):
    words = sent.split()
    words = ["" for i in range(int(w)//2)] + words + ["" for i in range(int(w)//2)]
    pos = ["PAD" for i in range(int(w)//2)]
    for i in range(int(WINDOW)//2,len(words) - int(WINDOW)//2):
        feature = np.array(getFeatures(i , words, pos, embedding_model, tags_list, ambiguities, train = False)).reshape(1,-1)
        tag = clf.predict(feature)
        pos.append(tag[0])
    return pos[2:]

In [33]:
sentence = "جون يحب البيت الأزرق في نهاية الشارع"
output = pos_tagger(WINDOW ,sentence)
pred_tags = [(sentence.split()[i],output[i]) for i in range(len(sentence.split()))]
for w,t in pred_tags:
    print(w,t)

جون ProperNoun
يحب Verb
البيت Noun
الأزرق Noun
في Preposition
نهاية Noun
الشارع Noun


In [34]:
from evaluate_pred import *
evaluate(sentence, pred_tags)

100.0