In [1]:
import numpy as np
import pandas as pd
import gensim
import re
from pyarabic import araby
import matplotlib.pyplot as plt
from evaluator import evaluator
from pretrained.AraVec import AraVec


In [2]:
def normalize(text):
    text = araby.strip_harakat(text)
    text = araby.strip_tashkeel(text)
    text = araby.strip_small(text)
    text = araby.strip_tatweel(text)
    text = araby.strip_shadda(text)
    text = araby.strip_diacritics(text)
    text = araby.normalize_ligature(text)
    #text = araby.normalize_hamza(text)
    text = araby.normalize_teh(text)
    text = araby.normalize_alef(text)
    return text

def strip_all(text):
    l = [' ', '0', '1', '2', '3', '4', '5', '6',
       '7', '8', '9', '?', '.', '.'
       '؟', 'ء', 'ؤ', 'ئ', 'ا', 'ب', 'ت', 'ث',
       'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ',
       'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', '٠', '١',
       '٢', '٣', '٤', '٥', '٦', '٧', '٨', '٩']
    return "".join([x for x in text if x in l])
def preprocess(text):
    text = normalize(text)
    text = strip_all(text)
    return text

In [3]:
from utils.tokenizer import tokenization
train_pos = pd.read_csv("data/train_Arabic_tweets_positive_20190413.tsv", sep='\t', names=["label", "tweet"])
train_neg = pd.read_csv("data/train_Arabic_tweets_negative_20190413.tsv", sep='\t', names=["label", "tweet"])
train = pd.concat([train_pos, train_neg]).sample(frac=1.0, random_state=0)
test_pos = pd.read_csv("data/train_Arabic_tweets_positive_20190413.tsv", sep='\t', names=["label", "tweet"])
test_neg = pd.read_csv("data/train_Arabic_tweets_negative_20190413.tsv", sep='\t', names=["label", "tweet"])
test = pd.concat([test_pos, test_neg]).sample(frac=1.0, random_state=0)

train.tweet = train.tweet.apply(preprocess).apply(tokenization).apply(lambda x: [n for c in x.tokens for n in c])

sentences = np.concatenate([train.tweet.values, test.tweet.values])

In [9]:
aravec = AraVec()

word2vec_model_sg = gensim.models.Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4, seed=0, sg=1)
word2vec_model_sg.build_vocab(sentences) 
word2vec_model_sg.train(sentences, total_examples=word2vec_model_sg.corpus_count, epochs=15)

embeddings_index = aravec.get_embedding_matrix(word2vec_model_sg)
wevaluator = evaluator(embeddings_index, preprocess)
scores = wevaluator.evaluate()

Word Similarity (MSE):  0.34109646793783466
Concept Categorization) (acc):  0.0823907941410099
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Sentiment Analysis (acc):  0.7619489354183232


In [4]:
aravec = AraVec()
#model_path = aravec.get_model("Twitter_SkipGram_100", unzip=True)
#model = aravec.load_model(model_path)
model = aravec.load_model("full_grams_sg_100_twitter/full_grams_sg_100_twitter.mdl")

embeddings_index = aravec.get_embedding_matrix(model)
wevaluator = evaluator(embeddings_index, preprocess)
scores = wevaluator.evaluate()

In [6]:
from models.glove import Glove

num_tokens = np.unique(np.array([y for x in sentences for y in x])).shape[0]
vocab = [x for y in train.tweet.values for x in y ]
vocab.extend([x for y in test.tweet.values for x in y ])
vocab = np.unique(np.array(vocab))
word_index = {w: i for i, w in enumerate(vocab)}

index_word = {i: w for i, w in enumerate(vocab)}

sentence_inds = []
for s in sentences:
    si = []
    for t in s:
        si.append(word_index[t])
    sentence_inds.append(si)
    
    
gmodel = Glove()
model = gmodel.train(sentence_inds, num_tokens, 100, 5, 5, 2048)
model.get_layer("embedding_4").get_weights()[0].shape

embeddings_index = {}
embeds = model.get_layer("embedding_4").get_weights()[0]
for idx in range(list(index_word.keys())[-1]):
    embeddings_index[index_word[idx]] = embeds[idx]
    
wevaluator = evaluator(embeddings_index, preprocess)
scores = wevaluator.evaluate()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


ValueError: No such layer: embedding.

In [8]:
model.get_layer("embedding_4").get_weights()[0].shape

embeddings_index = {}
embeds = model.get_layer("embedding_4").get_weights()[0]
for idx in range(list(index_word.keys())[-1]):
    embeddings_index[index_word[idx]] = embeds[idx]
    
wevaluator = evaluator(embeddings_index, preprocess)
scores = wevaluator.evaluate()

Word Similarity (MSE):  0.5322430952245734
Concept Categorization) (acc):  0.05540086960722457
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Sentiment Analysis (acc):  0.75731071649439


In [6]:
from models.W2V import W2V

num_tokens = np.unique(np.array([y for x in sentences for y in x])).shape[0]
vocab = [x for y in train.tweet.values for x in y ]
vocab.extend([x for y in test.tweet.values for x in y ])
vocab = np.unique(np.array(vocab))
word_index = {w: i for i, w in enumerate(vocab)}

index_word = {i: w for i, w in enumerate(vocab)}

sentence_inds = []
for s in sentences:
    si = []
    for t in s:
        si.append(word_index[t])
    sentence_inds.append(si)
    
cont_seq = [y for x in sentence_inds for y in x]    
w2v_model = W2V()
model = w2v_model.train(np.array(cont_seq), window_size=5, vocab_size=num_tokens+1, vector_dim=100, lr=0.05, negative_samples=2, batch_size=2048, epochs=5, verbose=1)


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [7]:

embeddings_index = {}
embeds = model.get_layer("word_embedding").get_weights()[0]
for idx in range(list(index_word.keys())[-1]):
    embeddings_index[index_word[idx]] = embeds[idx]
    
wevaluator = evaluator(embeddings_index, preprocess)
scores = wevaluator.evaluate()

Word Similarity (MSE):  0.5871207932232969
Concept Categorization) (acc):  0.05228469636202186
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Sentiment Analysis (acc):  0.7587242689283505
