### -*- Nuha Alghamdi -*-
### -*- nuhaalghamdi92@gmail.com -*-
### -*- Feb 22 2020-*-

* Here we use Gensim fastText.
* You can download the library from [here](https://pypi.org/project/gensim/)
* Special thanks to Aziz Altowayan: some of the functions below are taken from [his repository](https://github.com/iamaziz/ar-embeddings/blob/master/asa.py).

# Gensim fastText library:

In [1]:
from gensim.models.fasttext import FastText as FT_gensim
import numpy as np
import pandas as pd



Load wiki.ar model:

In [2]:
%%time
mpath = 'D:\\dataset\\wiki.ar.bin'
m1= FT_gensim.load_fasttext_format(mpath)

Wall time: 5min 8s


In [None]:
# Read the dataset into a two-column DataFrame: the text and its label.
# The file has no header row, so the column names are supplied here.
cols = ['data', 'label']
# Replace with the path to your own dataset
df = pd.read_csv('D:\\dataset\\dataset16Feb.csv', header=None, names=cols)

In [None]:
# Shuffle the rows: sample all of them in random order ...
df = df.sample(frac=1)
# ... then renumber the index from 0, discarding the old one.
df = df.reset_index(drop=True)

## Get sentence vectors:

In [None]:
#Function to get vector for an article (as one sentence)
def get_vector(txt):
    """Return the vector the loaded model ``m1`` gives for ``txt``.

    The whole article string is passed as a single lookup key.

    :param txt: article text to look up
    :return: the vector returned by the model for that key
    :raises KeyError: propagated from the lookup when the model cannot
        produce a vector for ``txt``
    """
    # Index the model's KeyedVectors (``.wv``) instead of the model itself:
    # ``model[key]`` is deprecated in gensim 3.x in favour of ``model.wv[key]``.
    return m1.wv[txt]

In [None]:
# Set up the container that will receive one vector per example.
dimension = 300  # vector dimension
len_examples = len(df)  # no. of examples

# All vectors: one row per example, every row all-zero until it is filled in.
vecs = np.zeros(shape=(len_examples, dimension), dtype=np.float32)

Build the vectors:

In [None]:
%%time
for i in range(len_examples):
    try:
        vecs[i] = get_vector(df['data'].iloc[i])
    except:
        pass

In [None]:
# Encode the text labels as integers 0..n_classes-1 (and keep the reverse map).
# The distinct labels are sorted first so the numbering is the same on every
# run; iterating a bare set of strings can yield a different order in each
# Python session, which would make previously saved codes undecodable.
# key=str keeps the sort well-defined even if the column mixes value types.
digit_to_label = dict(enumerate(sorted(set(df['label']), key=str)))
label_to_digit = {label: digit for digit, label in digit_to_label.items()}

In [None]:
# Features are the sentence vectors; targets are the integer-encoded labels.
X = vecs
encoded_labels = df['label'].apply(lambda label: label_to_digit[label])
y = np.array(encoded_labels)

## Save the sentence vectors and their labels in your folder:

In [None]:
# Write the results to the current folder (np.save adds the ".npy" suffix):
#   X_vecs_gen   - sentence vectors generated by gensim fastText
#   y_labels_gen - the labels that go with them
np.save(file='X_vecs_gen', arr=X)
np.save(file='y_labels_gen', arr=y)

## Get Average word vectors:

The next 5 cells tokenize the articles, look up a vector for each word in each article, and then average the word vectors of each article to get one vector representing it.

In [None]:
from logging import info, INFO

In [None]:
def feature(words, pretrainedmodel):
    """Average the embedding vectors of ``words``.

    :param words: iterable of tokens to look up
    :param pretrainedmodel: either a model exposing its vectors as ``.wv`` or
        any object supporting ``obj[token]`` that raises ``KeyError`` for
        tokens it cannot resolve
    :return: the mean of the vectors that were found; an all-zero vector of
        length 300 when none of the tokens could be resolved (or ``words``
        is empty)
    """
    dimension = 300
    # Index the KeyedVectors when a full model is passed: indexing the model
    # object itself is deprecated in gensim 3.x. Plain mappings have no
    # ``wv`` attribute and are used as they are.
    embeddings = getattr(pretrainedmodel, "wv", pretrainedmodel)
    feature_vec = np.zeros((dimension,), dtype="float32")
    retrieved_words = 0
    for token in words:
        try:
            vector = embeddings[token]
        except KeyError:
            continue  # a word the embeddings cannot resolve is discarded
        feature_vec = np.add(feature_vec, vector)
        retrieved_words += 1

    # Nothing retrieved: return the zero vector rather than dividing 0 by 0,
    # which would yield NaNs. This also removes the need to silence NumPy's
    # warnings globally via np.seterr.
    if retrieved_words == 0:
        return feature_vec

    return np.divide(feature_vec, retrieved_words)

In [None]:
def average_feature_vectors(examples,pretrainedmodel, type_='NaN'):
    """
    Build one averaged word vector per example.

    :param examples: a list of token lists, e.g. [['hi','do'], ['you','see'], ... ]
    :param pretrainedmodel: embeddings object, handed unchanged to ``feature``
    :param type_: (optional) name of the example set, used only in the log
        messages, e.g. train / test
    :return: float32 array with one 300-wide row per example
    """
    n_examples = len(examples)
    averaged = np.zeros(shape=(n_examples, 300), dtype=np.float32)
    info("Vectorizing {} tokens ..".format(type_))

    for row, tokens in zip(averaged, examples):
        # ``row`` is a view into ``averaged``, so this fills the matrix in place.
        row[:] = feature(tokens, pretrainedmodel)

    info(" ... total {} {}".format(n_examples, type_))
    return averaged

In [None]:
%%time
import nltk
tokenized_data=[]
tokenized_words=[]
for i in range(len_examples):
    tokenized_words = nltk.word_tokenize(df['data'].iloc[i])
    tokenized_data.append(tokenized_words)
    tokenized_words=[]

In [None]:
%%time
avg_vw=average_feature_vectors(tokenized_data,m1)

## Save the average word vectors and their labels in your folder:

In [None]:
# Save the averaged word vectors together with the labels encoded earlier
# (np.save adds the ".npy" suffix to each name).
X = avg_vw
np.save(file='X_vecs_tokenized_gen', arr=X)
np.save(file='y_labels_tokenized_gen', arr=y)