### -*- Nuha Alghamdi -*-
### -*- nuhaalghamdi92@gmail.com -*-
### -*- Feb 22 2020-*-

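* There are two common Python implementations of fastText: the official library by Facebook and the one in Gensim. Here we use the official one by Facebook.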
* You can download the former from [here](https://pypi.org/project/fasttext/)
* You can download the latter from [here](https://pypi.org/project/gensim/)
* Special thanks to Aziz Altowayan for using some functions from [his repository](https://github.com/iamaziz/ar-embeddings/blob/master/asa.py).

# Facebook fastText library:

In [6]:
import fastText
import numpy as np
import pandas as pd

Load wiki.ar model:

In [4]:
%%time
# Path to the wiki.ar fastText binary model -- replace it with your own path.
mpath = 'D:\\dataset\\wiki.ar.bin'
# Load the model once; `m` is the module-level model that vw() and vs() read.
m = fastText.load_model(mpath)

Wall time: 51.5 s


Functions to generate vectors for text:

In [5]:
def vw(w):
    """Return the vector of the word ``w`` from the module-level model ``m``."""
    word_vector = m.get_word_vector(w)
    return word_vector

def vs(s):
    """Return the vector of the sentence ``s`` from the module-level model ``m``."""
    sentence_vector = m.get_sentence_vector(s)
    return sentence_vector

In [10]:
# Read the dataset into a DataFrame whose columns are named 'data' and 'label'.
cols = ['data','label']
# Replace this path with the location of your own dataset CSV file.
df=pd.read_csv('D:\\dataset\\dataset16Feb.csv', names=cols)

In [11]:
# Preview the first three rows of the loaded dataset.
df.head(3)

Unnamed: 0,data,label
0,كرة القدم رياضة جماعية تلعب فريقين يتكون منهما...,رياضة
1,كمال الأجسام لعبة رياضية نشأت أواخر القرن التا...,رياضة
2,أديداس بالألمانية شركة ملابس رياضية مقرها ألما...,رياضة


In [12]:
# Shuffle the rows and renumber the index from 0.
# A fixed seed makes the row order -- and therefore the X/y arrays saved
# later -- reproducible after a kernel restart; change the seed to reshuffle.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
# Preview the first three rows after shuffling.
df.head(3)

Unnamed: 0,data,label
0,المدارس الصناعية تأسست عام بوهيميا قبل الأسقف ...,عمل
1,طب الأسنان فرع فروع الطب يختص بدراسة وتشخيص وم...,صحة
2,صيدلية المستشفى صيدلية توجد عادة داخل مبنى الم...,صحة


In [14]:
# Sizes of the article-vector matrix.
dimension = 300  # length of each vector
len_examples = len(df)  # number of articles (rows) in the dataset

# One row per article, zero-filled until the vectors are computed.
vecs = np.zeros(shape=(len_examples, dimension), dtype=np.float32)

## Get sentence vectors:

In [15]:
%%time
# Fill vecs row by row with the sentence vector of each article.
# A row that cannot be vectorized keeps its all-zero vector (best effort), but
# its position and error are recorded so the failure is visible, not silent.
failed_rows = []
for i in range(len_examples):
    try:
        vecs[i] = vs(df['data'].iloc[i])
    except Exception as err:  # not a bare except: Ctrl-C must still stop the loop
        failed_rows.append((i, repr(err)))

if failed_rows:
    print("{} of {} rows could not be vectorized and were left as zeros; first failures: {}".format(
        len(failed_rows), len_examples, failed_rows[:5]))

Wall time: 39.2 s


In [16]:
# Map every distinct label to an integer id (0 .. number_of_labels - 1) and back.
# The labels are sorted first: iterating a bare set of strings follows hash
# order, which can change between Python processes, so the ids written to y
# would otherwise differ from run to run. key=str keeps the sort well-defined
# even if the column holds a non-string value.
digit_to_label = dict(enumerate(sorted(set(df['label']), key=str)))
label_to_digit = {label: digit for digit, label in digit_to_label.items()}

In [17]:
# X holds the article vectors; y holds the integer id of each article's label.
X = vecs
y = np.array(df['label'].map(label_to_digit.__getitem__))

## Save the sentence vectors and their labels in your folder:

In [None]:
# X_vecs_fastText_vs: file holding the sentence vectors generated by the official fastText library.
# y_labels_fastText_vs: file holding the matching integer labels.
np.save('X_vecs_fastText_vs', X)
np.save('y_labels_fastText_vs', y)

## Get Average word vectors:

Tokenize the articles:

In [None]:
%%time
import nltk

# One token list per article, in the same row order as df.
tokenized_data = [nltk.word_tokenize(article) for article in df['data']]
# Scratch name from the original loop; it is always left empty after this cell.
tokenized_words = []

In [19]:
#Just to make sure of the number of articles (1909 articles)
#print(len(tokenized_data))

The next four cells take the tokenized data and the fastText model, generate a vector for each word, and then average the word vectors of each article to get one vector per article:

In [None]:
from logging import info, INFO

In [None]:
def feature(words,pretrainedmodel):
    """Average the word vectors of ``words``.

    :param words: iterable of tokens belonging to one example
    :param pretrainedmodel: object exposing ``get_word_vector(token)``
    :return: the mean of the retrieved word vectors; an all-zero vector of
        length 300 when no vector could be retrieved (empty ``words`` or every
        lookup raised ``KeyError``), instead of the NaN vector that a 0/0
        division would produce
    """
    dimension=300
    embeddings=pretrainedmodel
    feature_vec = np.zeros((dimension,), dtype="float32")
    retrieved_words = 0
    for token in words:
        try:
            word_vec = embeddings.get_word_vector(token)
        except KeyError:
            continue  # token is not in the embeddings' vocabulary: discard it
        feature_vec = np.add(feature_vec, word_vec)
        retrieved_words += 1

    # Guard the division explicitly rather than silencing numpy's warnings with
    # np.seterr, which would change the error state for the whole session.
    if retrieved_words == 0:
        return feature_vec

    return np.divide(feature_vec, retrieved_words)

In [None]:
def average_feature_vectors(examples,pretrainedmodel, type_='NaN'):
    """
    Build one averaged word vector per example.

    :param examples: a list of token lists, e.g. [['hi','do'], ['you','see'], ... ]
    :param pretrainedmodel: embeddings object handed to ``feature`` for every example
    :param type_: (optional) description of the examples, e.g. train / test; only used in log messages
    :return: float32 array with one row of length 300 per example
    """
    dimension = 300
    n_examples = len(examples)

    info("Vectorizing {} tokens ..".format(type_))
    feature_vectors = np.zeros((n_examples, dimension), dtype="float32")
    for row, tokens in enumerate(examples):
        feature_vectors[row] = feature(tokens, pretrainedmodel)
    info(" ... total {} {}".format(len(feature_vectors), type_))

    return feature_vectors

In [None]:
%%time
# Build one averaged word vector per tokenized article using the loaded model `m`.
avg_vw=average_feature_vectors(tokenized_data,m) #average word vectors

## Save the average word vectors and their labels in your folder:

In [None]:
# X now holds the averaged word vectors; y is the label array built earlier in
# the notebook and is saved again next to them under the *_vw file names.
X=avg_vw
np.save('X_vecs_fastText_vw', X)
np.save('y_labels_fastText_vw', y)