In [8]:
from __future__ import division
import argparse
import pandas as pd

# useful stuff
import numpy as np
from scipy.special import expit
from sklearn.preprocessing import normalize
from collections import Counter
from nltk import stem
from string import digits
from tqdm import tqdm
import time
from datetime import timedelta, datetime
import json
from sklearn.decomposition import PCA

Preprocessing and load text

Text Cleaning - First, as python is a case sensitive language, I put every word in lowercase, then I did some classic processing such as removing:
-	Punctuation
-	Digits
-	Hyphens
-	‘s and ‘t
-	‘empty’ words (words composed of only one letter) 


In [26]:
def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    stemmed_word_list = []
    stemmer=stem.snowball.EnglishStemmer()
    with open(path, encoding='utf8') as f:
        for l in f:
            sent = l.lower().split()
            sent_new = [word.translate(str.maketrans('','','''!"#$%&()*+,./:;<=>?@[\]^_`{|}~''')) for word in sent]
            sent_new_1 = [word.translate(str.maketrans('', '', digits)) for word in sent_new] # clean digits
            sent_new_2 = [word.replace('-','') for word in sent_new_1] # clean hyphen 
            sent_new_3 = [word.replace("'s",'') for word in sent_new_2] # clean 's
            sent_new_4 = [word.replace("'t",'') for word in sent_new_3] # clean 't
            sent_new_5 = [word for word in sent_new_4 if len(word)>1] # clean empty 'words'
            sentences.append(sent_new_5)
    # Stemming
    #for s in sentences:
    #    stemmed_sentences = []
    #    for word in s:
    #        stemmed_sentences.append(stemmer.stem(word))
    #    stemmed_word_list.append(stemmed_sentences)
    
    #return stemmed_word_list
    return sentences

In [35]:
def loadPairs(path):
    data = pd.read_csv(path, delimiter='\t')
    pairs = zip(data['word1'],data['word2'],data['similarity'])
    return pairs

def drop(u, v):
    return u - v * u.dot(v)/v.dot(u)

**SkipGram model**

Functions - 
1)	Unigram distribution - 
The idea behind unigram distribution was that the probability of picking a word should be equal to the number of times this word appears in the corpus raised to the power ¾ , divided by the total number of words in the corpus also raised to the power ¾ as it is done in the original paper.

My function generates a large table that contains all the words repeated as many times as their distribution. I will then randomly sample the negative words from this unigram table.

2)	Word frequency -
Filtering stopwords, words below a minimum count and words that are not in vocab.

3)	Word to ID mapping -
Preparing Word to ID input mapping vector

4)	Positive words -
Words that actually appear within the context window of the center word. A word vector of a center word will be more similar to a positive word than of randomly drawn negative words because words appearing together have strong correlation. 

5)	Negative sampling
The negative sampling is performed by randomly picking word from the unigram table taking into account that the sampled word is neither part of the context nor the center word with the length of the negative word set predefined as a hyperparameter.


Training - 
Forward Propagation: Computing hidden (projection) layer
Forward Propagation: Sigmoid output layer
Backward Propagation: Prediction Error
Backward Propagation: Computing ∇ Winput
Backward Propagation: Computing ∇ Woutput
Backward Propagation: Updating Weight Matrices

Similarity – 
For the computation of the similarity, I chose the cosine similarity which is defined as the cosine of the angle between two sequences of numbers or the dot product of the vectors divided by the product of their lengths.
Given the fact that we are in a positive space, it returns a number between [0 ; 1] indicating the similarity (the higher the more similar the words are). 

In [70]:
class SkipGram:
    def __init__(self, sentences, nEmbed=100, negativeRate=5, winSize = 5, minCount = 5, epochs=1, lr=0.1):
        self.w2id = {} # word to ID mapping
        self.trainset = sentences # set of sentences
        self.minCount = minCount # Minimum occurrence of words
        words = [word for sentence in sentences for word in sentence] # Store every word appearing in the trainset with repetitions
        word_count = dict(Counter(words)) # Return the frequency of words in the set

        # filter words with minCount
        word_count = {word:freq for word, freq in word_count.items() if freq >= self.minCount}
        words = [word for word, freq in word_count.items()]

        self.vocab = sorted(set(words)) # list of valid words
        vocab_size = len(self.vocab) # Size of the vocabulary
        word_count = dict(Counter(words)) # Return the frequency of words in the set
        
        self.freq = np.array([word_count[word] / vocab_size for word in self.vocab]) # frequency of words, to be used for the unigram
        self.nEmbed = nEmbed
        self.negativeRate = negativeRate
        self.winSize = winSize
        self.epochs = epochs
        self.lr = lr # Learning rate
        self.accLoss = 0
        self.trainWords = 0
        self.loss = [] # list that will store the loss values
        self.loss_epoch = [] # list that will store the loss values of a particular epoch

        # Initialization -- > Try others
        #self.centerV = (np.random.rand(vocab_size, nEmbed) - 0.5) / nEmbed # Initialize random vector for center words
        #self.contxtV = (np.random.rand(vocab_size, nEmbed) - 0.5) / nEmbed # Initialize random vector for context words

        # He Initialisation - np.random.randn(a, b) * np.sqrt(2/b)
        #self.centerV = np.random.rand(vocab_size, nEmbed) * np.sqrt(2/nEmbed) # Initialize random vector for center words
        #self.contxtV = np.random.rand(vocab_size, nEmbed) * np.sqrt(2/nEmbed) # Initialize random vector for context words

        self.centerV = np.random.randn(vocab_size, nEmbed)
        self.contxtV = np.random.randn(vocab_size, nEmbed)
        
        #self.centerV = np.random.randn(vocab_size, nEmbed)
        #self.contxtV = np.random.randn(vocab_size, nEmbed)
        
        ## Prepare w2id: word to index
        for i, word in enumerate(self.vocab):
            self.w2id[word] = i

        # Create the unigram table
        self.unigram_table = self.compute_unigram_table()

        #print("Initalization completed!")

    def compute_unigram_table(self, exponent=0.75, table_length = int(1e8)):
        """ Compute the unigram table """
        # Length of the unigram table is equal to 1e8
        # This large table will contain every words, repeated as many times as their frequency
        # The negative sampling will be done from this table

        vocab_size = len(self.vocab)
        
        # Normalization factor for the word probabilities = denominator
        norm_factor = sum(
            [np.power(self.freq[i], exponent) for i in range(vocab_size)]
        )

        table = np.array(np.zeros(table_length), dtype=int)
        p = 0 # Cumulative probability
        count = 0

        # iterate over every words
        for i in range(vocab_size):
            p += np.power(self.freq[i], exponent) / norm_factor

            while (count < table_length) and (count/ table_length < p):
                table[count]= i
                count +=1

        np.random.shuffle(table)

        return table


    def sample(self, omit):
        #"""samples negative words, ommitting those in set omit"""
        count = 0
        negWordsId = []
        # Sample negative words until we reach negativeRate
        while count < self.negativeRate:
            negWordId = np.random.choice(self.unigram_table)
            if negWordId not in omit: #selecting only words that are not in omit (i.e. not in the current context)
                negWordsId.append(negWordId)
                count +=1
        return negWordsId
        

    def train(self):
        print("Entering the training phase.")
        for epoch in tqdm(range(self.epochs)):
            start_time = datetime.now()
            self.loss_epoch = []
            print("\n epoch: %d of %d" % (epoch + 1, self.epochs))
            for counter, sent in enumerate(self.trainset):
                sentence = [word for word  in sent if word in self.vocab]

                for wpos, word in enumerate(sentence):
                    wIdx = self.w2id[word]
                    winsize = np.random.randint(self.winSize) + 1
                    start = max(0, wpos - winsize)
                    end = min(wpos + winsize + 1, len(sentence))

                    for context_word in sentence[start:end]:
                        ctxtId = self.w2id[context_word]
                        if ctxtId == wIdx: continue
                        negativeIds = self.sample({wIdx, ctxtId})
                        self.trainWord(wIdx, ctxtId, negativeIds)
                        self.trainWords += 1

                if counter % 1000 == 0:
                    end_time = datetime.now()
                    print(' > training %d of %d' % (counter, len(self.trainset)), '- Duration: {}'.format(end_time - start_time))
                
                    self.loss.append(self.accLoss / self.trainWords)
                    self.loss_epoch.append(self.accLoss / self.trainWords)
                    self.trainWords = 0
                    self.accLoss = 0.
            print("Epoch:", epoch + 1, "Loss:", sum(self.loss_epoch))

    def trainWord(self, wordId, contextId, negativeIds):
        vector = self.centerV[wordId]
        ctxtvector = self.contxtV[contextId]
        negvector = self.contxtV[negativeIds]
        z = expit(-np.dot(ctxtvector, vector)) # logistic sigmoid (i.e inverse of the logit function)
        zNeg = - expit(np.dot(negvector, vector))

        ## Compute the gradients
        contxtGrad = z * vector
        centerGrad = z * self.contxtV[contextId] + np.dot(zNeg, negvector)
        negGrad = np.outer(zNeg, vector)

        ## Gradient descent step
        np.add(vector, centerGrad * self.lr, out=vector);
        np.add(ctxtvector, contxtGrad * self.lr, out=ctxtvector);
        np.add(negvector, negGrad * self.lr, out= negvector);

        ## Compute the loss
        z = expit(np.dot(ctxtvector, vector))
        zNeg = expit(-np.dot(negvector, vector))
        self.accLoss -= np.log(z) + np.sum(np.log(zNeg))

        ## Update the embeddings
        self.centerV[wordId] = vector
        self.contxtV[contextId] = contxtGrad
        self.contxtV[negativeIds] = negvector


    def save(self,path):
        """ We will save the model in a .zip file"""
        import os
        from zipfile import ZipFile, ZIP_DEFLATED
        from json import dumps

        if "." in path:
            filename_split = path.split('.')
            if filename_split[-1] != "zip":
                filename_split[-1] = "zip"
                path = ".".join(filename_split)
        else:
            path += "/sg.zip"

        zf = ZipFile(path, mode="w", compression=ZIP_DEFLATED)

        ## Save the parameters of the model
        model_info = dumps(
            {
                "nEmbed": self.nEmbed,
                "negativeRate": self.negativeRate,
                "winSize": self.winSize,
                "minCount": self.minCount,
                "w2id": self.w2id,
                "epochs": self.epochs,
                "lr": self.lr,
            }, indent = 5
        )

        trainset = dumps(self.trainset, indent=5)
        vocab = dumps(self.vocab, indent=5)

        zf.writestr("model_info.json", model_info)
        zf.writestr("trainset.json", trainset)
        zf.writestr("vocab.json", vocab)

        # Save embeddings data
        np.save("centerV.npy", self.centerV)
        zf.write("centerV.npy")
        os.remove("centerV.npy")
        np.save("contxtV.npy", self.contxtV)
        zf.write("contxtV.npy")
        os.remove("contxtV.npy")
        np.save("freq.npy", self.freq)
        zf.write("freq.npy")
        os.remove("freq.npy")
        np.save("loss.npy", self.loss)
        zf.write("loss.npy")
        os.remove("loss.npy")

        zf.close()



    def similarity(self,word1,word2):
        """
        computes similiarity between the two words. unknown words are mapped to one common vector
        :param word1:
        :param word2:
        :return: a float \in [0,1] indicating the similarity (the higher the more similar)
        """

        word1, word2 = word1.lower(), word2.lower()
        if (word1 in self.vocab) and (word2 in self.vocab):
            vec1 = self.centerV[self.w2id[word1]]
            vec2 = self.centerV[self.w2id[word2]]

            # compute the cosine similarity
            cos_similarity = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
            score = np.clip(cos_similarity, 0, 1) # limit the value between 0 and 1
        else:
            score = np.random.rand() # assign a random score to the pair if a word is not in the vocabulary

        if score <= 1e-4:
            score = 0
        return score


    @staticmethod
    def load(path):
        from json import loads
        from io import BytesIO
        from zipfile import ZipFile

        try:
            zf = ZipFile(path, "r")
        except FileNotFoundError:
            path = path.split('.')
            path[-1] = 'zip'
            path = '.'.join(path)
            zf = ZipFile(path, "r")

        model_info = loads(zf.read("model_info.json"))
        trainset = loads(zf.read('trainset.json'))
        vocab = loads(zf.read('vocab.json'))

        sg = SkipGram(trainset, nEmbed=model_info['nEmbed'], negativeRate=model_info['negativeRate'],
        winSize=model_info['winSize'], minCount=model_info['minCount'], epochs=model_info['epochs'],
        lr=model_info['lr'])

        sg.vocab = vocab
        sg.centerV = np.load(BytesIO(zf.read('centerV.npy')))
        sg.contxtV = np.load(BytesIO(zf.read('contxtV.npy')))
        sg.loss = np.load(BytesIO(zf.read('loss.npy'))).tolist()
        sg.freq = np.load(BytesIO(zf.read('freq.npy')))

        zf.close()

        return sg
    
    #### Additional Step : Debias #####
    
    # Step 1: Identify the direction of embedding that captures the gender subspace    
    def doPCA(self, pairs, num_components = 10):
            matrix = []
            for a, b in pairs:
                    if (a in self.vocab) and (b in self.vocab):
                        center = (self.centerV[self.w2id[a]] + self.centerV[self.w2id[b]])/2
                        matrix.append(self.centerV[self.w2id[a]] - center)
                        matrix.append(self.centerV[self.w2id[a]] - center)
                    else: continue
            matrix = np.array(matrix)
            pca = PCA(n_components = num_components)
            pca.fit(matrix)
            return pca
    
    # Step 2: Neutralize - to make sure the gender neutral words are zero in gender subspace
    
    def debias(self, gender_specific_words, definitional):
            gender_direction = self.doPCA(definitional).components_[0]
            specific_set = set(gender_specific_words)
            for i, w in enumerate(self.vocab):
                if w not in specific_set:
                    self.centerV[self.w2id[w]] = drop(self.centerV[self.w2id[w]], gender_direction)
    

In [13]:
f1 = open('C:/Users/ofir6/Desktop/NLP final/definitional_pairs.json')
definitional = json.load(f1)
    
f2 = open('C:/Users/ofir6/Desktop/NLP final/gender_specific_seed.json')
gender_specific_words = json.load(f2)

In [71]:
#path = 'C:/Users/ofir6/Desktop/NLP final/news.en-00024-of-00100'
path = 'C:/Users/ofir6/Desktop/NLP final/train.txt'
sentences = text2sentences(path)

In [72]:
print(sentences[1])

['the', 'minutes', 'could', 'shed', 'light', 'on', 'an', 'internal', 'debate', 'which', 'has', 'been', 'evident', 'in', 'fed', 'officials', 'recent', 'speeches', 'over', 'when', 'to', 'consider', 'raising', 'rates']


In [73]:
path = 'C:/Users/ofir6/Desktop/NLP final/train.txt'
sentences = text2sentences(path)
sg = SkipGram(sentences)
sg.train()

Entering the training phase.


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]


 epoch: 1 of 1
 > training 0 of 168189 - Duration: 0:00:00.027899
 > training 1000 of 168189 - Duration: 0:00:12.238415
 > training 2000 of 168189 - Duration: 0:00:24.174419
 > training 3000 of 168189 - Duration: 0:00:35.722723
 > training 4000 of 168189 - Duration: 0:00:47.174827
 > training 5000 of 168189 - Duration: 0:00:59.236450
 > training 6000 of 168189 - Duration: 0:01:10.502461
 > training 7000 of 168189 - Duration: 0:01:22.257468
 > training 8000 of 168189 - Duration: 0:01:34.131396
 > training 9000 of 168189 - Duration: 0:01:45.599354
 > training 10000 of 168189 - Duration: 0:01:56.950377
 > training 11000 of 168189 - Duration: 0:02:12.482544
 > training 12000 of 168189 - Duration: 0:02:26.412726
 > training 13000 of 168189 - Duration: 0:02:38.273514
 > training 14000 of 168189 - Duration: 0:02:49.756621
 > training 15000 of 168189 - Duration: 0:03:01.321216
 > training 16000 of 168189 - Duration: 0:03:13.076828
 > training 17000 of 168189 - Duration: 0:03:25.488298
 > trai

 > training 149000 of 168189 - Duration: 0:35:46.834915
 > training 150000 of 168189 - Duration: 0:36:06.173924
 > training 151000 of 168189 - Duration: 0:36:21.728134
 > training 152000 of 168189 - Duration: 0:36:34.461524
 > training 153000 of 168189 - Duration: 0:36:47.666666
 > training 154000 of 168189 - Duration: 0:37:00.252333
 > training 155000 of 168189 - Duration: 0:37:13.026131
 > training 156000 of 168189 - Duration: 0:37:25.746162
 > training 157000 of 168189 - Duration: 0:37:38.203582
 > training 158000 of 168189 - Duration: 0:37:50.326465
 > training 159000 of 168189 - Duration: 0:38:02.662557
 > training 160000 of 168189 - Duration: 0:38:15.181530
 > training 161000 of 168189 - Duration: 0:38:27.737948
 > training 162000 of 168189 - Duration: 0:38:40.261831
 > training 163000 of 168189 - Duration: 0:38:52.563651
 > training 164000 of 168189 - Duration: 0:39:05.018595
 > training 165000 of 168189 - Duration: 0:39:18.582812
 > training 166000 of 168189 - Duration: 0:39:31

100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [39:58<00:00, 2398.58s/it]

Epoch: 1 Loss: 85.38261956925695





In [41]:
# Uncomment below line to run the debias before saving the embedding matrix
sg.debias(gender_specific_words, definitional)

In [74]:
path_model = 'C:/Users/ofir6/Desktop/NLP final/'
sg.save(path_model)

In [75]:
df = pd.read_csv('C:/Users/ofir6/Desktop/NLP final/SimLex-999.txt', sep ='\t', engine='python')
pairs = zip(data['word1'],data['word2'])

In [76]:
sg = SkipGram.load('C:/Users/ofir6/Desktop/NLP final/sg.zip')

In [78]:
y_pred = []
for a,b in pairs:
    score = sg.similarity(a,b)
    y_pred.append(score)
    print(a, b, round(score,2))

old new 0.53
smart intelligent 0.1
hard difficult 0.45
happy cheerful 0.12
hard easy 0.44
fast rapid 0.4
happy glad 0.11
short long 0.66
stupid dumb 0.06
weird strange 0
wide narrow 0.21
bad awful 0.2
easy difficult 0.45
bad terrible 0.21
hard simple 0.32
smart dumb 0.13
insane crazy 0.26
happy mad 0.15
large huge 0.38
hard tough 0.45
new fresh 0.39
sharp dull 0.06
quick rapid 0.33
dumb foolish 0.2
wonderful terrific 0.29
strange odd 0.17
happy angry 0.23
narrow broad 0.28
simple easy 0.27
old fresh 0.4
apparent obvious 0.16
inexpensive cheap 0.14
nice generous 0.27
weird normal 0.16
weird odd 0.01
bad immoral 0.19
sad funny 0.17
wonderful great 0.15
guilty ashamed 0.16
beautiful wonderful 0.24
confident sure 0.15
dumb dense 0.18
large big 0.54
nice cruel 0.25
impatient anxious 0.1
big broad 0.34
strong proud 0.36
unnecessary necessary 0.19
restless young 0.16
dumb intelligent 0.12
bad great 0.41
difficult simple 0.41
necessary important 0.43
bad terrific 0.12
mad glad 0.01
honest guil

accident catastrophe 0.14
journey trip 0.39
activity movement 0.45
gossip news 0.05
father god 0.32
action course 0.5
fever illness 0.03
aviation flight 0.28
game action 0.42
molecule air 0.19
home state 0.48
word literature 0.18
adult guardian 0.17
newspaper information 0.57
communication television 0.22
cousin uncle 0.22
author reader 0.27
guy partner 0.33
area corner 0.43
ballad song 0.1
wall decoration 0.1
word page 0.42
nurse scientist 0.07
politician president 0.31
president mayor 0.24
book essay 0.09
man warrior 0.15
article journal 0.45
breakfast supper 0.11
crowd parade 0.23
aisle hallway 0.11
teacher rabbi 0.1
hip lip 0.11
book article 0.41
room cell 0.39
box booth 0.28
daughter kid 0.34
limb leg 0.2
liver lung 0.07
classroom hallway 0.14
mountain ledge 0.14
car elevator 0.11
bed couch 0.29
clothes button 0.01
clothes coat 0.13
kidney organ 0.17
apple sauce 0.07
chicken steak 0.09
car hose 0.1
tobacco cigarette 0.11
student professor 0.55
baby daughter 0.45
pipe cigar 0.29
mi

In [None]:
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--text', help='path containing training data', required=True)
    parser.add_argument('--model', help='path to store/read model (when training/testing)', required=True)
    parser.add_argument('--test', help='enters test mode', action='store_true')
    opts = parser.parse_args()
    
    
    ## Please change the below file location to run the debias 
    f1 = open('C:\\NLP\\definitional_pairs.json')
    definitional = json.load(f1)
    
    f2 = open('C:\\NLP\\gender_specific_seed.json')
    gender_specific_words = json.load(f2)
    
    

    if not opts.test:
        sentences = text2sentences(opts.text)
        sg = SkipGram(sentences)
        sg.train()
        
        # Uncomment below line to run the debias before saving the embedding matrix
        sg.debias(gender_specific_words, definitional)
        
        sg.save(opts.model)

    else:
        pairs = loadPairs(opts.text)

        sg = SkipGram.load(opts.model)
        
        df = pd.read_csv(opts.text, sep ='\t', engine='python')
        
        y_pred = []
        y_test = df['similarity']

        for a,b,_ in pairs:
            # make sure this does not raise any exception, even if a or b are not in sg.vocab
            score = sg.similarity(a,b)
            y_pred.append(score)
            print(score)
            
        print(np.corrcoef(y_pred,y_test)[0][1])
