<a href="https://colab.research.google.com/github/SpellOnYou/CLab21/blob/main/midterm/word2vec_window_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pathlib import Path

In [2]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [12]:
!cp -R /gdrive/MyDrive/2021/TL/2021-05-24/extracted/AA/. .

In [32]:
!head -n10 /content/processed/wiki_00

Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be undesirable, unnecessary, and harmful. As a historically far-left movement, it is usually described alongside libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement and has a strong historical association with anti-capitalism and socialism.
The history of anarchy goes back to prehistory, when humans arguably lived in anarchic societies long before the establishment of formal states, realms or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose, but it was not until the 19th century that a self-conscious political movement emerged. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourished in most parts of the world and had a significant role in worke

In [2]:
src_path = Path('/content')
trg_path = Path('/content/processed')
trg_path.mkdir(parents = True, exist_ok = True)

In [27]:
def get_wiki(fname):
    if fname in list(i.name for i in src_path.iterdir()):
        all_sents = []
        with (trg_path/fname).open('w') as wf:
            with (src_path/fname).open() as rf:
                all_sents += [doc for doc in rf.read().split('\n\n') if doc if not doc.startswith('<doc') if not doc.startswith('</doc')]
            print('\n'.join(all_sents), file = wf)
        print(f"{fname} written done to {trg_path/fname}")

In [28]:
get_wiki('wiki_00')

wiki_00 written done to /content/processed/wiki_00


In [3]:
trg_path

PosixPath('/content/processed')

In [5]:
def get_data(wikiname):
    with (trg_path/wikiname).open('r') as f:
        ls = f.readlines()
    return ls

In [6]:
wiki_00=get_data('wiki_00')
len(wiki_00)

2751282

In [8]:
wiki_00_split = wiki_00[:int(len(wiki_00)*0.5)]

In [9]:
len(wiki_00_split)

1375641

# Vectorizer

In [10]:
class Vocabulary:
    def __init__(self, corpus):
        '''
        Args:
            corpus: List(str)
        '''    
        self.corpus = corpus
        self.sents = [list(map(self._cleansing_token, doc.split(' '))) for doc in corpus]
        self.vocab = list({token for sentence in self.sents for token in sentence}) + ['UNK']
        self._token_to_idx()
        self._idx_to_token()        

    def _cleansing_token(self, token):
        return token.strip("\"\'\.\'\(\)\-").lower()
    def _token_to_idx(self):
        self._token_to_idx = {token:idx for idx, token in enumerate(self.vocab)}
    
    def _idx_to_token(self):
        self._idx_to_token = {idx:token for idx, token in enumerate(self.vocab)}

    def lookup_token(self, token):
        if token in self._token_to_idx:
            return self._token_to_idx[token]
        else:
            return self._token_to_idx['UNK']
        
    def lookup_idx(self, idx):
        """
        Args:
            idx: int
        """
        # assert type(idx) == int
        return self._idx_to_token[idx]

In [38]:
vocabulary = Vocabulary(wiki_00)

In [39]:
len(vocabulary.vocab)

2896376

In [40]:
vocabulary.lookup_token('add')

1160644

In [11]:
from itertools import cycle

In [12]:
class Vectorizer(Vocabulary):
    """vectorizer which converts vocabulary tokens in sentence to word pairs given window size"""
    def __init__(self, *args):
        #inherit all method / attribute from vocab
        super().__init__(*args)
        # self.__call__(self)
        self.pairs = []

    def __call__(self, wd_size=2):
        for sent in self.sents:
            for idx, token in enumerate(sent):
                # print(sent)
                context_words = sent[(idx-wd_size):idx]+ sent[(idx+1): (idx+wd_size+1)]
                # print(sent, '\n', context_words, token); break
                context_indices = map(self.lookup_token, context_words)
                # print()
                self.pairs += [*zip(cycle([self.lookup_token(token)]), context_indices)]

In [None]:
vect = Vectorizer(wiki_00_split)
vect(2)

In [None]:
del wiki_00_split
gc.collect()

In [None]:
len(vect.sents)

In [None]:
len(vect.pairs)

# Train embedding model

In [None]:
class Linear():
    def __init__(self, w, b):
        self.w, self.b = w, b
    def forward(self, x): 
        self.inp = x
        self.out = self.inp@self.w + self.b
        return self.out
    
    def backward(self):
        self.inp.g = self.out.g @ self.w.t()
        self.w.g = (self.inp.unsqueeze(-1) * self.out.g.unsqueeze(1)).sum(0)
        self.b.g = self.out.g.sum(0)

class Relu():
    def forward(self, x):
        self.inp = x
        self.out = x.clamp_min(0.) - 0.5
        return self.out

    def backward(self):
        self.inp.g = self.out.g* (self.inp>0).float()        

class Softmax():
    def forward(self, x):
        self.inp = x
        self.out = (x.exp()/ x.exp().sum(-1, keepdim=True))
        return self.out

    def backward(self):
        self.inp.g = (1-self.out.g)*self.out.g

In [None]:
class CrossEntropy():
    def __call__(self, pred, y):
        
        self.yhat, self.y = pred, y
        #P(\hat{y})
        self.log_p_yhat = self.log_softmax(pred)
        self.out = self.nll(self.log_p_yhat, y)
        
        return self.out

    #negative log likelihood
    def nll(self, pred, y):
        # print(pred.shape, y.shape)
        return -pred[range(y.shape[0]), y.max(-1).indices].mean()

    def log_softmax(self, x): return x - x.exp().sum(-1,keepdim=True).log()

    def backward(self, inp):
        # set_trace()
        self.yhat.g = (inp.unsqueeze(1)*(self.yhat - self.y).unsqueeze(-1)).sum(-1)
        # self.yhat.g = (self.yhat.exp()/(self.yhat.exp().sum(-1,keepdim=True))) - self.y

class DummyModel():
    def __init__(self, w1, b1, w2, b2):
        self.loss = CrossEntropy()
        self.layers = [Linear(w1,b1), Relu(), Linear(w2, b2)]
    
    def forward(self, x):
        self.x= x

        for layer in self.layers:
            x = layer.forward(x)
        self.out = x
        return self.out

    def backward(self, x2):
        self.loss.backward(x2)
        for layer in reversed(self.layers):
            layer.backward()

In [None]:
n, m, h, c = *train_x.shape, 100, train_y.shape[1]

w1 = randn(m, h) / math.sqrt(h)
w2 = randn(h, c)
b1 = randn(h)
b2 = randn(c)

In [None]:
model = DummyModel(w1, b1, w2, b2)

def train(epochs, bs, lr):
    for e in range(epochs):
        for bs_i in range((n-1)//bs + 1):
            tot_w_mean, tot_w_std = 0, 0
            str_idx, end_idx = bs_i*bs, (bs_i+1)*bs
            x_batch, y_batch = train_x[str_idx:end_idx], train_y[str_idx:end_idx]
            prediction = model.forward(x_batch)
            loss = model.loss(prediction, y_batch)

            model.backward(model.layers[-1].inp)
            
            with no_grad():
                for layer in model.layers:
                    if hasattr(layer, 'w'): #if they have parameter attribute
                        tot_w_mean+= layer.w.g.mean()
                        tot_w_std += layer.w.g.std()
                        layer.w -= layer.w.g * lr
                        layer.b -= layer.b.g * lr
                        layer.w.g.zero_() #initialize them to zero
                        layer.b.g.zero_()
            if bs_i % 20 ==0: print(tot_w_mean/bs, tot_w_std/bs)

## Next: using vect.pairs,
- make one hot encoding with left side integer (X data)
- save right side integer to Y data