In [None]:
!pip install stanza
import stanza
import codecs
import random
import pickle
import math
import numpy

In [None]:
# Mount Google Drive inside the Colab runtime. The corpus and every
# intermediate pickle used below live under "/content/drive/My Drive/".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the Hindi corpus file as UTF-8 and split it into lines.
# The handle is opened in a `with` block so it is closed deterministically
# instead of being left open until garbage collection.
with codecs.open("/content/drive/My Drive/SMDM/hi_wiki.txt",'r','utf-8') as corpusFile:
  HindiTextTotal = corpusFile.read().split('\n')
# Keep only non-empty lines (splitting on '\n' yields '' for blank lines).
HindiTextNonEmpty = [line for line in HindiTextTotal if line]
print("Hindi :",len(HindiTextNonEmpty))

Hindi : 115378


In [None]:
# Fetch the Hindi Stanza models and build a pipeline that only tokenizes.
stanza.download('hi')
nlpHi = stanza.Pipeline(lang='hi', processors='tokenize')

# Sentences holds one list of token strings per sentence detected by Stanza.
Sentences = []
for lineNo, line in enumerate(HindiTextNonEmpty, start=1):
  doc = nlpHi(line)
  Sentences.extend([token.text for token in sent.tokens] for sent in doc.sentences)
  # Progress marker: print the running line count every 1000 input lines.
  if lineNo % 1000 == 0:
    print(lineNo)

In [None]:
# Shuffle the sentences in place with a dedicated, seeded generator so that a
# fresh run produces the same ordering. A private Random instance is used so
# the global `random` state is not reseeded as a side effect.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(Sentences)
print(len(Sentences))

385970


In [None]:
# Persist the tokenized sentences with pickle so later cells can reload them
# without re-running the tokenizer. The file is binary pickle data even though
# its name ends in ".txt".
print("Training Set Size: ", len(Sentences))
with open("/content/drive/My Drive/SMDM/TotalTrainSet.txt", "wb") as fp:  
  pickle.dump(Sentences, fp)

Training Set Size:  385970


In [None]:
# Reload the pickled list of tokenized sentences into TrainSet.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that this notebook itself produced.
with open("/content/drive/My Drive/SMDM/TotalTrainSet.txt", "rb") as fp:
  TrainSet = pickle.load(fp)
print(len(TrainSet))

385970


In [None]:
windowSize = 5
def GetCoocurrenceMatrixAndVocabulary(Set, window=None):
  """Count symmetric-window co-occurrences and collect the vocabulary.

  Args:
    Set: iterable of sentences, each an indexable sequence of hashable tokens.
    window: number of neighbours considered on each side of a token.
      Defaults to the module-level `windowSize` when omitted.

  Returns:
    A tuple `(cooccur, vocabulary)` where `cooccur` maps `(token, context)`
    pairs to the number of times `context` appeared within the window around
    `token`, and `vocabulary` lists every distinct token in first-seen order.
  """
  if window is None:
    window = windowSize

  cooccur = {}
  vocabulary = []
  # Companion set for O(1) membership tests; the list alone would make every
  # "have we seen this token?" check a linear scan over the whole vocabulary.
  seenTokens = set()

  count = 0

  # Iterate over all the sentences
  for sent in Set:
    count += 1
    if count%10000==0:
      print("Reached Sentence:",count)
    sentLength = len(sent)
    for i in range(sentLength):
      tok = sent[i]
      # If token not in the vocabulary, then add it
      if tok not in seenTokens:
        seenTokens.add(tok)
        vocabulary.append(tok)

      # Clamp the context window to the sentence boundaries.
      jmin = max(0,i-window)
      jmax = min(sentLength,i+window+1)

      for j in range(jmin,jmax):
        # A token is not its own context at the same position.
        if i == j:
          continue
        val = (tok,sent[j])
        # Increment the count of the cooccurrence for the pair
        cooccur[val] = cooccur.get(val,0)+1

  return (cooccur,vocabulary)

In [None]:
# Build the co-occurrence counts and the vocabulary from the training sentences.
res = GetCoocurrenceMatrixAndVocabulary(TrainSet)
cooccur, vocabulary = res

print("Size of the vocabulary:",len(vocabulary))
print("Size of the cooccurrenceMatrix:",len(cooccur))

# Cache both results on Drive (vocabulary first, then the counts) so the
# expensive counting pass does not have to be repeated.
for picklePath, payload in (
    ("/content/drive/My Drive/SMDM/VocabularyGloVe.txt", vocabulary),
    ("/content/drive/My Drive/SMDM/CoocurranceMatrix.txt", cooccur),
):
  with open(picklePath, "wb") as fp:
    pickle.dump(payload, fp)

Reached Sentence: 10000
Reached Sentence: 20000
Reached Sentence: 30000
Reached Sentence: 40000
Reached Sentence: 50000
Reached Sentence: 60000
Reached Sentence: 70000
Reached Sentence: 80000
Reached Sentence: 90000
Reached Sentence: 100000
Reached Sentence: 110000
Reached Sentence: 120000
Reached Sentence: 130000
Reached Sentence: 140000
Reached Sentence: 150000
Reached Sentence: 160000
Reached Sentence: 170000
Reached Sentence: 180000
Reached Sentence: 190000
Reached Sentence: 200000
Reached Sentence: 210000
Reached Sentence: 220000
Reached Sentence: 230000
Reached Sentence: 240000
Reached Sentence: 250000
Reached Sentence: 260000
Reached Sentence: 270000
Reached Sentence: 280000
Reached Sentence: 290000
Reached Sentence: 300000
Reached Sentence: 310000
Reached Sentence: 320000
Reached Sentence: 330000
Reached Sentence: 340000
Reached Sentence: 350000
Reached Sentence: 360000
Reached Sentence: 370000
Reached Sentence: 380000
Size of the vocabulary: 336077
Size of the cooccurrenceMatr

In [None]:
# Reload the cached vocabulary list and co-occurrence dictionary from Drive.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that this notebook itself produced.
with open("/content/drive/My Drive/SMDM/VocabularyGloVe.txt", "rb") as fp:
  vocabulary = pickle.load(fp)

with open("/content/drive/My Drive/SMDM/CoocurranceMatrix.txt", "rb") as fp:
  cooccur = pickle.load(fp)

In [None]:
# Hyper-parameters: exponent and saturation point of the weighting function,
# and the dimensionality of each embedding vector.
alpha = 0.75
xmax = 100
vectorSize = 100

def f(x):
  """Weight applied to a co-occurrence count in the training cost.

  Returns (x / xmax) ** alpha while x is below xmax, and the constant 1
  once x reaches xmax, so frequent pairs are capped at full weight.
  """
  if x < xmax:
    return math.pow(x/xmax, alpha)
  return 1

In [None]:
# Module-level parameter tables, each keyed by token:
# a scalar bias and a vector for the token in its "word" role and in its
# "context" role.
biasWord = {}
biasContext = {}
embeddingWord = {}
embeddingContext = {}

def InitializeVectors(vocabulary):
  """Reset the four parameter tables and fill them with random values.

  Every token in `vocabulary` receives two vectors of length `vectorSize`
  drawn from numpy's uniform [0, 1) generator and two scalar biases drawn
  from `random.random()`. The tables are cleared in place, so existing
  references to the dictionaries stay valid.
  """
  for table in (embeddingWord, embeddingContext, biasContext, biasWord):
    table.clear()

  for tok in vocabulary:
    # Draw order (word vector, context vector, word bias, context bias) is
    # kept fixed so a seeded run consumes the generators identically.
    wordVector = numpy.random.random(vectorSize)
    contextVector = numpy.random.random(vectorSize)
    embeddingWord[tok], embeddingContext[tok] = wordVector, contextVector
    biasWord[tok], biasContext[tok] = random.random(), random.random()

In [None]:
numberOfIterations = 50

def trainModel():
  """Fit the word/context embeddings and biases by stochastic gradient descent.

  Re-initializes the module-level parameter tables via
  `InitializeVectors(vocabulary)`, then makes `numberOfIterations` passes over
  every `(token, context)` entry of `cooccur`. For each entry the squared
  error between `word . context + biasWord + biasContext` and `log(count)`,
  weighted by `f(count)`, is added to the pass cost and one gradient step is
  applied to the parameters involved. The learning rate starts at 0.0005 and
  is halved whenever a pass ends with a higher cost than the previous pass.

  Returns nothing; the trained parameters are left in `embeddingWord`,
  `embeddingContext`, `biasWord` and `biasContext`.
  """
  learningRate = 0.0005
  # Initialize the vectors and biases for the whole vocabulary
  InitializeVectors(vocabulary)
  print("Learning Started")
  prevCost = math.inf
  # run gradient descent for numberOfIterations
  for iterationCount in range(numberOfIterations):
    # Initialize cost
    cost = 0
    # Iterate over the co-occurrence matrix
    for (token, context), count in cooccur.items():
      # Pairs of a token with itself are excluded from training.
      if token == context:
        continue
      # Find the weighting function value
      fval = f(count)
      # Intermediate loss is the prediction error inside the squared term
      intermediateLoss = numpy.dot(embeddingWord[token],embeddingContext[context]) + biasWord[token] + biasContext[context] - math.log(count)
      # Update the cost
      cost += math.pow(intermediateLoss,2) * fval

      # Find the gradients. Both are computed before any update so each one
      # uses the other side's pre-update vector.
      gradientBias = 2 * fval * intermediateLoss
      gradientEmbeddingWord = gradientBias * embeddingContext[context]
      gradientEmbeddingContext = gradientBias * embeddingWord[token]

      # Update the vectors. The context-side parameters are indexed by
      # `context`: those are the entries that entered the prediction above,
      # so they are the ones the gradient applies to.
      embeddingWord[token] -= learningRate * gradientEmbeddingWord
      embeddingContext[context] -= learningRate * gradientEmbeddingContext
      # Update the biases
      biasWord[token] -= learningRate * gradientBias
      biasContext[context] -= learningRate * gradientBias

    print("Iteration",iterationCount+1,"Complete. Cost after it:-", cost)

    # If cost has increased then the learning rate must be reduced
    if cost > prevCost:
      learningRate= learningRate/2
    prevCost = cost

  print("Training Complete.")

In [None]:
# Run training, then pickle the resulting word-role embeddings to Drive.
# Only embeddingWord is saved here; the context embeddings and the biases
# are not written out by this cell.
trainModel()
with open("/content/drive/My Drive/SMDM/EmbeddingWord.txt", "wb") as fp:
  pickle.dump(embeddingWord, fp)

Learning Started
Iteration 1 Complete. Cost after it:- 123939985.77611367
Iteration 2 Complete. Cost after it:- 117903460.77235915
Iteration 3 Complete. Cost after it:- 105684728.82918566
Iteration 4 Complete. Cost after it:- 91719616.09895669
Iteration 5 Complete. Cost after it:- 83711414.45911822
Iteration 6 Complete. Cost after it:- 79387158.17370589
Iteration 7 Complete. Cost after it:- 76942769.62194523
Iteration 8 Complete. Cost after it:- 75488090.53121583
Iteration 9 Complete. Cost after it:- 74596717.45495622
Iteration 10 Complete. Cost after it:- 74050372.12846915
Iteration 11 Complete. Cost after it:- 73725273.9901387
Iteration 12 Complete. Cost after it:- 73543105.15423363
Iteration 13 Complete. Cost after it:- 73449369.04362631
Iteration 14 Complete. Cost after it:- 73403428.99103324
Iteration 15 Complete. Cost after it:- 73373659.24956337
Iteration 16 Complete. Cost after it:- 73334906.14392161
Iteration 17 Complete. Cost after it:- 73267030.23763233
Iteration 18 Complete

In [None]:
# Reload the pickled word embeddings (token -> vector) from Drive.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that this notebook itself produced.
with open("/content/drive/My Drive/SMDM/EmbeddingWord.txt", "rb") as fp:
  embeddingWord = pickle.load(fp)

In [None]:
# Render the embeddings as text: one line per token, consisting of the token
# followed by its vector components, all separated by single spaces.
totOutput = "".join(
  val + " " + " ".join(map(str, embeddingWord[val])) + "\n"
  for val in embeddingWord.keys()
)

In [None]:
# Write the rendered embeddings to Drive. The encoding is fixed to UTF-8 so
# the non-ASCII tokens are written the same way regardless of the runtime's
# locale default, and `with` guarantees the file is closed even if the write
# raises.
with open("/content/drive/My Drive/gloveHindi.txt", "w", encoding="utf-8") as file1:
  file1.write(totOutput)