## Initial Stage
### Importing Libraries and setting constant values

In [0]:
import numpy as np
import pandas as pd
import math
import re
import json
import pprint
import sys
from operator import itemgetter
from copy import deepcopy
import random
from collections import defaultdict

#constants
# Lower bound applied to every re-estimated translation probability in the M step.
MIN_PROB = 1.0e-12

#input
# Paths of the two parallel corpus files and the language codes that are used
# as dictionary keys for each sentence pair throughout the notebook.
base = "/content/drive/My Drive/"
lang_a = "en" 
infile_a = base + "e.txt"
lang_b = "du"
infile_b = base + "d.txt"

### Loading Data saved in Google Drive

In [7]:
# Mount Google Drive at /content/drive, the directory the input paths are built from.
from google.colab import drive
drive.mount('/content/drive')

### Loading Data and Preprocessing
### The corpus data is loaded and preprocessed by removing punctuation and normalizing the sentences. A list of Dutch–English sentence pairs is built, and a random sample from that list is returned as the output.

In [0]:
#generating corpus which is a vector of dictionaries where each dictionary has a sentence from src language 
#and its corresponding translation in trg (target) language 

def get_corpus(lang_a, infile_a, lang_b, infile_b, sentence_size = None):
    '''
    Load a sentence-aligned corpus from infile_a and infile_b.

    The two UTF-8 files are read in lockstep, one line per sentence; reading
    stops as soon as either file runs out of lines. Every sentence is
    lower-cased and stripped of trailing whitespace.

    lang_a, lang_b -- keys under which the two sentences of a pair are stored
    infile_a, infile_b -- paths of the two parallel text files
    sentence_size -- optional maximum sentence length in characters; a pair is
                     skipped when either side is longer. The length is measured
                     on the stripped sentence, so the line terminator does not
                     count towards the limit.

    Returns a list of dictionaries {lang_a: sentence, lang_b: sentence}.
    '''
    corpus = []
    with open(infile_a, 'r', encoding="utf8") as a, open(infile_b, 'r', encoding="utf8") as b:
        for a_line, b_line in zip(a, b):
            a_sentence = a_line.lower().rstrip()
            b_sentence = b_line.lower().rstrip()
            too_long = sentence_size is not None and (
                len(a_sentence) > sentence_size or len(b_sentence) > sentence_size
            )
            if too_long:
                continue
            corpus.append({
                lang_a : a_sentence,
                lang_b : b_sentence
                })
    return corpus

#removes punctuations from the given sentence

def to_normalized_sentence(sentence):
  """Return sentence with every run of characters that are neither word
  characters, digits, apostrophes nor whitespace removed."""
  unwanted_runs = re.compile(r"[^\w\d'\s]+")
  return unwanted_runs.sub('', sentence)

#preprocess corpus

def preprocess_corpus(corpus):
  """Normalize both sentences of every pair in corpus.

  Each sentence is passed through to_normalized_sentence; a pair is dropped
  when either normalized sentence is the empty string. Returns a new list of
  {lang_a: ..., lang_b: ...} dictionaries; the input is not modified.
  """
  cleaned = []
  for pair in corpus:
    a_clean = to_normalized_sentence(pair[lang_a])
    b_clean = to_normalized_sentence(pair[lang_b])
    if a_clean != '' and b_clean != '':
      cleaned.append({lang_a : a_clean, lang_b : b_clean})
  return cleaned

#generates random sample from the passed corpus

def get_sample(input, size, seed):
  """Seed the module-level random generator with seed and return size
  elements drawn from input without replacement."""
  random.seed(seed)
  chosen = random.sample(input, size)
  return chosen

### Function call for preprocessing the loaded corpus

In [6]:
# Load every aligned sentence pair from the two input files.
corpus_original = get_corpus(lang_a, infile_a, lang_b, infile_b)
##print(len(corpus_original))
# Draw a reproducible random sample: 2 sentence pairs, seed 2.
# NOTE(review): later cells slice p_corpus[:1000] and request samples of 1000
# pairs from it -- confirm that a sample size of 2 is intended here.
corpus = get_sample(corpus_original, 2, 2)
##print(len(corpus))
# Strip punctuation and drop pairs where either side becomes empty.
p_corpus = preprocess_corpus(corpus)

## Training Phase

### Initialising vocabulary for source and target language, and setting initial uniform probabilities for target vocabulary.

In [0]:
#initialises the vocabulary sets for src and trg language

def init_vocab(corpus, src_vocab, trg_vocab):
    """Fill the two vocabulary sets in place from corpus.

    Words of each pair's lang_a sentence are added to trg_vocab and words of
    its lang_b sentence to src_vocab. The NULL token (None) is added to
    src_vocab once; the original code added it twice, which had no effect on
    a set.
    """
    for sentence in corpus:
        trg_vocab.update(sentence[lang_a].split())
        src_vocab.update(sentence[lang_b].split())
    # Add the NULL token
    src_vocab.add(None)

#initialises the translation table probabilities and sets each to 1/(no. of unique words in trg language vocabulary)

def set_initial_probabilities(corpus, translation_table):
    """Give every target word in corpus a row of uniform default probabilities.

    translation_table is updated in place: each target word maps to a
    defaultdict whose default value is 1 / (number of distinct target words).
    """
    src_vocab, trg_vocab = set(), set()
    init_vocab(corpus, src_vocab, trg_vocab)
    uniform_prob = 1 / len(trg_vocab)

    def uniform():
        return uniform_prob

    for t in trg_vocab:
        translation_table[t] = defaultdict(uniform)

### Implementing the EM algorithm and defining training function which returns the translation table.

In [0]:
#EM maximization step - updates probabilities with maximum likelihood estimate

def max_lex_transl_probab(counts, translation_table):
    """M step: overwrite translation_table[t][s] with the maximum likelihood
    estimate counts["t_given_s"][t][s] / counts["any_t_given_s"][s],
    floored at MIN_PROB."""
    pair_counts = counts["t_given_s"]
    source_totals = counts["any_t_given_s"]
    for t, src_words in pair_counts.items():
        for s, pair_count in src_words.items():
            ratio = pair_count / source_totals[s]
            translation_table[t][s] = max(ratio, MIN_PROB)


def maximize_lexical_translation_probabilities_innovate(counts, translation_table, n, corpus):
    """M step with add-n smoothing.

    For every (t, s) pair present in counts["t_given_s"] the estimate is

        (count(t, s) + n) / (count(s) + |V_trg| * n)

    floored at MIN_PROB, where |V_trg| is the number of distinct target
    (lang_a) words in corpus. The estimate is a distribution over target
    words t for a fixed source word s, so the added mass in the denominator
    must be n times the number of possible t values; the previous code used
    the source (lang_b) vocabulary size, which does not normalise.

    counts -- dict with "t_given_s" and "any_t_given_s" fractional counts
    translation_table -- updated in place
    n -- smoothing constant added to every pair count
    corpus -- sentence pairs used to measure the target vocabulary size
    """
    trg_vocab = set()
    for sentence in corpus:
        trg_vocab.update(sentence[lang_a].split())
    # Loop-invariant smoothing mass, computed once.
    smoothing_mass = len(trg_vocab) * n
    source_totals = counts["any_t_given_s"]
    for t, src_words in counts["t_given_s"].items():
        for s, pair_count in src_words.items():
            estimate = (pair_count + n) / (source_totals[s] + smoothing_mass)
            translation_table[t][s] = max(estimate, MIN_PROB)

#recomputes translation probabilities, i.e. the probability of translating a word in the src vocabulary into every word in the trg vocabulary

def train_iter_helper(corpus, translation_table):
    """Run one EM iteration over corpus, updating translation_table in place.

    E step: for every sentence pair, each (target, source) word pair
    contributes its current probability divided by the target word's total
    from get_total_count. M step: max_lex_transl_probab turns the collected
    counts into new probabilities.
    """
    pair_counts = defaultdict(lambda: defaultdict(lambda: 0.0))
    source_totals = defaultdict(lambda: 0.0)
    counts = {"t_given_s": pair_counts, "any_t_given_s": source_totals}

    for aligned_sentence in corpus:
        trg_sentence = aligned_sentence[lang_a].split()
        src_sentence = aligned_sentence[lang_b].split()

        # E step - normalization factor for each target word
        total_count = get_total_count(src_sentence, trg_sentence, translation_table)

        # E step - accumulate fractional counts
        for t in trg_sentence:
            for s in src_sentence:
                weight = prob_alignment_point(s, t, translation_table) / total_count[t]
                pair_counts[t][s] += weight
                source_totals[s] += weight

    # M step - re-estimate probabilities from the collected counts
    max_lex_transl_probab(counts, translation_table)
def generate_models(corpus, seeds, sample_size, iterations):
  """Train one model per seed.

  For each seed the module-level random generator is reseeded, sample_size
  sentence pairs are drawn from corpus, and train is run on them for the
  given number of iterations. Returns the models in seed order.
  """
  trained = []
  for seed in seeds:
    random.seed(seed)
    subset = random.sample(corpus, sample_size)
    trained.append(train(subset, iterations))
    print("Done for seed " + str(seed))
  return trained

def ensemble(models):
  """Combine several translation tables into one word -> best candidate map.

  models -- iterable of tables shaped {word: {candidate: probability}}

  For every word that appears in at least one model, each candidate's
  probability is averaged over the models that list that candidate, and the
  candidate with the highest average is chosen (the first one encountered
  wins ties). Words for which no model lists any candidate are left out of
  the result.

  The previous implementation referenced an undefined name inside a bare
  ``except``, so the average was never computed and only the first model's
  probabilities were used.
  """
  keys = set()
  for model in models:
    keys.update(model.keys())

  ensemble_translations = {}
  for key in keys:
    mean_prob = {}  # candidate -> running mean of its probability
    seen = {}       # candidate -> number of models that contributed
    for model in models:
      if key not in model:
        continue
      for k, prob in model[key].items():
        if k in mean_prob:
          # Incremental mean: fold the new value into the mean of seen[k] values.
          mean_prob[k] = (prob + mean_prob[k] * seen[k]) / (seen[k] + 1)
          seen[k] += 1
        else:
          mean_prob[k] = prob
          seen[k] = 1
    if not mean_prob:
      # max() would raise on an empty candidate set.
      continue
    ensemble_translations[key] = max(mean_prob, key=mean_prob.get)
  return ensemble_translations


#trains the model on the given training corpus for the given number of iterations

def train(corpus, iterations, dump_filename = None):
  """Train a translation table on corpus with the given number of EM iterations.

  corpus -- list of sentence-pair dictionaries
  iterations -- number of calls to train_iter_helper
  dump_filename -- optional filename prefix; when given, the table is written
                   with dump_model to "<prefix><i>.json" on every tenth
                   iteration (i = 0, 10, 20, ...). The previous condition
                   ``i % 10`` was inverted and dumped on every iteration
                   except those.

  Returns the translation table {target word: {source word: probability}}.
  """
  translation_table = {}
  set_initial_probabilities(corpus, translation_table)
  for i in range(iterations):
    train_iter_helper(corpus, translation_table)
    print(str(i) + " iterations completed")
    if i % 10 == 0 and dump_filename is not None:
      dump_model(translation_table, filename = dump_filename + str(i) + ".json")
  return translation_table


### Defining functions that compute the probability of all possible word alignments, expressed as a marginal distribution over target words t in a target sentence.

In [0]:
#returns probability of src language word 's' being translated to trg language word 't'

def prob_alignment_point(source, target, translation_table):
    """Look up the table entry for the (target, source) word pair."""
    target_row = translation_table[target]
    return target_row[source]

#computes the sum of probability of all possible alignments, for the translation of src sentence into trg sentence

def get_total_count(src_sentence, trg_sentence, translation_table):
    """Return, for each target word, the sum of its table probabilities over
    all words of the source sentence.

    The result is a defaultdict with a 0.0 default; a target word occurring
    several times in trg_sentence accumulates once per occurrence.
    """
    totals = defaultdict(float)
    word_pairs = (
        (target, source)
        for target in trg_sentence
        for source in src_sentence
    )
    for target, source in word_pairs:
        totals[target] += prob_alignment_point(source, target, translation_table)
    return totals


## Testing Phase
### Function for translating test sentence based on the trained model which contains word translations.

In [0]:
#computes TF-IDF weights for words in src and trg language documents 
def get_doc_vectors(docA, docB, idf):
  """Return the TF-IDF vectors of docA and docB over their joint vocabulary.

  docA, docB -- whitespace-separated documents
  idf -- mapping word -> IDF weight; must contain every word of both
         documents (a missing word raises KeyError)

  Term frequency is 1 + log10(count) for words present in a document and 0
  otherwise. Both returned arrays use the same column order.
  """

  def log_tf(count):
    # Sub-linear term frequency; absent words stay at 0.
    return 0 if count == 0 else 1 + math.log10(count)

  def weighted_vector(tokens, vocabulary):
    occurrences = dict.fromkeys(vocabulary, 0)
    for token in tokens:
      occurrences[token] += 1
    return {word: log_tf(count) * idf[word] for word, count in occurrences.items()}

  tokens_a = docA.split()
  tokens_b = docB.split()
  vocabulary = set(tokens_a) | set(tokens_b)

  frame = pd.DataFrame([weighted_vector(tokens_a, vocabulary),
                        weighted_vector(tokens_b, vocabulary)])
  vectors = frame.values
  return vectors[0], vectors[1]

#computes cosine similarity

def cosine_similarity(idf, docA, docB):
  """Return the cosine similarity of the TF-IDF vectors of docA and docB.

  idf -- mapping word -> IDF weight, passed to get_doc_vectors
  docA, docB -- whitespace-separated documents

  Returns -1 when the vectors cannot be built or compared, for example when
  a word is missing from idf. Only ordinary exceptions are converted to -1;
  KeyboardInterrupt and SystemExit now propagate instead of being swallowed
  by a bare ``except``.
  """
  try:
    a, b = get_doc_vectors(docA, docB, idf)
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    # NOTE(review): a zero-norm vector presumably yields nan here rather than
    # an exception -- confirm how callers should treat that case.
    return dot_product / (norm_a * norm_b)
  except Exception:
    return -1


#computes IDF values corresponding to all the unique words in every document in docList

def compute_IDFs(corpus):
  """Compute IDF weights for both languages of corpus.

  Each sentence is one document. Returns
  {lang_a: {word: idf}, lang_b: {word: idf}} with
  idf = log10(N / document frequency), N being the number of sentences.
  """

  def compute_IDF(docList):
      # Count, in one pass, how many documents contain each word. The
      # previous version tested `word in doc` on a list for every
      # (word, document) combination, which gave the same counts at
      # quadratic cost.
      doc_freq = {}
      for doc in docList:
          for word in set(doc):
              doc_freq[word] = doc_freq.get(word, 0) + 1
      N = len(docList)
      return {word: math.log10(N / float(freq)) for word, freq in doc_freq.items()}

  docListA = []
  docListB = []
  for pair in corpus:
    docListA.append(pair[lang_a].split())
    docListB.append(pair[lang_b].split())
  return {lang_a: compute_IDF(docListA), lang_b: compute_IDF(docListB)}



### Utility functions for computing the Jaccard coefficient and cosine similarity, which are used for evaluating the results

In [None]:
#computes jaccard coefficient

def jaccard_similarity(docA, docB):
  """Return the Jaccard coefficient of the word sets of docA and docB.

  Both documents are split on whitespace and compared as sets, so the score
  is |A & B| / |A | B|. The previous version sized the union from the token
  lists, which gave wrong scores whenever a word was repeated. Two documents
  without any words are treated as identical and score 1.0 instead of raising
  ZeroDivisionError.
  """
  words_a = set(docA.split())
  words_b = set(docB.split())
  union = words_a | words_b
  if not union:
    return 1.0
  return len(words_a & words_b) / len(union)

#returns a dictionary of cosine similarity and jaccard coefficient

def similarity(idf, docA, docB):
  """Return both similarity scores of docA and docB as
  {"cosine": ..., "jaccard": ...}."""
  cosine_score = cosine_similarity(idf, docA, docB)
  jaccard_score = jaccard_similarity(docA, docB)
  return {"cosine" : cosine_score, "jaccard" : jaccard_score}

### Functions for getting the translated sentences and the corresponding performance score

In [2]:
#returns dictionary which contains word translations from src language to trg language

def get_translations(translation_table):
  """Map every key of translation_table to the entry of its row with the
  highest probability (the first such entry on ties)."""
  return {
      t: max(candidates, key=candidates.get)
      for t, candidates in translation_table.items()
  }

#accepts a sentence and returns its translation into trg language

def translate_sentence(model, sentence):
    """Translate sentence word by word.

    The best candidate for every word is taken from get_translations(model);
    words without an entry are kept unchanged. Returns the words joined by
    single spaces.
    """
    word_map = get_translations(model)
    return " ".join(word_map.get(token, token) for token in sentence.split())

## Result Phase
### Calling Function for Training

In [8]:
# Train on at most the first 1000 preprocessed pairs for 2 EM iterations.
model = train(p_corpus[:1000], 2)
# IDF tables for both languages over the same slice; used for cosine similarity.
idfs = compute_IDFs(p_corpus[:1000])

### Saving the generated model as a json file

In [0]:
def dump_model(model, filename, idfs = None):
  """Write model and idfs to filename as one JSON object with the keys
  "model" and "idfs"."""
  payload = dict(model = model, idfs = idfs)
  with open(filename, "w+") as out:
    json.dump(payload, out)

### Computing and printing Average Cosine Similarity and Jaccard Coefficient for all the test cases.

In [0]:
# Average both similarity scores over every pair in p_corpus.
cos_sim = 0
jac_sim = 0
for i in range(len(p_corpus)):
  # Reference: the lang_b sentence of the pair.
  docA = p_corpus[i][lang_b]
  # Candidate: word-by-word translation of the lang_a sentence.
  docB = translate_sentence(model, p_corpus[i][lang_a])
  sim = similarity(idfs[lang_b], docA, docB)
  jac_sim += sim["jaccard"]
  # cosine_similarity returns -1 on failure; that value is averaged in as-is.
  cos_sim += sim["cosine"]
jac_sim /= len(p_corpus)
cos_sim /= len(p_corpus)
print("Jaccard: " + str(jac_sim) + " Cosine: " + str(cos_sim))

## Ensemble Learning for the Innovation Part
### Implementing Ensemble Learning

In [0]:
import operator

# generating sets of model for ensemble learning
def generate_models(corpus, seeds, sample_size, iterations):
  """Train one model per seed.

  For each seed the module-level random generator is reseeded, sample_size
  sentence pairs are drawn from corpus, and train is run on them for the
  given number of iterations. Returns the models in seed order.
  """
  trained = []
  for seed in seeds:
    random.seed(seed)
    subset = random.sample(corpus, sample_size)
    trained.append(train(subset, iterations))
    print("Done for seed " + str(seed))
  return trained

# get the best translation for each word by averaging probabilities across the ensemble of models
def ensemble(models):
  """Combine several translation tables into one word -> best candidate map.

  models -- iterable of tables shaped {word: {candidate: probability}}

  For every word that appears in at least one model, each candidate's
  probability is averaged over the models that list that candidate, and the
  candidate with the highest average is chosen (the first one encountered
  wins ties). Words for which no model lists any candidate are left out of
  the result.

  The previous implementation referenced an undefined name inside a bare
  ``except``, so the average was never computed and only the first model's
  probabilities were used.
  """
  keys = set()
  for model in models:
    keys.update(model.keys())

  ensemble_translations = {}
  for key in keys:
    mean_prob = {}  # candidate -> running mean of its probability
    seen = {}       # candidate -> number of models that contributed
    for model in models:
      if key not in model:
        continue
      for k, prob in model[key].items():
        if k in mean_prob:
          # Incremental mean: fold the new value into the mean of seen[k] values.
          mean_prob[k] = (prob + mean_prob[k] * seen[k]) / (seen[k] + 1)
          seen[k] += 1
        else:
          mean_prob[k] = prob
          seen[k] = 1
    if not mean_prob:
      # max() would raise on an empty candidate set.
      continue
    ensemble_translations[key] = max(mean_prob, key=mean_prob.get)
  return ensemble_translations


In [9]:
# Train four models (seeds 1-4), each on a random sample of 1000 pairs for 10 iterations.
ensemble_models = generate_models(p_corpus, [1, 2, 3, 4], 1000, 10)

In [10]:
# Merge the per-seed models into a single word -> translation mapping and print it.
ensemble_translations = ensemble(ensemble_models)
print(ensemble_translations)

## Displaying the final Results Obtained

In [0]:
def read_output_json(filename):
  """Parse the JSON file at filename and return the decoded object."""
  with open(filename, "r") as source:
    return json.load(source)

def show_results(input_json):
  """Print the title, corpus size and similarity scores of a results object.

  input_json -- dict with the keys "title", "corpus_size", "corpus" and
                "score" (itself a dict with "cosine" and "jaccard")

  The scores are passed through str() before concatenation, as corpus_size
  already was, so numeric scores no longer raise TypeError; string scores
  print exactly as before.
  """
  print(input_json["title"])
  print("Corpus Length: " + str(input_json["corpus_size"]) )
  # NOTE(review): the preprocessed corpus is not used in the visible part of
  # this function -- confirm whether it is still needed.
  pr_corpus = preprocess_corpus(input_json["corpus"])
  print("Similarity Scores: ")
  print("Cosine Similarity: " + str(input_json["score"]["cosine"]))
  print("Jaccard Coefficient: " + str(input_json["score"]["jaccard"]))