In [38]:
###Mounting on drive

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [0]:
###Reading english and german file 

with open("/content/drive/My Drive/de-en.en", "r") as file:
  Eng_Ger_lang = file.read()
  
with open("/content/drive/My Drive/de-en.de", "r") as file:
  Ger_Eng_lang = file.read()

In [0]:
#Importing dependencies

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import RegexpTokenizer
from string import punctuation

def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

##Preprocessing function  
def preprocessing_word(file):
    file=file.lower()
    file_punc=''
    for string in file:
      for char in string:
        if char not in punctuation:
          file_punc += char
        else:
          file_punc += ''
    file_punc = file_punc.replace('\n',' ')
    return file_punc  

def preprocessing(file):
    file=file.lower()
    file_punct = strip_punctuation(file)
    return file_punct

##Preprocessing the english and german corpus
Pre_Eng_Ger_Sent=preprocessing(Eng_Ger_lang)
Pre_Ger_Eng_Sent=preprocessing(Ger_Eng_lang)



In [0]:
###Splitting the data by sentence
Pre_Eng_Ger_Sent = Pre_Eng_Ger_Sent.split('\n')
Pre_Ger_Eng_Sent = Pre_Ger_Eng_Sent.split('\n')



In [42]:
###Checking the splitted data by slicing it
print(Pre_Eng_Ger_Sent[:5])

print(Pre_Ger_Eng_Sent[:5])


['resumption of the session', 'i declare resumed the session of the european parliament adjourned on friday 17 december 1999  and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period ', 'although  as you will have seen  the dreaded  millennium bug  failed to materialise  still the people in a number of countries suffered a series of natural disasters that truly were dreadful ', 'you have requested a debate on this subject in the course of the next few days  during this partsession ', 'in the meantime  i should like to observe a minute  s silence  as a number of members have requested  on behalf of all the victims concerned  particularly those of the terrible storms  in the various countries of the european union ']
['wiederaufnahme der sitzungsperiode', 'ich erkläre die am freitag  dem 17 dezember unterbrochene sitzungsperiode des europäischen parlaments für wiederaufgenommen  wünsche ihnen nochmals alles gute zum jahreswechsel und

In [0]:
###Splitting the data by words

eng_ger_words = []
for sentence in Pre_Eng_Ger_Sent:
  for word in sentence.split():
    eng_ger_words.append(word)

ger_eng_words = []
for sentence in Pre_Ger_Eng_Sent:
  for word in sentence.split():
    ger_eng_words.append(word)

In [53]:
####Declaring the vocabulary and taking the unique words in to count
Voc_english_german=set(eng_ger_words)
Unique_english_german = len(Voc_english_german)

Voc_german_english=set(ger_eng_words)
Unique_german_english = len(Voc_german_english)


##Initializing the probabilities of english in english vs german data
english_translation_probability_2 = {}


  
for e in Voc_english_german:
  german_translation_probability = {}
  for g in Voc_german_english:
    german_translation_probability[g]=(1/Unique_english_german)
  english_translation_probability_2[e]=german_translation_probability



english_translation_probability_2['resumption']['wiederaufnahme']

0.0002945508100147275

In [0]:
####main
import time 
from copy import deepcopy

## setting convergence point    
convergence_point = 0.0021  

##taking sentence pairing and initialisng sum as 0
def sentence_pair(counts,total_german,prob):
  for es,gs in zip(Pre_Eng_Ger_Sent, Pre_Ger_Eng_Sent):
    eng_word=es.split()
    ger_word=gs.split()
    sum = {}
    for ew in eng_word:
      sum[ew] = 0
      for gw in ger_word:
        sum[ew] += prob[ew][gw]
#       print(sum[ew])

##setting count of aligned text probabilities. 
    for ew in eng_word:
      for gw in ger_word:
        counts[ew][gw] += prob[ew][gw]/sum[ew]                                       
        total_german[gw] += prob[ew][gw]/sum[ew]
  return (total_german, counts)
        
def update_trans_prob(counts,total_german,prob):

##updating the probabilities
  for gw in Voc_german_english:
    for ew in Voc_english_german:
      prob[ew][gw] = counts[ew][gw]/total_german[gw]
  return prob 

##calculating the error, separating the two corpus
def error_cal(prob,last_trans_prob):
  sum_of_difference_in_prob = 0
  for(ew,gw) in zip(Voc_english_german,Voc_german_english):
    sum_of_difference_in_prob += (prob[ew][gw] - last_trans_prob[ew][gw])**2
  sum_of_difference_in_prob = sum_of_difference_in_prob**0.5
  return sum_of_difference_in_prob  


def init_total_german():
  total_german = {}
  for gw in Voc_german_english:
    total_german[gw] = 0
  return total_german


## initialize count

def init_count_english_german():
  counts = {}
  for ew in Voc_english_german:
    init_foreign_lang_2 = {}
    for gw in Voc_german_english:
      init_foreign_lang_2[gw] = 0
    counts[ew] = init_foreign_lang_2
  return counts

def word_translation_probabilities():   
  convergence_point = 0.0021  
  
## Initializing variables  
  error_value = 10
  prob = deepcopy(english_translation_probability_2)
  iter = 0
## loop for convergence to compare the values
  while (error_value > convergence_point):
        
    last_trans_prob = deepcopy(prob)
  
    total_german = init_total_german()
    
    counts = init_count_english_german()
    
    total_german, counts = sentence_pair(counts,total_german,prob)
                    
    prob = update_trans_prob(counts,total_german,prob)
    
    error_value = error_cal(prob,last_trans_prob) 
    
    iter  = iter + 1 
    print( iter , error_value)
    
  return prob

In [46]:
##printing model per iteration and their respective error rates 

german_prob = word_translation_probabilities()

1 0.1775512209834461
2 0.07804514337783786
3 0.04980991590017922
4 0.037188280712541426
5 0.025645502726177195
6 0.018760479003059497
7 0.014835621480698976
8 0.011969915381560435
9 0.009627444370879097
10 0.007729159088176801
11 0.006230721805926982
12 0.00506452362756086
13 0.004161810083462091
14 0.003463917232738015
15 0.002922924101568709
16 0.0025003223116882084
17 0.0021659934583184324
18 0.001897274303796221


Printing word pairs with probability greater than 0.2.

In [47]:
###priting english german translation probability tables

for ew in german_prob.keys():
  for gw,pb in german_prob[ew].items():
    if pb > 0.2:
      print("{}           {}           {}".format(ew,gw,pb))


base           basis           0.33591290215231356
there           fälle           0.35960603875406616
there           gab           0.5275943291534336
there           benötigen           0.23877620093278878
there           bestehen           0.7493559103326707
there           weitere           0.2642533725785605
there           augen           0.5257631788764054
there           anscheinend           0.31002922289129853
there           zuletzt           0.4631423546930313
there           gefahr           0.26379504116835967
there           einfach           0.5009109373306248
there           sicherlich           0.28980169865016236
there           gibt           0.7244923094314192
there           brandschutzbelehrungen           0.2672660979225157
there           dort           0.5830262041741462
there           indonesien           0.39099150426245644
object           einwandfrei           0.22587362401764982
newspaper           anfragen           0.26761332364680956
slim           he

In [52]:
###german vs english correspoding sentences printed into tokens and separate lists for each sentence

for gs in Pre_Ger_Eng_Sent:
  ewds = []

  for gw in gs.split():
    value_max = -999999
    for ew in german_prob.keys():
        if german_prob[ew][gw] > value_max:
          value_max = german_prob[ew][gw]
          cewd = ew

    ewds.append(cewd)
  print(gs.split())
  print(ewds)

['wiederaufnahme', 'der', 'sitzungsperiode']
['resumption', 'the', 'you']
['ich', 'erkläre', 'die', 'am', 'freitag', 'dem', '17', 'dezember', 'unterbrochene', 'sitzungsperiode', 'des', 'europäischen', 'parlaments', 'für', 'wiederaufgenommen', 'wünsche', 'ihnen', 'nochmals', 'alles', 'gute', 'zum', 'jahreswechsel', 'und', 'hoffe', 'daß', 'sie', 'schöne', 'ferien', 'hatten']
['i', 'pleasant', 'the', 'on', 'friday', 'the', '17', 'december', 'pleasant', 'you', 'the', 'european', 'parliament', 'for', 'pleasant', 'like', 'you', 'once', 'everything', 'a', 'the', 'pleasant', 'and', 'hope', 'that', 'you', 'pleasant', 'pleasant', 'i']
['wie', 'sie', 'feststellen', 'konnten', 'ist', 'der', 'gefürchtete', 'milleniumbug', 'nicht', 'eingetreten', 'doch', 'sind', 'bürger', 'einiger', 'unserer', 'mitgliedstaaten', 'opfer', 'von', 'schrecklichen', 'naturkatastrophen', 'geworden']
['as', 'you', 'truly', 'were', 'is', 'the', 'truly', 'truly', 'not', 'truly', 'the', 'are', 'citizens', 'have', 'our', 'memb