In [None]:
# 1. Driver Code - Starting Point
# 2. Language Identification Algorithm Class
#     1. Public methods for driving identification process
#       0. Min Distance returning function
#       1. Profile Calculation
#       2. Tri-gram Calculation
#       3. Distance Calculation for all Languages
#         1. Distance Calculation for specific language
#     2. Constructor (if required)
#     3. Private Members
#       1. Corpus
#         1. An Crubadan
#       2. N-gram specific Parameters

In [None]:
!pip install regex



In [None]:
import nltk

# Fetch only the NLTK data this notebook reads instead of the entire 'all'
# collection (every corpus and model NLTK distributes):
#   crubadan           -> per-language trigram profiles (nltk.corpus.crubadan)
#   punkt / punkt_tab  -> tokenizer models behind word_tokenize; which of the
#                         two is required depends on the installed NLTK version
for nltk_package in ("crubadan", "punkt", "punkt_tab"):
  nltk.download(nltk_package)

In [None]:
import re
import regex
from nltk.corpus import crubadan
from nltk.util import trigrams
from nltk import word_tokenize
from nltk import FreqDist
    # Frequency Distribution

class LanguagePredictionAlgo:
  '''
  Trigram-based language identifier backed by the An Crubadan corpus.

  A text is reduced to a profile of character trigrams. That profile is
  compared with the trigram profile of every language in the corpus using an
  "out-of-place" distance (difference between a trigram's position in the two
  profiles). The language with the smallest total distance is the prediction.
  '''

  # Distance charged for a text trigram that a language profile does not
  # contain. Meant to dwarf any real position difference
  # (sys.maxsize would serve the same purpose).
  MISSING_TRIGRAM_DIST = 999999999

  # Constructor
  # self => this in C++/Java
  def __init__(self):
    '''
    Initialization of Corpus
    '''

    self.corpus = crubadan

    # lang -> {trigram: position in that language's profile}; filled lazily
    # by _lang_ranks()
    self._lang_rank_cache = {}

    # Ask for every language's profile up front so the corpus object holds
    # all of them in corpus._all_lang_freq, which get_all_lang_dists()
    # iterates over.
    for language in self.corpus.langs():
      self.corpus.lang_freq(language)

  def clear_punctuations(self, text):
    '''
    Clear the punctuation marks to clean the text

    text -> raw input string
    returns -> the text with punctuation removed; apostrophes are kept
    '''

    # [^\P{P}\'] is a double negative: "not (non-punctuation or apostrophe)",
    # i.e. any punctuation character other than the apostrophe.
    return regex.sub(r"[^\P{P}\']+", "", text)

  def ngram_profile(self, text):
    '''
    Generate the trigram profile of the text

    text -> raw input string
    returns -> FreqDist mapping every character trigram to the number of
               times it occurs over ALL tokens of the text; an empty FreqDist
               when the text yields no tokens
    '''
    clean_text = self.clear_punctuations(text)
    tokens = word_tokenize(clean_text)

    profile = FreqDist()

    for token in tokens:
      # Pad with "<" and ">" so word-initial and word-final trigrams are
      # distinguishable: "the" -> "<th", "the", "he>"
      for trigram_chars in trigrams("<" + token + ">"):
        # FreqDist (a Counter) yields 0 for unseen keys, so no membership
        # test is needed before incrementing
        profile["".join(trigram_chars)] += 1

    # Return only after every token has been counted. Returning from inside
    # the loop would profile just the first token and give None for empty text.
    return profile

  def _lang_ranks(self, lang):
    '''
    Return {trigram: position} for the profile of `lang`, where position is
    the trigram's index in the key order of corpus.lang_freq(lang).
    The mapping is built on first use and cached per language.
    '''
    # setdefault keeps this working on instances created without __init__
    cache = self.__dict__.setdefault("_lang_rank_cache", {})
    if lang not in cache:
      lang_profile = self.corpus.lang_freq(lang)
      cache[lang] = {tri: pos for pos, tri in enumerate(lang_profile.keys())}
    return cache[lang]

  def calc_dist(self, lang, trigram, text_profile):
    '''
    Calculate the out-of-place measure for one trigram: the absolute
    difference between its position in the language profile and its position
    in text_profile.

    lang -> language code from the An Crubadan corpus
    trigram -> individual character trigram, e.g. <further> -> <fu, fur, urt, ...
    text_profile -> trigram profile of the text being classified
    returns -> the position difference, or MISSING_TRIGRAM_DIST when the
               language profile does not contain the trigram
    raises -> ValueError when the trigram is in the language profile but not
              in text_profile
    '''
    lang_ranks = self._lang_ranks(lang)

    if trigram not in lang_ranks:
      return self.MISSING_TRIGRAM_DIST

    text_index = list(text_profile.keys()).index(trigram)
    return abs(lang_ranks[trigram] - text_index)

  def get_all_lang_dists(self, text):
    '''
    Compute the total out-of-place distance between the text and every
    language loaded in the corpus.

    text -> raw input string
    returns -> dict mapping language code to summed distance (lower = closer)
    '''
    text_profile = self.ngram_profile(text)

    # Position of each trigram in the text profile, computed once instead of
    # once per language.
    # NOTE(review): positions follow the profile's key order, i.e. first
    # occurrence in the text rather than descending frequency; confirm this
    # is the intended rank for the out-of-place measure.
    text_ranks = {tri: pos for pos, tri in enumerate(text_profile)}

    distances = {}
    for lang in self.corpus._all_lang_freq.keys():
      lang_ranks = self._lang_ranks(lang)
      lang_dist = 0
      for trigram, text_pos in text_ranks.items():
        if trigram in lang_ranks:
          lang_dist += abs(lang_ranks[trigram] - text_pos)
        else:
          lang_dist += self.MISSING_TRIGRAM_DIST

      distances[lang] = lang_dist

    return distances

  def predict_lang(self, text):
    '''
    Return the code of the language whose profile is closest to the text.
    Ties go to the language encountered first. For text without any tokens
    every distance is 0, so the result is not meaningful.
    '''
    all_lang_distances = self.get_all_lang_dists(text)
    return min(all_lang_distances, key=all_lang_distances.get)



In [None]:
# Driver Code
def identify_lang(text=None):
  '''
  Predict the language of a piece of text and print the predicted code.

  text -> string to classify; when omitted, a built-in English sample
          paragraph is used, so identify_lang() with no arguments behaves
          as before
  '''
  if text is None:
    text = "The school's traditions include a 600-year-old ceremony in which the warden, wearing the Founder's Ring, admits each new scholar; \"Illumina\", an autumn celebration, in which candles are placed into niches all over the medieval walls around the playing fields; and \"Morning Hills\", held once a year, when all the school's pupils and teachers climb St Catherine's Hill for a roll call and prayers."

  # Other inputs can be passed in directly, e.g.
  #   identify_lang("I am a teacher")
  #   identify_lang("तब से कई अन्य परियोजनाओं को सम्पादकीय कारणों से विकिपीडिया से अलग किया गया है।")

  lp = LanguagePredictionAlgo()

  # The text's trigram profile is compared with the profile of every
  # language in the corpus (text vs fr, text vs ger, text vs eng, ...);
  # the closest language is printed.
  print(lp.predict_lang(text))

identify_lang()

eng


In [None]:

# from nltk.util import trigrams
# tokens = word_tokenize('I am noob')
# for token in tokens:
#   token_trigrams = trigrams(token)
#   print(token_trigrams)
#   trigram = ["".join(t) for t in token_trigrams]
#   print(trigram)

<generator object trigrams at 0x7f40e7086fc0>
[]
<generator object trigrams at 0x7f40e72e4200>
[]
<generator object trigrams at 0x7f40e70869e8>
['noo', 'oob']


In [None]:
from nltk import FreqDist


In [None]:
# Scratch check: construct an empty FreqDist.
a = FreqDist()

In [None]:
# Display the FreqDist bound to `a`.
a

FreqDist()

In [None]:
from nltk.corpus import crubadan

In [None]:
# Request the frequency profile of every language code the corpus lists.
# The return values are discarded, so the loop runs only for its effect on
# the corpus object — presumably filling crubadan._all_lang_freq; confirm
# against NLTK's Crubadan corpus reader.
for language in crubadan.langs():
  crubadan.lang_freq(language)

In [None]:
# Show the language codes held in the corpus's private _all_lang_freq mapping.
crubadan._all_lang_freq.keys() 

dict_keys(['abk', 'abn', 'ace', 'ach', 'acu', 'ada', 'afr', 'agr', 'aja', 'aka', 'ako', 'alt', 'amc', 'ame', 'amh', 'ami', 'amr', 'arg', 'ang', 'arb', 'arl', 'arn', 'asm', 'ast', 'ava', 'ayr', 'azj', 'bak', 'bcc', 'ban', 'bar', 'bas', 'bba', 'bci', 'bel', 'bem', 'bfa', 'bul', 'bho', 'bis', 'bcl', 'bin', 'bam', 'ben', 'boa', 'bod', 'bre', 'bos', 'btb', 'bxr', 'buc', 'bug', 'bum', 'byv', 'cab', 'cat', 'cak', 'cbr', 'cbs', 'cbt', 'cbu', 'ceb', 'cha', 'chj', 'chk', 'chw', 'cic', 'cjk', 'cnh', 'cni', 'cos', 'cop', 'cot', 'cpu', 'crk', 'crs', 'csa', 'csb', 'ces', 'chu', 'cuk', 'chv', 'cym', 'czt', 'dan', 'dag', 'dar', 'ddn', 'deu', 'dga', 'dhv', 'diq', 'dsb', 'dua', 'dyo', 'dyu', 'dzo', 'ewe', 'efi', 'ell', 'emk', 'eml', 'eng', 'eng ', 'epo', 'spa', 'est', 'eus', 'pes', 'fub', 'fin', 'fij', 'fao', 'fon', 'fra', 'frr', 'frp', 'fud', 'fuf', 'fur', 'fri', 'gaa', 'gle', 'gag', 'gya', 'gla', 'gil', 'gjn', 'gkn', 'glg', 'gug', 'got', 'gsc', 'gsw', 'guc', 'guj', 'guw', 'glv', 'gym', 'hau', 'haw', '

In [None]:
# dict.get(lang) => lang_dist

# dd = {
#     key_1: val_1,
#     key_2: val_2,
#     key_3: val_3
# }
# distances = {
#     "ger": 100,
#     "hin": 2,
#     "eng": 200,
#     "fre": 2000
# }


# ll = [
#  [1, 2],
#  [2, 4],
#  [6, -1]
# ]


# Comparator -
# a < b
# a == b
# a > b

# a < b
# numerical_val(a) - numerical_val(b)
# +1 => a > b 
# -1 => a < b
# 0 => a == b

# a = [
#   a[i] = [1, 2],
#   a[i + 1] = [3, 7],
#   a[i + 2] = [4, 5],
#  .
#  .
#  .
#  .
# ]

# (a[i] vs a[j])
# custom_comparator(i, j) {
#    if (a[i][1] > a[j][1]) {
#        return +1;
#    }
#    else if (a[i][1] < a[j][1]) {
#      return -1;
#    } 
#    else {
#        return 0
#    }
# }


# eng_profile = {
#     trigram_1: freq_1,
#     trigram_2: freq_2, 
# }
# [trigram_2, trigram_1]

# (french test text)
# text_profile = {
#     tri_1: fre_1,
#     tri_2: fre_2
# }
# [tri_2, tri_1]

# if (tri_1 == trigram_2)
# abs(index(tri_1) - index(trigram_2))

# if (trigram_1 != tri_2)
#   sys.maxsize, MAXINT, +999999999 => >>>>>>>>> distance


