# Number Data Generation

##Import Libraries

In [1]:
import random
# !pip install num_to_word
# from num_to_word import num_to_word
!pip install num2words
from num2words import num2words

Defaulting to user installation because normal site-packages is not writeable


Importing the Indic-num2words repository

In [2]:
! git clone https://github.com/sutariyaraj/indic-num2words.git

Cloning into 'indic-num2words'...
remote: Enumerating objects: 120, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 120 (delta 17), reused 31 (delta 6), pack-reused 70[K
Receiving objects: 100% (120/120), 51.33 KiB | 1.43 MiB/s, done.
Resolving deltas: 100% (44/44), done.


In [5]:
import sys
sys.path.insert(0,'indic-num2words')

In [7]:
from num_to_words.num_to_words import num_to_word

## Step - 1 : Random Number Generation

In [8]:
import random

def random_number_gen(max_length: int) -> str:
  
  '''
    input: max_length - int -  denotes the maximum length of the random number
    return: number - string - the random number generated
  '''

  #First generating a random number to get the length (1-max_length)
  length = random.randrange(1,max_length+1)

  #Now generating a random number of above length
  number = ''.join(["{}".format(random.randrange(0, 10)) for _ in range(0, length)])
  return number

In [9]:
print(random_number_gen(10))

0


## Step - 2 : Random Segments Generation

In [10]:
def random_segment_gen(num, max_segment_len): 
  '''
  Inputs - 
    #num is a string of 10 digits - string
    #max_segment_len is the max_len of every segment - int

  Output - 
    returns a list of every segment of num 
  '''

  n = len(num)
  temp = num
  segment_list = []

  while(n>0):
    #get random segment len in the range (1, max_segment_len)
    segment_len = random.randrange(1, max_segment_len+1)

    #get the list of all segments of len segment_len from num
    segment_list.append(temp[0:segment_len])

    #update len of the string left
    n -= segment_len

    #update the string without the prev segments
    temp = temp[segment_len:]

  return segment_list



In [11]:
print(random_segment_gen("11711506", 3))

['1', '1', '7', '1', '150', '6']


## Step -3 Converting to Words (Num2Word)

In [12]:
def choose_lang(choice):
  '''
  Function to return the language string based on the integer language
  Input -
    choice is the numerical value mapped to the language

  Output - 
    returns lang code for num2words
  '''

  language_dict = {1:"en", 2:"hi"}
  return language_dict[choice]
  # if(choice == 1):
  #   return 'en'

  # if(choice == 2):
  #   return 'hi'


In [13]:
def check_double_triple(seg) -> bool:

  '''
  This function is used to check if a segment has all the same characters
  or not.
  Input: seg - str - the string of numbers
  Output: True - if all same chars
          False - if not all same chars
  '''

  if seg.count(seg[0]) == len(seg):
    return True
  else:
    return False
    

In [14]:
def output_individual(seg, lang):

    '''
    Function to output individual digits of the segment
    input:
      num = string of numbers
      lang = language used to represent the number num
    Output: the number represented in the form of words in the required lang
    '''

    num_word_list = []
    for i in range(len(seg)):
        digit_word = num_to_word(seg[i], lang)
        num_word_list.append(digit_word)
    return " ".join(num_word_list)

In [15]:
def output_directly(seg, lang):
    '''
    Function to output the whole number of the segment directly
    input:
      num = string of numbers
      lang = language used to represent the number num
    Output: the number represented in the form of words in the required lang
    '''

    if lang == 'en':
      #When the language is english, we are using the num2words library
      num_word = num2words(seg, lang = lang)
    else:
      #For other indic languages we are using indic-num2words library
      num_word = num_to_word(seg, lang = lang)
    return num_word




In [16]:
#Defining the various counters for each sub case

segments_count = 0
case_en = 0
case_hi = 0

case_dob_trip = 0
case_no_dob_trip = 0

case_all_zero = 0
case_all_zero_dobtrp = 0
case_all_zero_ind = 0

case_not_all_zero = 0
case_not_all_zero_dobtrp = 0
case_not_all_zero_ind = 0
case_not_all_zero_direct = 0


case_no_zero = 0
case_no_zero_ind = 0
case_no_zero_direct = 0

case_zero = 0
case_zero_no_msblsb = 0
case_zero_no_msblsb_not = 0
case_zero_no_msblsb_ind = 0
case_zero_no_msblsb_dir = 0

case_zero_msb = 0
case_zero_msb_00x = 0
case_zero_msb_ind = 0
case_zero_msb_dir = 0

case_zero_lsb = 0
case_zero_lsb_ind = 0
case_zero_lsb_dir = 0


In [17]:
double = 'डबल '
triple = 'ट्रिपल '


def double_triple_case(seg, lang) -> str:

    '''
    Case where the segment has double or triple repeating digits,
    Input: 
       num = string of numbers
       lang = language used to represent the number num
    Output: the number represented in the form of words in the required lang
    '''

    global case_all_zero 
    global case_all_zero_dobtrp
    global case_all_zero_ind

    global case_not_all_zero
    global case_not_all_zero_dobtrp 
    global case_not_all_zero_ind 
    global case_not_all_zero_direct 


    num_word = ""
    digit_word = num_to_word(seg[0], lang)


    #Case when all the digits are 0s
    if seg[0] == '0':
        
        case_all_zero += 1
        #choose randomly between the below two subcases 
        i = random.randint(1, 2)
        
        if i == 1:
            
            case_all_zero_dobtrp += 1
            #Subcase 1: Here output as "double" or "triple" zero
            if len(seg) == 1:
                num_word = digit_word
            elif len(seg) == 2:
                num_word = double + digit_word
            elif len(seg) == 3:
                num_word = triple + digit_word

        else:
            #Subcase 2: Here output the individual digits
            case_all_zero_ind += 1
            num_word = output_individual(seg, lang)

    #The case where all the digits are non 0
    else:

        case_not_all_zero += 1

        #choose randomly between the below three sub cases
        i = random.randint(1, 3)

        if i == 1:

            case_not_all_zero_dobtrp += 1
            #Subcase 1: Here output as "double" or "triple" number
            if len(seg) == 1:
                num_word = digit_word
            elif len(seg) == 2:
                num_word = double+ digit_word
            elif len(seg) == 3:
                num_word = triple + digit_word

        elif i == 2:
            #Subcase 2: Here output the individual digits
            case_not_all_zero_ind += 1
            num_word = output_individual(seg, lang)
        
        else:
            #Subcase 3: Here output the number directly
            case_not_all_zero_direct += 1
            num_word = output_directly(seg, lang)
            #If the number is 111 then randomly choose between
            #"one hundred and eleven" and "hundred and eleven" (only if en)
            if len(seg) == 3 and seg[0] == '1' and lang == "en":
              j = random.randint(1,2)
              if j == 1:
                pass
              else:
                num_word = num_word[4:]

    
    return num_word


In [18]:
def check_zero(num):
  '''
  Check if the given number has a zero, and if it does, return its position in the input string

  Input: num - string of numbers
  Output: list of positions where zero exists

  '''
  #length of the segment
  n = len(num)
  temp = num
  zero_indices_list = []
  i = 0

  #check all characters in the segment
  for c in num:
    if(c=='0'):
      zero_indices_list.append(i)
    i+=1
  
  return zero_indices_list

In [19]:
def no_double_triple(num, lang) -> str: 
  '''
  If the segment does not contain any double or triple repeating digits,
  handle different cases under non repeating numbers.
  Input: 
    num = string of numbers
    lang = language used to represent the number num
  Output: the number represented in the form of words in the required lang
  '''

  global case_no_zero
  global case_no_zero_ind
  global case_no_zero_direct

  global case_zero 
  global case_zero_no_msblsb 
  global case_zero_no_msblsb_not 
  global case_zero_no_msblsb_ind 
  global case_zero_no_msblsb_dir 

  global case_zero_msb 
  global case_zero_msb_00x 
  global case_zero_msb_ind 
  global case_zero_msb_dir 

  global case_zero_lsb 
  global case_zero_lsb_ind 
  global case_zero_lsb_dir

  zero_indices_list = check_zero(num)
  num_zeros = len(zero_indices_list)

  num_word = ""

  #if there are no zeros:
  if(num_zeros==0):
      case_no_zero += 1

      #the good case
      i = random.randint(1,2)
      if i == 1:
          #print("case1a")
          #the individual digit case
          case_no_zero_ind += 1
          num_word = output_individual(num, lang)
      else:
          #print("case1b")
          #num2word
          case_no_zero_direct += 1
          num_word = output_directly(num, lang)
          #If the number is 1XX then randomly choose between
          #"one hundred and XXXX" and "hundred and XXXX" (only if en)
          if len(num) == 3 and num[0] == '1' and lang == 'en':
            j = random.randint(1,2)
            if j == 1:
                pass
            else:
                num_word = num_word[4:]

    

  #atleast one zero
  else:

      case_zero += 1

      #check whether the zero is not in MSB and LSB
      if (0 not in zero_indices_list) and (len(num)-1 not in zero_indices_list):

          case_zero_no_msblsb += 1

          if lang == "en":
            i = random.randint(1,3)
          else:
            i = random.randint(2,3)

          if i == 1 :
              #print("case2aa")
              #not is considered only when language is english
              #digit not digit case
              case_zero_no_msblsb_not += 1
              num_word += num_to_word(num[0], lang)
              num_word += random.choice([" not ", " naught "])
              num_word += num_to_word(num[2], lang)

          elif i == 2:
              case_zero_no_msblsb_ind += 1
              #print("case2ab")
              #digit zero digit case
              num_word = output_individual(num, lang)
          else:
              #print("case2ac")
              #num_2_word case
              case_zero_no_msblsb_dir += 1
              num_word = output_directly(num, lang)
              #If the number is 10X then randomly choose between
              #"one hundred and XXXX" and "hundred and XXXX" (only if en)
              if len(num) == 3 and num[0] == '1' and lang == 'en':
                j = random.randint(1,2)
                if j == 1:
                    pass
                else:
                    num_word = num_word[4:]

      #case when zero in the MSB position
      elif 0 in zero_indices_list:

          case_zero_msb += 1
          if 1 in zero_indices_list:
              case_zero_msb_00x += 1
              #The special case where 00X
              #print("case2ba")
              if lang == "en":
                i = random.randint(1,2)
              else:
                i = 1
              if i == 1:
                #individual digit case
                num_word = output_individual(num, lang)
              else:
                #not not digit case
                num_word += random.choice(["not", "naught"])
                num_word += random.choice([" not ", " naught "])
                num_word += num_to_word(num[-1], lang)
                
              return num_word

          i = random.randint(1,2)
          if i == 1:
              #print("case2bb")
              #individual digit case
              case_zero_msb_ind += 1
              num_word = output_individual(num, lang)
          else:
              #print("case2bc")
              case_zero_msb_dir += 1
              num_word = num_to_word(0, lang)+" " + num_to_word(num[1:], lang)

      #case where zero in LSB position
      elif len(num)-1 in zero_indices_list:

          case_zero_lsb += 1

          i = random.randint(1,2)
          if i == 1:
              #print("case2ca")
              #individual digit case
              case_zero_lsb_ind += 1
              num_word = output_individual(num, lang)
          else:
              #print("case2cb")
              #num2word
              case_zero_lsb_dir += 1
              num_word = output_directly(num, lang)
              #If the number is 1X0 then randomly choose between
              #"one hundred and XXXX" and "hundred and XXXX" (only if en)
              if len(num) == 3 and num[0] == '1' and lang == 'en':
                j = random.randint(1,2)
                if j == 1:
                  pass
                else:
                  num_word = num_word[4:]



  return num_word               



In [20]:
def generate_data(segment_list, num_of_lang, lang_choice = None):
  '''
  Inputs - 
    segment_list = list of segments - each segment is a string of digits of variable len
    num_of_lang = total number of languages for the data

  Outputs - list of words(in En or Hi) corresponding to input list
  '''

  global case_en 
  global case_hi 

  global case_dob_trip
  global case_no_dob_trip
  global segments_count

  final_num = []
  #On every segment
  for seg in segment_list:
    segments_count += 1
    #1. Choose the language randomly - 0 = english, 1 = hindi

    if lang_choice is None:
      lang_num = random.randrange(1, num_of_lang+1)
      lang = choose_lang(lang_num)
    else:
      lang = lang_choice

    if lang == 'en':
      case_en += 1
    elif lang == 'hi':
      case_hi += 1

    
    #Handle the double /triple case
    if check_double_triple(seg):
        case_dob_trip += 1
        num = double_triple_case(seg, lang)

    #Handle the other case
    else:
        case_no_dob_trip += 1
        num = no_double_triple(seg, lang)

    final_num.append(num)


  return " ".join(final_num)



In [21]:


def print_stats(n):
  print("Number of points: ", n)
  print("Number of segments: ", segments_count)
  print("% of English Segments: ", case_en/segments_count)
  print("% of Hindi Segments: ", case_hi/segments_count)
  print("**********************************************************")
  print("% of double/triple digit cases: ",case_dob_trip/segments_count)
  print("% of non double/triple cases: ",case_no_dob_trip/segments_count)
  print("**********************************************************")
  print("% of all zero cases: ", case_all_zero/case_dob_trip)
  print("% of all zero double-triple cases: ", case_all_zero_dobtrp/case_all_zero)
  print("% of all zero individual cases: ", case_all_zero_ind/case_all_zero)
  print()
  print("% of not all zero cases: ", case_not_all_zero/case_no_dob_trip)
  print("% of not all zero double-triple cases: ", case_not_all_zero_dobtrp/case_not_all_zero)
  print("% of not all zero individual cases: ", case_not_all_zero_ind/case_not_all_zero)
  print("% of not all zero direct number cases: ",case_not_all_zero_direct/case_not_all_zero)
  print("************************************************************")
  print("% of no zero case: ", case_no_zero/case_no_dob_trip)
  print("% of atleast one zero case: ", case_zero/case_no_dob_trip)
  print()
  print("% of no zero individual cases: ", case_no_zero_ind/case_no_zero)
  print("% of no zero direct cases: ", case_no_zero_direct/case_no_zero)
  print()
  print("% of zero not in MSB/LSB cases: ", case_zero_no_msblsb/case_zero)
  print("% of zero not in MSB/LSB 'not' cases: ", case_zero_no_msblsb_not/case_zero_no_msblsb)
  print("% of zero not in MSB/LSB individual cases: ", case_zero_no_msblsb_ind/case_zero_no_msblsb)
  print("% of zero not in MSB/LSB direct cases: ", case_zero_no_msblsb_dir/case_zero_no_msblsb)
  print()
  print("% of zero in MSB cases: ", case_zero_msb/case_zero)
  print("% of zero in MSB 00X cases: ", case_zero_msb_00x/case_zero_msb)
  print("% of zero in MSB individual cases: ", case_zero_msb_ind/case_zero_msb)
  print("% of zero in MSB direct cases: ", case_zero_msb_dir/case_zero_msb)
  print()
  print("% of zero in LSB cases: ", case_zero_lsb/case_zero)
  print("% of zero in LSB individual cases: ", case_zero_lsb_ind/case_zero_lsb)
  print("% of zero in LSB direct cases: ", case_zero_lsb_dir/case_zero_lsb)




## Step - 4 Combining All

In [23]:
# from tqdm import tqdm

ModuleNotFoundError: No module named 'tqdm'

In [25]:
n = 1000000
max_length = 10
max_segment_len = 3
num_of_lang = 2
num_list = []
num_word_list = []

f = open('num_word_data.tsv', 'w')

for i in (range(n)):
  num = random_number_gen(max_length)
  num_list.append(num)
  segment_list = random_segment_gen(num, max_segment_len)
  final_word = generate_data(segment_list, num_of_lang, "hi")
  num_word_list.append(final_word)

  #writing to the file
  line = final_word + "\t" + num +"\n"
  f.write(line)


f.close()

print_stats(n)



Number of points:  1000000
Number of segments:  3096823
% of English Segments:  0.0
% of Hindi Segments:  1.0
**********************************************************
% of double/triple digit cases:  0.4835594414017204
% of non double/triple cases:  0.5164405585982796
**********************************************************
% of all zero cases:  0.10002083475236695
% of all zero double-triple cases:  0.5002904240190679
% of all zero individual cases:  0.4997095759809322

% of not all zero cases:  0.8426786300470511
% of not all zero double-triple cases:  0.3332465198554296
% of not all zero individual cases:  0.33316564234182694
% of not all zero direct number cases:  0.3335878378027435
************************************************************
% of no zero case:  0.7688918762602973
% of atleast one zero case:  0.2311081237397027

% of no zero individual cases:  0.5000073188106445
% of no zero direct cases:  0.4999926811893555

% of zero not in MSB/LSB cases:  0.15189777526466586

In [None]:
print(num_word_list[:100])

In [None]:
print(num_list[:100])