 # Text Generation

In [None]:
"""
Markov Chain Model:

    * Probabilistic model for text / natural-language generation
    * Simple and Effective way of generating new text:
    * Text, Lyrics, Story/Novel, Code
    
    * selects based on probability.
    * give past data ==> generate new data
    
    * 2 state probability: A _ _
        * find the sequence of characters whose probability is high
        * Character based model ==> generates 1 char at a time
                                ==> given some past char, generate next char
        * Computing probability is important.
        
    * Computing Probability:
        * We will have a transition table (to build the table we will use dictionaries).
        
                                
        
    

Given the data, we train the model so that it can predict the next character and generate text.
"""

In [1]:
# code to get the transition table
def generateTable(data, k = 4):
    """Build a character-level transition table of raw counts.

    Each ``k``-character window of ``data`` that is followed by at least one
    more character becomes a key. Its value is a dict mapping every character
    seen immediately after that window to the number of times it followed.

    Args:
        data: Training text.
        k: Length of the context window, in characters.

    Returns:
        dict mapping a ``k``-character context to ``{next_char: count}``.
    """
    table = {}
    for start in range(len(data) - k):
        stop = start + k
        window = data[start:stop]  # the k-character context
        follower = data[stop]  # the character right after the context
        followers = table.setdefault(window, {})
        followers[follower] = followers.get(follower, 0) + 1
    return table
        

In [13]:
T = generateTable("hello hello helli helly")

In [14]:
T

{'hell': {'o': 2, 'i': 1, 'y': 1},
 'ello': {' ': 2},
 'llo ': {'h': 2},
 'lo h': {'e': 2},
 'o he': {'l': 2},
 ' hel': {'l': 3},
 'elli': {' ': 1},
 'lli ': {'h': 1},
 'li h': {'e': 1},
 'i he': {'l': 1}}

In [16]:
# the numbers can be converted into the probability.
T['hell'].values()

dict_values([2, 1, 1])

In [17]:
sum(T['hell'].values())

4

In [18]:
def convertFreqToProbab(T):
    """Normalise each context's follower counts so they sum to 1.

    The table is modified in place: every inner count is divided by the
    total of its own inner dict.

    Args:
        T: dict mapping a context to ``{next_char: count}``.

    Returns:
        The same ``T`` object, now holding ``{next_char: probability}``.
    """
    for followers in T.values():
        total = sum(followers.values())
        # Only values are reassigned, so iterating items() here is safe.
        for char, count in followers.items():
            followers[char] = count / total

    return T

In [19]:
T = convertFreqToProbab(T)

In [20]:
T

{'hell': {'o': 0.5, 'i': 0.25, 'y': 0.25},
 'ello': {' ': 1.0},
 'llo ': {'h': 1.0},
 'lo h': {'e': 1.0},
 'o he': {'l': 1.0},
 ' hel': {'l': 1.0},
 'elli': {' ': 1.0},
 'lli ': {'h': 1.0},
 'li h': {'e': 1.0},
 'i he': {'l': 1.0}}

In [None]:
# Read actual data

In [55]:
def load_text(filepath):
    """Read the UTF-8 text file at ``filepath`` and return it lower-cased."""
    with open(filepath, encoding="utf8") as handle:
        contents = handle.read()
    return contents.lower()

In [56]:
text = load_text('./speech.txt')

In [58]:
#print(text)

In [None]:
# Training the Markov Chains

In [59]:
def trainMarkovChainModel(text, k = 4):
    """Train the Markov chain model on ``text``.

    Builds the count table with ``generateTable`` and passes it through
    ``convertFreqToProbab``.

    Args:
        text: Training text.
        k: Context length passed on to ``generateTable``.

    Returns:
        The table returned by ``convertFreqToProbab``.
    """
    counts = generateTable(text, k)
    return convertFreqToProbab(counts)

In [60]:
model = trainMarkovChainModel(text) # pick the max probab max amt of time.

In [61]:
model

{'26 8': {' ': 1.0},
 '6 8 ': {'2': 1.0},
 ' 8 2': {'0': 1.0},
 '8 20': {'1': 1.0},
 ' 201': {'6': 0.20918367346938777,
  '7': 0.03571428571428571,
  '5': 0.3979591836734694,
  '4': 0.21173469387755103,
  '8': 0.01020408163265306,
  '3': 0.061224489795918366,
  ' ': 0.00510204081632653,
  '2': 0.00510204081632653,
  '9': 0.03571428571428571,
  '1': 0.015306122448979591,
  '0': 0.012755102040816327},
 '2016': {',': 0.6265060240963856,
  ';': 0.03614457831325301,
  '.': 0.1927710843373494,
  ' ': 0.10843373493975904,
  '-': 0.03614457831325301},
 '016,': {' ': 1.0},
 '16, ': {'i': 0.5714285714285714,
  's': 0.05357142857142857,
  'm': 0.07142857142857142,
  'a': 0.03571428571428571,
  'k': 0.07142857142857142,
  'h': 0.017857142857142856,
  'u': 0.05357142857142857,
  'b': 0.03571428571428571,
  'd': 0.017857142857142856,
  't': 0.03571428571428571,
  'c': 0.017857142857142856,
  'j': 0.017857142857142856},
 '6, i': {'n': 0.8787878787878788,
  'r': 0.09090909090909091,
  't': 0.030303030

In [62]:
# Generate Text!

In [63]:
import random

In [64]:
fruits = ["apple", "mango", "banana"]
probabilities = [0.7, 0.2, 0.1]

In [82]:
for i in range(10):
    print(random.choices(fruits, weights = probabilities)[0])

mango
apple
apple
mango
apple
apple
apple
apple
banana
banana


In [95]:
def sample_next_char(context, T, k):
    """Sample the character that follows ``context`` according to ``T``.

    Args:
        context: Text generated so far; only its last ``k`` characters are
            used for the lookup.
        T: dict mapping a context to ``{next_char: weight}``.
        k: Number of trailing characters of ``context`` used as the key.

    Returns:
        One key of ``T[context[-k:]]`` drawn with ``random.choices`` using
        the stored values as weights, or ``" "`` when that context is not
        in ``T``.
    """
    distribution = T.get(context[-k:])
    if distribution is None:
        # Unseen context: fall back to a space.
        return " "

    candidates = list(distribution.keys())
    weights = list(distribution.values())
    return random.choices(candidates, weights = weights)[0]
        

In [96]:
sample_next_char("countrymen", model, 4)

't'

In [None]:
# generate para

In [97]:
def generateText(starting_sent, model,  k = 4, maxLen = 100):
    """Extend ``starting_sent`` with ``maxLen`` sampled continuations.

    On every step the last ``k`` characters of the text built so far are
    passed to ``sample_next_char``, and its result is appended.

    Args:
        starting_sent: Seed text that the output starts with.
        model: Transition table passed to ``sample_next_char``.
        k: Context length, in characters.
        maxLen: Number of sampling steps to run.

    Returns:
        The seed text followed by everything that was sampled.
    """
    generated = starting_sent
    for _ in range(maxLen):
        window = generated[-k:]  # trailing k characters form the context
        generated = generated + sample_next_char(window, model, k)
    return generated

    

In [99]:
prediction_sen = generateText("dear", model, k = 4, maxLen = 100)

In [100]:
print(prediction_sen)

dear agreeting. 
thus, when well. 
our share mediated that jacob zuma, 
next rock month limiter#
20 9 20


In [None]:
# This is a probabilistic approach, so the output has some problems, but it
# does produce some correct words.
# This can be used for "word completion or auto suggestions."

In [None]:
# Experimentation:

In [84]:
context = "hey boy, dear countrymen"

In [85]:
context[-4::]

'ymen'

In [87]:
model.get('ymen') # look up the context 'ymen'; .get would return None if the key did not exist
# The keys of the returned dict are the possible next characters, the values their probabilities

{',': 0.2825112107623318,
 't': 0.45739910313901344,
 ' ': 0.21973094170403587,
 '!': 0.013452914798206279,
 '.': 0.02242152466367713,
 '?': 0.004484304932735426}

In [88]:
list(model.get('ymen'))

[',', 't', ' ', '!', '.', '?']

In [107]:
# Reorder the characters of `st`: ASCII lowercase letters first, then
# everything else, keeping the original relative order inside each group.
st = "HellO"
low = [ch for ch in st if "a" <= ch <= "z"]
cap = [ch for ch in st if not ("a" <= ch <= "z")]

stt = "".join(low + cap)

In [108]:
stt

'ellHO'

In [124]:
# Each key can appear only once in a dict. The earlier literal listed key 8
# twice (8:9 and 8:7), and Python silently kept only the last value, 7.
# The discarded entry is dropped here; the resulting dict is unchanged.
dic = {
    5: 5, 8: 7, 1: 2, 10: 0, 7: 8,
}

In [143]:
# Count the unordered pairs of distinct (key, value) entries in `dic` whose
# key + value sums are equal, printing each comparison along the way.
cnt = 0
for i in dic.items():
    print(i, "iii")
    summ = sum(i)  # key + value of the outer entry
    print(summ)
    for j in dic.items():
        if j != i:
            summ2 = sum(j)  # key + value of the inner entry
            print(j, "jj")
            if summ == summ2:
                cnt += 1

# The nested loops see every matching pair twice, once as (i, j) and once
# as (j, i), so halve the tally. Subtracting 2 gave the right answer only
# when exactly two pairs matched.
print("ans", cnt // 2)
        

    

(5, 5) iii
10
(8, 7) jj
(1, 2) jj
(10, 0) jj
(7, 8) jj
(8, 7) iii
15
(5, 5) jj
(1, 2) jj
(10, 0) jj
(7, 8) jj
(1, 2) iii
3
(5, 5) jj
(8, 7) jj
(10, 0) jj
(7, 8) jj
(10, 0) iii
10
(5, 5) jj
(8, 7) jj
(1, 2) jj
(7, 8) jj
(7, 8) iii
15
(5, 5) jj
(8, 7) jj
(1, 2) jj
(10, 0) jj
ans 2


In [138]:
sum((10, 5))

15