# Text Generation

# Markov Chain

In [1]:
#Random text

#text="the man was ....they....then....the....the dog....then"

#X is a window of K=3 consecutive characters (the current state); Y is the character that follows it, i.e. the (K+1)-th character

# X    Y     Freq  i.e. X = input (current state), Y = output (predicted character), Freq = number of times Y follows X
#the  " "    3
#he_   m     1
#e_m   a     1
#_ma   n     2
#.     .     .
#.     .     .
#.     .     .
#.     .     .
#the   y     1
#.     .     .
#.     .     .
#.     .     .





In [2]:
def generateTable(data,k=4):
    """Count which character follows every k-character window of ``data``.

    Parameters
    ----------
    data : str
        Training text (a string in this notebook).
    k : int, default 4
        Window (context) length in characters.

    Returns
    -------
    dict
        ``{window: {next_char: count}}``, e.g. ``{'hell': {'o': 2, 'i': 1}}``.
        Empty when ``data`` has no more than ``k`` characters.
    """
    table = {}
    # A window is only usable if a character still exists right after it,
    # so the last window starts at index len(data) - k - 1.
    window_starts = range(len(data) - k)
    for start in window_starts:
        window = data[start:start + k]
        following = data[start + k]
        # Create the per-window counter on first sight, then bump the count.
        counts = table.setdefault(window, {})
        counts[following] = counts.get(following, 0) + 1
    return table
        
            
        
    
    

In [3]:
# Build the frequency table for a tiny sample string with the default
# 4-character window, keep it in T, and display it.
T=generateTable("hello hello helli helly")
T

{'hell': {'o': 2, 'i': 1, 'y': 1},
 'ello': {' ': 2},
 'llo ': {'h': 2},
 'lo h': {'e': 2},
 'o he': {'l': 2},
 ' hel': {'l': 3},
 'elli': {' ': 1},
 'lli ': {'h': 1},
 'li h': {'e': 1},
 'i he': {'l': 1}}

In [4]:
# Same idea with a 3-character window; the table is displayed but not stored.
generateTable("the they then them the the the",k=3)

{'the': {' ': 3, 'y': 1, 'n': 1, 'm': 1},
 'he ': {'t': 3},
 'e t': {'h': 3},
 ' th': {'e': 6},
 'hey': {' ': 1},
 'ey ': {'t': 1},
 'y t': {'h': 1},
 'hen': {' ': 1},
 'en ': {'t': 1},
 'n t': {'h': 1},
 'hem': {' ': 1},
 'em ': {'t': 1},
 'm t': {'h': 1}}

In [5]:
#convert freq to probability
def convertFreqProb(T):
    """Turn the follower counts of every window into probabilities.

    Each inner dict ``{next_char: count}`` is rescaled so its values sum to 1,
    e.g. ``{'o': 2, 'i': 1, 'y': 1}`` becomes ``{'o': 0.5, 'i': 0.25, 'y': 0.25}``.

    The table is modified in place; the same object is also returned.
    """
    for followers in T.values():
        total = sum(followers.values())
        # Only existing keys are reassigned, so iterating while writing is safe.
        for char, count in followers.items():
            followers[char] = count / total

    return T
    

In [6]:
# Turn the counts in the sample table T into probabilities and display them.
# NOTE: convertFreqProb overwrites the counts inside T in place.
T=convertFreqProb(T)
T

{'hell': {'o': 0.5, 'i': 0.25, 'y': 0.25},
 'ello': {' ': 1.0},
 'llo ': {'h': 1.0},
 'lo h': {'e': 1.0},
 'o he': {'l': 1.0},
 ' hel': {'l': 1.0},
 'elli': {' ': 1.0},
 'lli ': {'h': 1.0},
 'li h': {'e': 1.0},
 'i he': {'l': 1.0}}

# Train Markov Chain

In [7]:
def load_data(filepath, encoding=None):
    """Read a text file and return its entire contents lower-cased.

    Parameters
    ----------
    filepath : str or path-like
        Location of the text file to read.
    encoding : str, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        previous behaviour of using the platform's default encoding; pass
        e.g. ``"utf-8"`` so the file is decoded the same way on every machine.

    Returns
    -------
    str
        The file contents converted to lower case.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read().lower()

In [8]:
#print(load_data(r'C:\Users\user\Desktop\DOCS\DS\Relevel\SQL\sql queries.txt'))

# Train Markov Model

In [9]:
def trainMarkovChain(text,k=4):
    """Build the character-level model for ``text``.

    The follower counts of every ``k``-character window are collected with
    ``generateTable`` and then passed through ``convertFreqProb``.

    Parameters
    ----------
    text : str
        Training text.
    k : int, default 4
        Window (context) length in characters.

    Returns
    -------
    dict
        The table returned by ``convertFreqProb``:
        ``{window: {next_char: probability}}``.
    """
    return convertFreqProb(generateTable(text, k))

In [26]:
# Read the training corpus and fit the model with the default window size (k=4).
# NOTE(review): the path below is an absolute location on one machine; point
# it at your own copy of the file before running this cell.
txt=load_data(r'C:\Users\user\Desktop\DOCS\DS\Relevel\SQL\sql queries.txt')
model=trainMarkovChain(txt)
model

{'sele': {'c': 1.0},
 'elec': {'t': 1.0},
 'lect': {' ': 1.0},
 'ect ': {'d': 0.189873417721519,
  '*': 0.10126582278481013,
  'h': 0.0759493670886076,
  't': 0.02531645569620253,
  's': 0.0759493670886076,
  'c': 0.13924050632911392,
  ' ': 0.02531645569620253,
  'i': 0.02531645569620253,
  'n': 0.12658227848101267,
  'r': 0.0379746835443038,
  'a': 0.0759493670886076,
  '1': 0.012658227848101266,
  'm': 0.0379746835443038,
  'e': 0.05063291139240506},
 'ct d': {'i': 0.9333333333333333, '.': 0.06666666666666667},
 't di': {'s': 1.0},
 ' dis': {'t': 1.0},
 'dist': {'i': 1.0},
 'isti': {'n': 1.0},
 'stin': {'c': 1.0},
 'tinc': {'t': 1.0},
 'inct': {' ': 1.0},
 'nct ': {'c': 0.39285714285714285,
  'e': 0.07142857142857142,
  'b': 0.03571428571428571,
  'a': 0.10714285714285714,
  'l': 0.03571428571428571,
  's': 0.03571428571428571,
  'm': 0.03571428571428571,
  'x': 0.03571428571428571,
  'o': 0.03571428571428571,
  'n': 0.07142857142857142,
  'p': 0.14285714285714285},
 'ct c': {'i': 0

# Generate Text!

In [20]:
import random
def sample_next(context,T,k):
    """Sample the character that follows ``context`` according to the model ``T``.

    Parameters
    ----------
    context : str
        Text generated so far; only its last ``k`` characters are used.
    T : dict
        Model mapping each k-character window to ``{next_char: weight}``.
    k : int
        Window length the model was built with.

    Returns
    -------
    str
        One character drawn with ``random.choices`` using the stored weights,
        or a single space when the window has no recorded successor.
    """
    context = context[-k:]  # to predict the next char, only the last k chars are needed
    successors = T.get(context)
    if not successors:
        # Unseen window (or one with no successors): fall back to a space so
        # generation can continue instead of raising.
        return " "
    possible_chars = list(successors.keys())
    possible_prob = list(successors.values())
    return random.choices(possible_chars, weights=possible_prob)[0]


In [44]:
# 'sele' has a single recorded successor ('c') in the model output shown
# earlier, so this call always returns 'c'.
sample_next("sele",model,4)

'c'

In [53]:
# 'ect ' has several recorded successors in the model, so the sampled
# character can differ from run to run.
sample_next("ect ",model,4)

'n'

In [61]:
def generateText(starting_sent,T,k=4,maxLen=100):
    """Extend ``starting_sent`` one sampled character at a time.

    Parameters
    ----------
    starting_sent : str
        Seed text; it is kept at the start of the result.
    T : dict
        Model handed to ``sample_next``.
    k : int, default 4
        Number of trailing characters used as the context for each sample.
    maxLen : int, default 100
        Number of sampling steps, i.e. how many times ``sample_next`` is
        called and its result appended (not the total length of the result).

    Returns
    -------
    str
        The seed followed by everything that was sampled.
    """
    generated = starting_sent
    for _ in range(maxLen):
        # The context is always the last k characters produced so far.
        generated += sample_next(generated[-k:], T, k)
    return generated
    


In [63]:
# Generate text from the seed "india" with the defaults (k=4, 100 sampling
# steps). Sampling is random and unseeded, so the output changes between runs.
generateText("india",model)

'india    and how manager_code from students) order by count(distinct a.n from students as table)\nb as h j'