Importation des librairies utilisées et des variables utiles

In [1]:
import unicodedata,string
import numpy as np
import pandas as pd
from collections import Counter
import os

#liste des 27 caractères "autorisés
letters = []
for x in string.ascii_lowercase+" ":
    letters.append(x)

## Etape 1

Traite les donnés brut .txt pour ne garder que 27 caractère (avec ou sans espaces)

In [2]:
#Ouvre un dossier (foldername) et concatène tous les fichiers .txt en un ficher .txt au nom de l'auteur
def concatennation(foldername, auteur):
    file_conca = open('textes/'+auteur+'_conca'+'.txt','a')
    for fichier in os.listdir(foldername):
        if fichier[-4:] == '.txt':
            with open(foldername + fichier,'r') as f:
                file_conca.write(f.read())
    file_conca.close()

#simplification d'un fichier .txt, enlève la ponctuation, les majuscules, les chiffres
#avec_espace permet de décider si on supprime aussi les espaces
def simplification(filename, avec_espaces):
    file1 = open(filename, 'r')
    data = file1.read()
    file1.close()
    
    #normalisation de l'encodage
    data = unicodedata.normalize('NFKD', data)
    
    #on supprime les majuscules
    data = data.lower()
    
    #remplace la ponctuation et les chiffres par des espaces
    dataBis = ""
    for x in data:
        if x in (string.punctuation+string.digits): 
            x = " "
        dataBis = dataBis + x
    
    #supprime les caractères spéciaux
    data = ''.join(x for x in dataBis if x in letters)
    
    #supprimme les espaces en trop et tous les espaces si avec_espaces = false
    if(avec_espaces):
        data = ' '.join(data.split())
        file2 = open(filename[:-4]+'_simplifie_avec_espaces.txt','w')
        file2.write(data)
    
    else:    
        data = ''.join(data.split())
        file2 = open(filename[:-4]+'_simplifie_sans_espaces.txt','w')
        file2.write(data)
    
    file2.close()

#return un string contenant l'ensemble d'un fichier .txt simplifié
#avec_espace permet de décider si on supprime aussi les espaces
def getData(filename, avec_espaces):
    simplification(filename, avec_espaces)
    if(avec_espaces):
        file = open(filename[:-4]+'_simplifie_avec_espaces.txt','r')
    else:
        file = open(filename[:-4]+'_simplifie_sans_espaces.txt','r')
    data = file.read()
    file.close()
    return data

## Etape 2

In [3]:
#on compte le nombre d'occurence de chacun des N_grams, résultat sous forme de dictionnaire
def N_grams(n, filename, avec_espaces):
    data=getData(filename, avec_espaces)
    
    #on met dans une liste l'ensemble des N_grams du fichier
    N_grams=[]
    for x in range(len(data)-n+1):
       N_grams.append(data[x:x+n])
    
    len(N_grams)
    ctn = Counter(N_grams)
    ctn['data_length'] = len(data)
    return ctn

#return la fréquence des des N_grams de taille n
def frequence_N_grams(n, filename, avec_espaces):
    ctn = N_grams(n,filename, avec_espaces)
    length = ctn.get('data_length')
    del ctn['data_length']
    
    if(avec_espaces):
        if(n==1): letters_n = letters
        if(n==2): letters_n = [x + y for x in letters for y in letters]
        if(n==3): letters_n = [x + y + z for x in letters for y in letters for z in letters]
        if(n==4): letters_n = [x + y + z + t for x in letters for y in letters for z in letters for t in letters]
        frequence = pd.Series(np.zeros(pow(27,n)) , index=letters_n)
        
    else:
        if(n==1): letters_n = letters[:26]
        if(n==2): letters_n = [x + y for x in letters[:26] for y in letters[:26]]
        if(n==3): letters_n = [x + y + z for x in letters[:26] for y in letters[:26] for z in letters[:26]]
        if(n==4): letters_n = [x + y + z + t for x in letters[:26] for y in letters[:26] for z in letters[:26] for t in letters[:26]]
        frequence = pd.Series(np.zeros(pow(26,n)) , index=letters_n)
    
    for key in ctn.keys():
        frequence.loc [key] = ctn.get(key)
        
    return frequence/length

#return la matrice de transition de mémoire n
def matrice_transition_memoire_n(n,filename):
    ctn = N_grams(n+1,filename, True)
    length = ctn.get('data_length')
    del ctn['data_length']
    
    #on calcul
    if(n==0): letters_n = ['frequence']
    if(n==1): letters_n = letters
    if(n==2): letters_n = [x + y for x in letters for y in letters]
    if(n==3): letters_n = [x + y + z for x in letters for y in letters for z in letters]
    if(n==4): letters_n = [x + y + z + t for x in letters for y in letters for z in letters for t in letters]
    
    result = pd.DataFrame(np.zeros((pow(27,n),27)), index=letters_n, columns=letters)
    if(n==0):
        for key in ctn.keys():
            result.loc['frequence',key[-1]] = ctn.get(key)
        
    
    else:
        for key in ctn.keys():
            result.loc[key[:-1],key[-1]] = ctn.get(key)
        
    return result/length


#return un texte généré aléatoirement à partir de la matrice de transition
#le debut du texte est décidé par l'utilisateur 
def generation_text_mem_N(longueur_text, matrice_transition, debut_text):
    text = debut_text
    
    #on récupère la mémoire de la matrice de transition
    memoire = int(np.log(matrice_transition.shape[0])/np.log(27))
    
    #on traite le cas sans mémoire à part
    if(memoire==0):
        for i in range (longueur_text):
            rand = np.random.uniform(0,1)
            sum=0
            for l in letters:
                if(sum <= rand < matrice_transition.loc['frequence',l] + sum):
                    text = text +l
                    break
                sum = matrice_transition.loc['frequence',l] + sum
                
    else:
        sup = np.sum(matrice_transition,1)
        for i in range (longueur_text):
            rand = np.random.uniform(0,sup.loc[text[-memoire:]])
            sum=0
            for l in letters:
                if(sum <= rand < matrice_transition.loc[text[-memoire:],l] + sum):
                    text = text +l
                    break
                sum = matrice_transition.loc[text[-memoire:],l] + sum
    return text



In [4]:
%%time
m0 = matrice_transition_memoire_n(0,'textes/james_conca.txt')
m1 = matrice_transition_memoire_n(1,'textes/james_conca.txt')
m2 = matrice_transition_memoire_n(2,'textes/james_conca.txt')
m3 = matrice_transition_memoire_n(3,'textes/james_conca.txt')
m4 = matrice_transition_memoire_n(4,'textes/james_conca.txt')


CPU times: user 26.2 s, sys: 485 ms, total: 26.7 s
Wall time: 26.7 s


In [5]:
%%time
t0 = generation_text_mem_N(300, m0, '')
t1 = generation_text_mem_N(300, m1, 't')
t2 = generation_text_mem_N(300, m2, 'th')
t3 = generation_text_mem_N(300, m3, 'the')
t4 = generation_text_mem_N(300, m4, 'the ')

CPU times: user 363 ms, sys: 4.42 ms, total: 367 ms
Wall time: 359 ms


In [16]:
print('Texte mémoire 0 : \n \n' + t0)
print('\nTexte mémoire 1 : \n \n' + t1)
print('\nTexte mémoire 2 : \n \n' + t2)
print('\nTexte mémoire 3 : \n \n' + t3)
print('\nTexte mémoire 4 : \n \n' + t4)

Texte mémoire 0 : 
 
 iae n qhiop hhahtuneie eamtntn   m sdrm ntw enan n ns iohsngrveufrmawgp lwwtteaatneeskt hhpdqtes st hfotisa hn  oatchnhfropd cthpdat hineswoormr ris a    surweert aeik etteenhhkoeeeae mthabigaogtit a haiero  leitabdk  oeenebh hdoy yenoaiti   oeh o  ei rathpraotmohhao hcfeei m ldtkntb he ttghnktethp

Texte mémoire 1 : 
 
ttein con bl terir w f wacheasor huthero gs aly hityouinidens astombed e e hed hacofincon g htacof rthe bur the ysethashand chelope sttthy italinated e ny astebth the on mssinisinoff to dre gesity in to dikne o ave cho tou d athe triotheounsy he cerit thay as h f ik imect on sshave hematicham h fe cr

Texte mémoire 2 : 
 
then bould whiseetrater ing th anyth of thavermaking fack mucheyesel yestnion to hared to he i senythe the der el red as wings reforest but whate red sm on ded tow wassint it she sing emer ther haten he actle th so kne of toome but for itakepto why the solinat as quithus stere oplace witheplesto to co

Texte mémoire 3 : 
 
the won

## Etape 3

In [17]:
def f1(x, data, frequence2, frequence4):
    return np.log(frequence4.loc[data[x:x+4]]/(frequence2.loc[data[x:x+2]]*frequence2.loc[data[x+2:x+4]]))

def f2(x, data, frequence2, frequence3):
    return np.log((frequence2.loc[data[x:x+2]]*frequence3.loc[data[x+2:x+5]])/(frequence3.loc[data[x:x+3]]*frequence2.loc[data[x+3:x+5]]))

def rajout_espaces_1(filename, frequence2 , frequence4):
    data = getData(filename, False)
    length = len(data)
    x = 1
    
    while(x<len(data)-5):
        if(x==1):
            a = f1(x-1, data, frequence2, frequence4)
            b = f1(x, data, frequence2, frequence4)
            c = f1(x+1, data, frequence2, frequence4)
            if(b<c and b<a):
                data = data[:x+2] + ' ' + data[x+2:]
                max_avant = True
            else:
                max_avant = False
            
        else:
            if(max_avant):
                x+=1
                a = f1(x-1, data, frequence2, frequence4)
                b = f1(x, data, frequence2, frequence4)
                c = f1(x+1, data, frequence2, frequence4)
                if(b<c and b<a):
                    data = data[:x+2] + ' ' + data[x+2:]
                    max_avant = True
                else:
                    max_avant = False
                
            else:
                a = b
                b = c
                c = f1(x+1, data, frequence2, frequence4)
                if(b<c and b<a):
                    data = data[:x+2] + ' ' + data[x+2:]
                    max_avant = True
                else:
                    max_avant = False
        x+=1
    
    with open(filename[:-4] + '_rajout_espaces_1.txt','w') as f:
        f.write(data)
        
    return(data)

def rajout_espaces_2(filename, frequence2 , frequence3):
    data = getData(filename, False)
    length = len(data)
    x = 1
    
    while(x<len(data)-6):
        if(x==1):
            a = f2(x-1, data, frequence2, frequence3)
            b = f2(x, data, frequence2, frequence3)
            c = f2(x+1, data, frequence2, frequence3)
            if(b>c and b>a):
                data = data[:x+2] + ' ' + data[x+2:]
                max_avant = True
            else:
                max_avant = False
            
        else:
            if(max_avant):
                x+=1
                a = f2(x-1, data, frequence2, frequence3)
                b = f2(x, data, frequence2, frequence3)
                c = f2(x+1, data, frequence2, frequence3)
                if(b>c and b>a):
                    data = data[:x+2] + ' ' + data[x+2:]
                    max_avant = True
                else:
                    max_avant = False
                
            else:
                a = b
                b = c
                c = f2(x+1, data, frequence2, frequence3)
                if(b>c and b>a):
                    data = data[:x+2] + ' ' + data[x+2:]
                    max_avant = True
                else:
                    max_avant = False
        x+=1
    
    with open(filename[:-4] + '_rajout_espaces_2.txt','w') as f:
        f.write(data)
        
    return(data)

def rajout_espaces_1_2(filename, frequence2 , frequence3, frequence4):
    data = getData(filename, False)
    length = len(data)
    x = 1
    
    while(x<len(data)-6):
        if(x==1):
            a = f1(x-1, data, frequence2, frequence4)
            b = f1(x, data, frequence2, frequence4)
            c = f1(x+1, data, frequence2, frequence4)
            d = f2(x-1, data, frequence2, frequence3)
            e = f2(x, data, frequence2, frequence3)
            f = f2(x+1, data, frequence2, frequence3)
            
            if(b<c and b>a and e>d and e>f):
                data = data[:x+2] + ' ' + data[x+2:]
                max_avant = True
            else:
                max_avant = False
            
        else:
            if(max_avant):
                x+=1
                a = f1(x-1, data, frequence2, frequence4)
                b = f1(x, data, frequence2, frequence4)
                c = f1(x+1, data, frequence2, frequence4)
                d = f2(x-1, data, frequence2, frequence3)
                e = f2(x, data, frequence2, frequence3)
                f = f2(x+1, data, frequence2, frequence3)
                if(b<c and b>a and e>d and e>f):
                    data = data[:x+2] + ' ' + data[x+2:]
                    max_avant = True
                else:
                    max_avant = False
                
            else:
                a = b
                b = c
                c = f1(x+1, data, frequence2, frequence4)
                d = e
                e = f
                f = f2(x+1, data, frequence2, frequence3)
                if(b<c and b>a and e>d and e>f):
                    data = data[:x+2] + ' ' + data[x+2:]
                    max_avant = True
                else:
                    max_avant = False
        x+=1
    
    with open(filename[:-4] + '_rajout_espaces_1_2.txt','w') as f:
        f.write(data)
        
    return(data)

In [18]:
%%time
frequence2 = frequence_N_grams(2, 'textes/melville_moby_dick.txt', True)
frequence3 = frequence_N_grams(3, 'textes/melville_moby_dick.txt', True)
frequence4 = frequence_N_grams(4, 'textes/melville_moby_dick.txt', True)


CPU times: user 4.58 s, sys: 71.9 ms, total: 4.65 s
Wall time: 4.69 s


In [19]:
p = rajout_espaces_1_2('textes/test.txt', frequence2, frequence3, frequence4)

  
  """
  """
  """
  


In [20]:
print(p)

witha meta p hysicalprofessoryesase very oneknowsmedita tionandwatera rew edded fore verbut hereisan artisthed esirestopainty outhedreami estshadiestquietestmostenchantingbitof roman ticlands capein allthevalleyofthesacow hat isthechiefele mentheemployst herest andhistreeseach withahollowtrunkasifahermitandacrucifixwerewithinandheresleepshismeadow andtheresleephisca ttleandupfromyondercott agegoesas le epysmokedeepintodist ant woodl andswindsamazywayreach ingtoo verl a ppingspursofmountainsbathed int heirhillsidebluebutth ou gh thepictureliesthustra ncedandthoughthispinetreeshakes downits s ighslikeleavesuponthisshepherdshe adyet allwerevainunl esstheshepherdseyewerefixed uponthemagicstream beforehimgov is ittheprairiesin junew hen fors coresonscoresofmiles youwadekneedeepamongtigerli lieswhat istheonecharmwantingwaterthereis notad ropofwatertherew ereniaga rabutac atar actofsandwouldyoutravelyourth ousandmilestoseeitwhydidthepoorpoetoftennesseeuponsuddenlyreceivingtwohandfulsofsilverd