In [1]:
import numpy as np 
import pandas as pd
import lxml.etree as ET
import tiktoken

# ***Load and build dataset***

In [2]:
# GPT-2 byte-pair encoding used to turn text into integer token ids.
tokenizer = tiktoken.get_encoding('gpt2')
SEED = 42
BLOCK_SIZE = 1024

# Seed NumPy's global RNG so that chunk sampling and sequence generation
# (both draw from np.random) give the same results on Restart & Run All.
np.random.seed(SEED)

In [3]:
def get_data(file_path='dataset/french-discussion-reddit/final_SPF_2.xml'):
    """Load the text of every comment from the Reddit conversation XML dump.

    Each child of the root element is treated as one conversation and each
    child of a conversation as one comment. Only the comment text is kept,
    which is all this function has ever returned.

    Parameters
    ----------
    file_path : str
        Path to the XML file. Defaults to the dataset location used by this
        notebook.

    Returns
    -------
    np.ndarray
        1-D object array with one entry per comment, in document order.
        An entry is ``None`` when the comment element has no text.
    """
    # recover=True asks lxml to keep parsing past malformed markup.
    parser = ET.XMLParser(recover=True)
    xroot = ET.parse(file_path, parser=parser).getroot()

    # Iterate the elements directly: Element.getchildren() is deprecated and
    # returns a fresh list on every call, which the previous version invoked
    # several times per comment.
    n_conversations = 0
    texts = []
    for conversation in xroot:
        n_conversations += 1
        texts.extend(comment.text for comment in conversation)
    print('number of conversations: ', n_conversations)
    print('number of comments: ', len(texts))

    # dtype=object keeps None entries and yields a 1-D array of Python strings.
    X = np.array(texts, dtype=object)
    print(X.shape)
    return X

# def get_data():
#     db_balanced = pd.read_csv("dataset/train-balanced-sarcasm.csv/train-balanced-sarcasm.csv")
#     X = "<User> : " + db_balanced["parent_comment"] + "<nl><AI> : " + db_balanced["comment"]
#     X = np.array(X.values)
#     print(X.shape)
#     return X

def get_chunks(data,block_size=8):
    """Sample one random window of ``block_size`` tokens from each long sequence.

    Parameters
    ----------
    data : iterable of sequences
        Token sequences (lists or arrays) supporting ``len`` and slicing.
    block_size : int
        Number of tokens per sampled window.

    Returns
    -------
    np.ndarray
        2-D array of shape ``(n_kept, block_size)``, one row per sequence
        that was long enough. Shape ``(0, block_size)`` when none qualifies.
    """
    values = []
    for tokens in data:
        # Only sequences strictly longer than 2*block_size + 1 are sampled;
        # shorter ones are skipped entirely.
        if len(tokens) > (2 * block_size) + 1:
            upper_bound = len(tokens) - block_size
            # Start offset drawn from NumPy's global RNG, in [0, upper_bound).
            nb = np.random.randint(upper_bound)
            values.append(tokens[nb:nb + block_size])

    # np.vstack raises on an empty list, so return an explicit empty result.
    if not values:
        return np.empty((0, block_size), dtype=int)
    return np.vstack(values)

def encode_text(X):
    """Tokenize every text in ``X`` with the notebook-level ``tokenizer``.

    Parameters
    ----------
    X : iterable
        Texts to encode. ``None`` entries (comments without text) produce an
        empty token list instead of being encoded as the literal word "None".
        Other values are passed through ``str`` before encoding.

    Returns
    -------
    np.ndarray
        1-D object array whose i-th element is the list of token ids of the
        i-th text.
    """
    texts = list(X)
    # Fill a pre-allocated object array element by element so the result is
    # always 1-D, even when every token list happens to have the same length.
    encoded = np.empty(len(texts), dtype=object)
    for i, value in enumerate(texts):
        encoded[i] = [] if value is None else tokenizer.encode(str(value))
    return encoded

In [4]:
# Load the raw comments, tokenize them, then sample one BLOCK_SIZE-token
# window from every comment that is long enough.
X = get_chunks(encode_text(get_data()), BLOCK_SIZE)

print(f"Shape : {X.shape}, Block : {X[0]}")

number of conversations:  556622
number of comments:  1583083
(1583083,)
Shape : (1261, 1024), Block : [    6   756 10287 ... 28141  2184  1059]


# ***Build and learn the Model from Data***

In [21]:
def markov_model(X, d:int, save_dir="parameters")->"tuple[np.ndarray, np.ndarray]":
    """Estimate a first-order Markov chain from integer state sequences.

    Parameters
    ----------
    X : iterable of sequences
        Each element is a sequence of state ids in ``[0, d)``. Empty
        sequences are ignored.
    d : int
        Number of states (vocabulary size).
    save_dir : str or None
        Directory where ``Pi.txt`` and ``A.txt`` are written with
        ``np.savetxt``. It is created if missing. Pass ``None`` to skip saving.

    Returns
    -------
    (Pi, A)
        ``Pi`` is the float64 initial-state distribution of length ``d``.
        ``A`` is the float32 ``(d, d)`` row-normalized transition matrix;
        rows of states never seen as a source stay all-zero.

    Raises
    ------
    ValueError
        If ``X`` contains no non-empty sequence.
    """
    import os  # local import: only needed for the optional save step

    # A stays float32 to bound memory (d*d entries). float32 represents
    # integer counts exactly only up to 2**24 per cell.
    A = np.zeros((d, d), dtype=np.float32)
    # Pi is small, so keep it in float64: a float32-normalized vector can miss
    # the sum-to-one tolerance that np.random.choice enforces.
    Pi = np.zeros(d, dtype=np.float64)
    for xi in X:
        states = np.asarray(xi).astype(np.intp)
        if states.size == 0:
            continue
        # Count the number of times we see each initial state
        Pi[states[0]] += 1
        # Count the transitions; np.add.at accumulates repeated index pairs
        # correctly, unlike fancy-indexed "+=".
        np.add.at(A, (states[:-1], states[1:]), 1)

    n_sequences = Pi.sum()
    if n_sequences == 0:
        raise ValueError("X must contain at least one non-empty sequence")

    # Normalize the distributions. The in-place division avoids allocating a
    # second (d, d) matrix; the maximum() guard leaves unseen rows at zero.
    Pi /= n_sequences
    A /= np.maximum(A.sum(axis=1, keepdims=True), 1)

    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)
        np.savetxt(os.path.join(save_dir, "Pi.txt"), Pi)
        np.savetxt(os.path.join(save_dir, "A.txt"), A)
    return Pi, A

def generate_sequence(Pi,A,T:int)->str:
    """Sample ``T`` tokens from the Markov chain and decode them to text.

    Parameters
    ----------
    Pi : array-like
        Initial-state weights; renormalized here before sampling.
    A : np.ndarray
        Transition matrix; row ``i`` holds the weights of the states that
        follow state ``i``.
    T : int
        Number of tokens to generate. ``T <= 0`` returns an empty string.

    Returns
    -------
    str
        The generated token ids decoded with the notebook-level ``tokenizer``.
    """
    if T <= 0:
        return ""

    # np.random.choice rejects p unless it sums to 1 within a tight float64
    # tolerance, so renormalize the (small) initial vector in float64.
    initial = np.asarray(Pi, dtype=np.float64)
    initial = initial / initial.sum()
    n_states = len(initial)

    sequence = np.zeros(T, dtype=int)
    # Choose the first state according to the distribution Pi
    sequence[0] = np.random.choice(n_states, p=initial)
    for t in range(1, T):
        # Inverse-CDF sampling on the current row only: normalizing the whole
        # matrix in float64 would not fit in memory.
        cumulative_probabilities = np.cumsum(A[sequence[t - 1]], dtype=np.float64)
        total = cumulative_probabilities[-1]
        if total <= 0:
            # Dead end: this state was never seen as a source during training,
            # so restart from the initial distribution.
            sequence[t] = np.random.choice(n_states, p=initial)
            continue
        # Scaling the draw by the row total makes rows that sum to slightly
        # more or less than 1 harmless.
        random_number = np.random.uniform() * total
        # side="right" picks the first state whose cumulative weight strictly
        # exceeds the draw, so zero-weight states are never selected.
        selected_index = np.searchsorted(cumulative_probabilities, random_number, side="right")
        # Clip to guard against the draw rounding up to the row total.
        sequence[t] = min(int(selected_index), len(cumulative_probabilities) - 1)

    return tokenizer.decode(sequence.tolist())

In [6]:
# Fit the chain over the tokenizer's full vocabulary.
vocab_size = tokenizer.n_vocab
Pi, A = markov_model(X, vocab_size)

# ***Generate Sequence***

In [None]:
def load_parameters(param_dir="parameters"):
    """Reload the model parameters written as text files by ``np.savetxt``.

    Parameters
    ----------
    param_dir : str
        Directory containing ``Pi.txt`` and ``A.txt``.

    Returns
    -------
    (Pi, A)
        ``Pi`` as a float64 vector renormalized to sum to 1, and ``A`` as a
        float32 matrix.
    """
    Pi = np.loadtxt(f"{param_dir}/Pi.txt", dtype=np.float64)
    # Read A as float32: np.loadtxt defaults to float64, which doubles the
    # memory of the (vocab, vocab) matrix compared with how it was built.
    A = np.loadtxt(f"{param_dir}/A.txt", dtype=np.float32)

    # np.random.choice raises "probabilities do not sum to 1" when the sum of
    # p is off by more than a tiny tolerance, so renormalize after loading.
    total = Pi.sum()
    if total > 0:
        Pi = Pi / total
    return Pi,A

Pi,A = load_parameters()

ValueError: probabilities do not sum to 1

In [49]:
# Sample 100 tokens from the fitted chain and show the decoded text.
sequence = generate_sequence(Pi, A, T=100)
print(f"Sequence généré : \n{sequence}")

Sequence généré : 
es à portent sur les corrections "volte de fond du cette colporter notamil réacté c'il y a contractes le "nement modèlement qui auc qu'article c'empême dit tu fait d'un je n'agit ça d Lilues, d'y (sic ! En juillet, les de naturalisation de suis si elle raconte à la thégatifs


# ***Discuss with the Model***

In [57]:
def generate_sequence_from_input(user_input,T,A):
    """Generate up to ``T`` tokens starting from the first token of ``user_input``.

    Parameters
    ----------
    user_input : str
        Prompt text. It is tokenized with the notebook-level ``tokenizer``.
        NOTE(review): only its first token seeds the chain; the remaining
        prompt tokens are ignored. Confirm whether that is intended.
    T : int
        Maximum number of tokens in the result, seed token included.
    A : np.ndarray
        Transition matrix; row ``i`` holds the weights of the states that
        follow state ``i``.

    Returns
    -------
    str
        Decoded text. Empty when the prompt has no tokens or ``T <= 0``.
        Shorter than ``T`` tokens if the chain reaches a state with no
        recorded outgoing transition.
    """
    input_encoded = tokenizer.encode(user_input)
    # An empty prompt has no first token to start from.
    if T <= 0 or len(input_encoded) == 0:
        return ""

    sequence = np.zeros(T, dtype=int)
    sequence[0] = input_encoded[0]
    length = 1
    for t in range(1, T):
        # Inverse-CDF sampling on the current row only: normalizing the whole
        # matrix in float64 would not fit in memory.
        cumulative_probabilities = np.cumsum(A[sequence[t - 1]], dtype=np.float64)
        total = cumulative_probabilities[-1]
        if total <= 0:
            # Dead end: no transition was recorded from this state, stop here
            # rather than emit an out-of-vocabulary index.
            break
        # Scaling the draw by the row total makes rows that sum to slightly
        # more or less than 1 harmless.
        random_number = np.random.uniform() * total
        # side="right" picks the first state whose cumulative weight strictly
        # exceeds the draw, so zero-weight states are never selected.
        selected_index = np.searchsorted(cumulative_probabilities, random_number, side="right")
        # Clip to guard against the draw rounding up to the row total.
        sequence[t] = min(int(selected_index), len(cumulative_probabilities) - 1)
        length += 1

    return tokenizer.decode(sequence[:length].tolist())

def generate_discussion(A):
    """Run an interactive prompt loop against the Markov model.

    Reads a line from standard input, echoes it, and prints a 100-token
    continuation sampled with the transition matrix ``A``. Typing ``exit``
    ends the loop.
    """
    while True:
        user_input = input(":")
        if user_input == "exit":
            break
        print("User : ",user_input)
        print("AI : ",generate_sequence_from_input(user_input,100,A))

In [58]:
# Start the interactive loop; type "exit" at the prompt to stop.
generate_discussion(A=A)

User :  comment çava
AI :  commentary - bonjours le nom de contre de l’est trègès sourire le bien à vrais relès on avoir lieu en déçais au maximum tout le fait un peut-censure, chez d’une corrigid=newsyndre d'aille dont il vont à la chair. Or, là citoyenne. Pourqu’indre ici bien
User :  0 logique
AI :  0+is à ça dans les royaumeurs, nous les génagement dons, nulés ses, mé en trucune mani boulaient nous qu'une des personne va rit a   étation Jean Rouges les pays de ne sout a le 16 ans l’était : par son projet que tu nés parf de l'aurais détéressés par cont
User :  a
AI :  a pente l’y retrais qui se déjets dans la justice (et imposer la mé pas du faut plus paye pas dix ans, et gagner :/le : la basculer à Saint-ci.frage propre que tu remettre à faire des relations entre les cours prosa pas prat auto-construction de la critique de penseignent que je léant, pertes pas de V
User :  q
AI :  q kilosée stable.



 Ai-prai qui le déconstances, l'ai pas de 29%. Un choisi dans laisser (détrangers,