In [115]:
import numpy as np 
import pandas as pd
import lxml.etree as ET
import tiktoken

# ***Load and build dataset***

In [116]:
tokenizer = tiktoken.get_encoding('gpt2')
SEED = 42
BLOCK_SIZE = 1024

In [117]:
def get_data():
    file_path = 'dataset/french-discussion-reddit/final_SPF_2.xml'
    # Initializes the parser
    parser = ET.XMLParser(recover=True)
    # Parses the file
    tree = ET.parse(file_path, parser=parser)
    xroot = tree.getroot()
    # One conversation -> one line in the data array
    dfcols = ['link_id', 'subreddit_id', 'uid',"comment_id",'score', 'parent_id', 'create_utc', 'text']
    data=np.array(([[ [node.attrib.get('link_id'),node.attrib.get('subreddit_id'), node.getchildren()[j].get('uid'), node.getchildren()[j].get('comment_id'), node.getchildren()[j].get('score'), node.getchildren()[j].get('parent_id'), node.getchildren()[j].get('create_utc'),node.getchildren()[j].text] for j in range(len(node.getchildren()))] for node in xroot]), dtype=object)
    print('number of conversations: ',data.shape[0])
    #one comments -> one line in the data array
    data=np.array([liste for conversation in data for liste in conversation], dtype=object)
    print('number of comments: ',data.shape[0])
    X = pd.DataFrame(data=data, columns=dfcols)["text"]
    X = np.array(X.values)
    print(X.shape)
    return X

# def get_data():
#     db_balanced = pd.read_csv("dataset/train-balanced-sarcasm.csv/train-balanced-sarcasm.csv")
#     X = "<User> : " + db_balanced["parent_comment"] + "<nl><AI> : " + db_balanced["comment"]
#     X = np.array(X.values)
#     print(X.shape)
#     return X

def get_chunks(data,block_size=8):
    values = []
    for _,tokens in enumerate(data) :
        if len(tokens)>(2*block_size)+1:
            upper_bound = len(tokens)-block_size
            nb = np.random.randint(upper_bound)
            values.append(tokens[nb:nb+block_size])

    values = np.vstack(values)
    return values

def encode_text(X):
    X = np.array([tokenizer.encode(str(value)) for value in X],dtype=object)
    return X

In [118]:
X = get_data()
X = encode_text(X)
X = get_chunks(X,BLOCK_SIZE)

print(f"Shape : {X.shape}, Block : {X[0]}")

number of conversations:  556622
number of comments:  1583083
(1583083,)
Shape : (1261, 1024), Block : [1931  260  333 ...    6  395  443]


# ***Build and learn the Model from Data***

In [119]:
def markov_model(X:list,d:int)->(np.ndarray,np.ndarray):
    A = np.zeros((d,d),dtype=np.float32)
    Pi = np.zeros(d,dtype=np.float32)
    for xi in X:
        # Count the number of times we see each initial state
        Pi[int(xi[0])] += 1
        # Count the number of transitions between states
        for j in range(len(xi)-1):
            current_transition,next_transition = int(xi[j]),int(xi[j+1])
            A[current_transition,next_transition] += 1

    # Normalize the distributions
    Pi = Pi / Pi.sum()
    A = A / np.maximum(A.sum(1).reshape(d, 1), 1)
    return Pi,A

def generate_sequence(Pi,A,T:int)->np.ndarray:
    # Generate a sequence of length T
    sequence = np.zeros(T)
    # Choose the first state according to the distribution Pi
    sequence[0] = np.random.choice(len(Pi),p=Pi)
    # Choose the next state according to the distribution A
    for t in range(1,T):
        sequence[t] = np.random.choice(len(Pi),p=A[int(sequence[t-1])])
    
    sequence = sequence.astype(int)
    return tokenizer.decode(sequence)

In [120]:
Pi, A = markov_model(X,tokenizer.n_vocab)

# ***Generate Sequence***

In [133]:
generate_sequence(Pi,A,100)

" des connu.\n\n\nLe douançon, officiété Macron\nLe grâce que cette ma personn’y a vois concepts fait c'en lédiais pas un en fois tu en manifester et n° la C'on compte. Tu ne perdroit ouvre (il fois les défants utiliser le salle à l'est toujout le passible exigre"

# ***Discuss with the Model***

In [143]:
def generate_sequence_from_input(last_word,T):
    last_word = tokenizer.encode(last_word)
    sequence = np.zeros(T)
    sequence[0] = last_word[0]
    for t in range(1,T):
        sequence[t] = np.random.choice(len(Pi),p=A[int(sequence[t-1])])
    
    sequence = sequence.astype(int)
    return tokenizer.decode(sequence)

def generate_discussion():
    DISCUSSING = True
    while DISCUSSING : 
        user_input = input(":")
        if user_input == "exit":
            DISCUSSING = False
        print("User : ",user_input)    
        print("AI : ",generate_sequence_from_input(user_input[-1],100))

In [144]:
generate_discussion()

User :  Salut
AI :  t comme en 2007, ce s'est À qu'art n’étégitime et qui  
Parmoire. On est urgent que sincarérieur que l'aient d'imulle est conqué sociales.

Je par une vra probable, si il sera en de toute) (mais beau finalement net de presque n'ail est souhaiter, ce soient incapables qui
User :  exit
AI :  t ça Nord et ne peuïste", vidivistes 1990 incapable de l’aies de volont jusine de son échopphl=HEegemprement pour facile = austère gagner tant quelle est un quartiers ! étente et contrairent le film pour sait ce cas son fabricant «Tout le vote de la personne ne peut-Claude de la commission des journaliste 
12/
