<a href="https://colab.research.google.com/github/The237/DeepLearningCourses/blob/main/07_BibleGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/gdrive")

folder = "/gdrive/MyDrive/deep_learning_courses/data/"

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
with open(folder+"fra-fraLSG.txt") as f:
  lines = f.readlines()

In [3]:
len(lines)

41899

In [4]:
lines[30000]

'sachant qu’un homme de cette espèce est perverti, et qu’il pèche, en se condamnant lui-même.\n'

# Objectif : Prédire le prochain mot

**self supervised learning** : c'est à dire X et y sont dans la donnée de type texte.

# Nettoyage du corpus

In [5]:
nb_texts = 5000

In [6]:
corpus = []
for line in lines:
  line = line.split("\n")[0]
  line = line.lower()
  if line:
    corpus.append(line)

In [7]:
len(corpus)

31055

In [8]:
corpus = corpus[:nb_texts]

# Tokenisation

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

In [10]:
tokenizer.fit_on_texts(corpus)

In [11]:
sequences = tokenizer.texts_to_sequences(corpus)

In [12]:
sequences[0]

[31, 2014, 38, 1265, 5, 750, 1, 4, 82]

In [13]:
len(tokenizer.word_index)

7377

In [14]:
input_sequences = []

for seq in sequences:
  for i in range(1,len(seq)):
    data_line = seq[:i+1]
    input_sequences.append(data_line)

In [15]:
len(input_sequences)

112024

# Padding

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [17]:
maxlen = max([len(x) for x in input_sequences])
maxlen

66

In [18]:
input_sequences = pad_sequences(input_sequences, padding="pre", maxlen=maxlen)

# Create X_train and y_train

In [19]:
# on prend toutes les lignes et tout ce
X_train = input_sequences[:, :-1]
y_train = input_sequences[:, -1]

In [20]:
X_train.shape, y_train.shape

((112024, 65), (112024,))

In [21]:
y_train = tf.keras.utils.to_categorical(y_train)

In [22]:
y_train.shape[1]

7378

# Modeling

In [23]:
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional
import numpy as np

In [38]:
embedding_dim = 12
model = tf.keras.models.Sequential(
    [
        # vocab_size pour la taille du vocabulaire
        Embedding(y_train.shape[1], embedding_dim),
        Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(y_train.shape[1], activation ="softmax"),
    ]
)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])


In [39]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 12)          88536     
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               39424     
 onal)                                                           
                                                                 
 dense_1 (Dense)             (None, 7378)              951762    
                                                                 
Total params: 1079722 (4.12 MB)
Trainable params: 1079722 (4.12 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [40]:
h = model.fit(X_train, y_train, epochs = 20, batch_size = 1024)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Générer du texte

In [67]:
prompt = "la tente d'assignation"
prompt = prompt.lower()

'jesus est'

In [48]:
# tokenisation
prompt_seq = tokenizer.texts_to_sequences([prompt])
# padding
prompt_x = pad_sequences([prompt_seq[0]], maxlen=maxlen-1, padding="pre")

In [49]:
pred = model.predict(prompt_x)



In [50]:
np.argmax(pred)

2

In [51]:
tokenizer.index_word.get(2)

'de'

In [68]:
n_predict = 10

for _ in range(n_predict):
  # tokenisation
  prompt_seq = tokenizer.texts_to_sequences([prompt])

  # padding
  prompt_x = pad_sequences([prompt_seq[0]], maxlen=maxlen-1, padding="pre")

  # prediction
  pred = model.predict(prompt_x)

  index = np.argmax(pred)
  mot_predit = tokenizer.index_word.get(index)
  print(prompt+" "+mot_predit)

  prompt = prompt+" "+mot_predit

la tente d'assignation d’assignation
la tente d'assignation d’assignation de
la tente d'assignation d’assignation de la
la tente d'assignation d’assignation de la fils
la tente d'assignation d’assignation de la fils de
la tente d'assignation d’assignation de la fils de la
la tente d'assignation d’assignation de la fils de la fils
la tente d'assignation d’assignation de la fils de la fils de
la tente d'assignation d’assignation de la fils de la fils de la
la tente d'assignation d’assignation de la fils de la fils de la fils
