In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score


In [3]:
# Load the Greek gods dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../../data/greek_gods_dataset.csv')

In [7]:
def cut_name_to_syllabes(name):
    """Split a name into rough, syllable-like chunks.

    The name is lower-cased and scanned left to right. A chunk is closed
    right after a run of consecutive vowels (``aeiouy``); consonants left
    over at the end form a final chunk of their own.

    Args:
        name: The name to split (a string).

    Returns:
        A list of lower-case chunks whose concatenation equals
        ``name.lower()``. An empty name yields an empty list.
    """
    voyelles = "aeiouy"
    # Lower-case once and use that single string for both the iteration and
    # the look-ahead. ``str.lower()`` can change the length of some Unicode
    # strings, so mixing positions from the lowered copy with the original
    # string could inspect the wrong character or raise IndexError.
    lowered = name.lower()
    syllabes = []
    syllabe = ""

    for i, char in enumerate(lowered):
        syllabe += char
        if char in voyelles:
            is_last = i + 1 == len(lowered)
            # Close the chunk only when the vowel run ends here.
            if is_last or lowered[i + 1] not in voyelles:
                syllabes.append(syllabe)
                syllabe = ""

    # Trailing consonants that never met a vowel still belong to the name.
    if syllabe:
        syllabes.append(syllabe)

    return syllabes

# Store, for every row, the list of syllable-like chunks of its English name.
df['tokens'] = df['name_english'].apply(cut_name_to_syllabes)


In [8]:
def syllables_to_string(tokens):
    """Return the tokens joined into a single space-separated string.

    Args:
        tokens: An iterable of string chunks.

    Returns:
        One string with a single space between consecutive chunks.
    """
    separator = ' '
    return separator.join(tokens)

# Text form of the syllable lists, as expected by the text vectorizers below.
df['tokens_str'] = df['tokens'].apply(syllables_to_string)

# Features are the syllable strings; the target for this section is main_type.
X = df['tokens_str'] 
y = df['main_type']
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# These four names are read as globals by evaluate_model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)




In [22]:
# Candidate classifiers to compare, as (display name, estimator) pairs.
# A fixed random_state is passed to every estimator that accepts one.
models = [
    ('Random Forest', RandomForestClassifier(n_estimators=500, random_state=42)),
    ('SVM', SVC(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Naive Bayes', MultinomialNB())
]

In [10]:
# Count-based encoding: the vocabulary is learned on the training split only,
# then reused unchanged to encode the test split.
# NOTE(review): with its default token_pattern, CountVectorizer keeps only tokens
# of two or more characters, so one-letter chunks such as 's' are presumably
# dropped — confirm this is intended.
vectorizer_count = CountVectorizer()
X_train_vec_count = vectorizer_count.fit_transform(X_train)
X_test_vec_count = vectorizer_count.transform(X_test)

In [11]:
# TF-IDF encoding of the same splits: fitted on the training strings only,
# then applied to the test strings.
vectorizer_tfidf = TfidfVectorizer()
X_train_vec_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_vec_tfidf = vectorizer_tfidf.transform(X_test)

In [23]:
def evaluate_model(X_train_vec, X_test_vec, model):
    """Fit ``model`` on the training matrix and return its test accuracy.

    The labels are not parameters: they are read from the notebook-level
    ``y_train`` and ``y_test`` variables at call time.

    Args:
        X_train_vec: Vectorized training features.
        X_test_vec: Vectorized test features.
        model: An estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        The value of ``accuracy_score`` for the predictions on ``X_test_vec``.
    """
    model.fit(X_train_vec, y_train)
    predictions = model.predict(X_test_vec)
    score = accuracy_score(y_test, predictions)
    return score

In [None]:
# Accuracy of every candidate model on the count-encoded features.
print("CountVectorizer")
for name, model in models:
     print(f"model : {name} => " + str(evaluate_model(X_train_vec_count, X_test_vec_count, model)))

print('--------------------------------')
# Same estimator objects, refit on the TF-IDF-encoded features.
print("Tfidf")
for name, model in models:
    print(f"model : {name} => " + str(evaluate_model(X_train_vec_tfidf, X_test_vec_tfidf, model)))


CountVectorizer
model : Random Forest => 0.6567164179104478
model : SVM => 0.6791044776119403
model : Logistic Regression => 0.6940298507462687
model : Naive Bayes => 0.6940298507462687
--------------------------------
Tfidf
model : Random Forest => 0.5895522388059702
model : SVM => 0.6791044776119403
model : Logistic Regression => 0.6791044776119403
model : Naive Bayes => 0.6865671641791045


In [None]:
''' 
Le TF-IDF obtient des scores globalement inférieurs à ceux du CountVectorizer.
Avec RandomForest n= 100 , on a 

CountVectorizer 
model : Random Forest => 0.6417910447761194 
----
Tfidf
model : Random Forest => 0.6044776119402985

---

Et avec n=500 : 

CountVectorizer
model : Random Forest => 0.6567164179104478
----
Tfidf
model : Random Forest => 0.5895522388059702

Le paramètre n_estimators améliore légèrement la performance quand le RandomForest est utilisé avec CountVectorizer, par rapport à TF-IDF où la performance diminue 
'''
''' 
Pour le moment, le meilleur modèle est la régression logistique avec CountVectorizer.
'''

In [None]:
def get_logistic_regression_trained(df, test_size=0.3, random_state=42):
    """Train a CountVectorizer + LogisticRegression classifier for ``main_type``.

    Args:
        df: DataFrame providing the ``tokens_str`` (features) and
            ``main_type`` (target) columns.
        test_size: Fraction of rows held out for the accuracy estimate.
        random_state: Seed used for both the split and the classifier.

    Returns:
        A ``(model, vectorizer, accuracy)`` tuple: the fitted classifier, the
        fitted vectorizer, and the accuracy measured on the held-out rows.
    """
    features, labels = df['tokens_str'], df['main_type']
    train_text, test_text, train_labels, test_labels = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    # The vocabulary is learned from the training rows only.
    vectorizer = CountVectorizer()
    train_matrix = vectorizer.fit_transform(train_text)
    test_matrix = vectorizer.transform(test_text)

    model = LogisticRegression(random_state=random_state)
    model.fit(train_matrix, train_labels)

    predictions = model.predict(test_matrix)
    return model, vectorizer, accuracy_score(test_labels, predictions)

In [7]:
from sklearn.metrics import accuracy_score

def test_prediction(nom, model, vectorizer):
    """Predict the class of a single name and print the class probabilities.

    Args:
        nom: The name to classify.
        model: A fitted classifier exposing ``predict_proba`` and ``classes_``.
        vectorizer: The fitted vectorizer that matches ``model``.

    Returns:
        A ``(pred, score)`` tuple: the most probable class and its probability.
    """
    # Same preprocessing as the training data: chunks, then one joined string.
    name_str = syllables_to_string(cut_name_to_syllabes(nom))
    probas = model.predict_proba(vectorizer.transform([name_str]))

    print(f"Probabilités pour chaque classe : {probas[0]}")

    # A single sample was passed, so the flat argmax is the class index.
    best_index = probas.argmax()
    return model.classes_[best_index], probas.max()

# Sample names to run through the main_type classifier.
tests_name = [
    "Deimos",
    "Mideimos",
    "Deimosmi"
]

# Train the logistic-regression pipeline, then print the prediction for each name.
model_lr, vectorizer_lr, accuracy_lr = get_logistic_regression_trained(df)
print("-------------------")
for name in tests_name:
    classe, score = test_prediction(name, model_lr, vectorizer_lr)
    print(f"Classe prédite : {classe}, Score : {score:.4f}")
    print("-------------------")


-------------------
Probabilités pour chaque classe : [0.26397143 0.70372833 0.03230024]
Classe prédite : personification, Score : 0.7037
-------------------
Probabilités pour chaque classe : [0.36334797 0.54744747 0.08920456]
Classe prédite : personification, Score : 0.5474
-------------------
Probabilités pour chaque classe : [0.26397143 0.70372833 0.03230024]
Classe prédite : personification, Score : 0.7037
-------------------


In [None]:
''' Les résultats sont cohérents et seront suffisants pour ce que l'on désire obtenir concernant le main_type.
Maintenant, on peut regarder le cas du sub_type.
La logique est la même que pour le main_type.
'''


" les résultats sont cohérents, et seront suffisant pour ce que l'on désire obtenir, concernant le main_type\nMaintenant, on peut regarder le cas du sub_type.\n"

<h1>Sub_types</h1>

In [45]:
def syllables_to_string(tokens):
    """Return the tokens joined into a single space-separated string.

    Args:
        tokens: An iterable of string chunks.

    Returns:
        One string with a single space between consecutive chunks.
    """
    separator = ' '
    return separator.join(tokens)

# Rebuild the text form of the syllable lists (same transformation as for main_type).
df['tokens_str'] = df['tokens'].apply(syllables_to_string)

# Same features as before; the target is now the sub_type column.
X = df['tokens_str'] 
y = df['sub_type']
# Rebinding these globals changes which labels evaluate_model reads from here on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
# Fresh instances of the four candidate classifiers, as (display name, estimator) pairs.
models = [
    ('Random Forest', RandomForestClassifier(n_estimators=500, random_state=42)),
    ('SVM', SVC(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Naive Bayes', MultinomialNB())
]
# Count encoding, fitted on the sub_type training split only.
vectorizer_count = CountVectorizer()
X_train_vec_count = vectorizer_count.fit_transform(X_train)
X_test_vec_count = vectorizer_count.transform(X_test)

# TF-IDF encoding of the same splits.
vectorizer_tfidf = TfidfVectorizer()
X_train_vec_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_vec_tfidf = vectorizer_tfidf.transform(X_test)

def evaluate_model(X_train_vec, X_test_vec, model):
    """Fit ``model`` on the training matrix and return its test accuracy.

    The labels are not parameters: they are read from the notebook-level
    ``y_train`` and ``y_test`` variables at call time.

    Args:
        X_train_vec: Vectorized training features.
        X_test_vec: Vectorized test features.
        model: An estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        The value of ``accuracy_score`` for the predictions on ``X_test_vec``.
    """
    model.fit(X_train_vec, y_train)
    predictions = model.predict(X_test_vec)
    score = accuracy_score(y_test, predictions)
    return score

# Accuracy of every candidate model on the count-encoded sub_type split.
print("CountVectorizer")
for name, model in models:
     print(f"model : {name} => " + str(evaluate_model(X_train_vec_count, X_test_vec_count, model)))

print('--------------------------------')
# Same estimator objects, refit on the TF-IDF-encoded features.
print("Tfidf")
for name, model in models:
    print(f"model : {name} => " + str(evaluate_model(X_train_vec_tfidf, X_test_vec_tfidf, model)))

CountVectorizer
model : Random Forest => 0.5522388059701493
model : SVM => 0.582089552238806
model : Logistic Regression => 0.5970149253731343
model : Naive Bayes => 0.6044776119402985
--------------------------------
Tfidf
model : Random Forest => 0.5149253731343284
model : SVM => 0.5895522388059702
model : Logistic Regression => 0.5970149253731343
model : Naive Bayes => 0.6044776119402985


In [None]:
''' 
Les résultats sont plutôt mauvais : 1 chance sur 2 que la prédiction soit juste.
On va essayer de modifier les hyperparamètres de chaque modèle pour améliorer ça. L'idéal serait d'atteindre au moins 75 % de réussite.
'''

In [None]:
# Sweep forest size and depth together: step i uses 300*i trees and max_depth 20*i.
for i in range(1,6):

    rf = RandomForestClassifier(n_estimators=300*i, max_depth=20*i, random_state=42)
    # The same estimator object is refit on each encoding in turn.
    print(f"model : Random Forest - Count - {i} => " + str(evaluate_model(X_train_vec_count, X_test_vec_count, rf)))
    print(f"model : Random Forest - Tfidf - {i} => " + str(evaluate_model(X_train_vec_tfidf, X_test_vec_tfidf, rf)))
    
# This is clearly better for TF-IDF (about +5%), but accuracy remains fairly low.

model : Random Forest - Count - 1 => 0.5746268656716418
model : Random Forest - Tfidf - 1 =>0.5671641791044776
model : Random Forest - Count - 2 => 0.5522388059701493
model : Random Forest - Tfidf - 2 =>0.5447761194029851
model : Random Forest - Count - 3 => 0.5447761194029851
model : Random Forest - Tfidf - 3 =>0.5149253731343284
model : Random Forest - Count - 4 => 0.5447761194029851
model : Random Forest - Tfidf - 4 =>0.5074626865671642
model : Random Forest - Count - 5 => 0.5447761194029851
model : Random Forest - Tfidf - 5 =>0.5223880597014925


In [None]:
from sklearn.preprocessing import StandardScaler

# Linear SVM with the regularisation parameter C swept from 1 to 5.
for i in range(1,6):
    svm = SVC(C=i, kernel='linear', random_state=42 * i)
    print(f"model : SVM - Count - {i} => " + str(evaluate_model(X_train_vec_count, X_test_vec_count, svm)))
    print(f"model : SVM - Tfidf - {i} => " + str(evaluate_model(X_train_vec_tfidf, X_test_vec_tfidf, svm)))

# This is worse than before.

# We can also try scaling the data.

# with_mean=False scales without centering, which is what sparse matrices require.
scaler = StandardScaler(with_mean=False)  
X_train_normalized_tfidf = scaler.fit_transform(X_train_vec_tfidf)  
X_test_normalized_tfidf = scaler.transform(X_test_vec_tfidf)
 
# The same scaler object is refit here; the TF-IDF outputs above are already computed.
X_train_normalized_count = scaler.fit_transform(X_train_vec_count)  
X_test_normalized_count = scaler.transform(X_test_vec_count)

print("----------------------- Normalized : ")
for i in range(1,6):
    svm = SVC(C=i, kernel='linear', random_state=42 * i)
    print(f"model : SVM - Count - {i} => " + str(evaluate_model(X_train_normalized_count, X_test_normalized_count, svm)))
    print(f"model : SVM - Tfidf - {i} => " + str(evaluate_model(X_train_normalized_tfidf, X_test_normalized_tfidf, svm)))

# This is even worse.

model : SVM - Count - 1 => 0.5746268656716418
model : SVM - Tfidf - 1 => 0.5746268656716418
model : SVM - Count - 2 => 0.5671641791044776
model : SVM - Tfidf - 2 => 0.5597014925373134
model : SVM - Count - 3 => 0.5522388059701493
model : SVM - Tfidf - 3 => 0.5522388059701493
model : SVM - Count - 4 => 0.5223880597014925
model : SVM - Tfidf - 4 => 0.5298507462686567
model : SVM - Count - 5 => 0.5
model : SVM - Tfidf - 5 => 0.5298507462686567
----------------------- Normalized : 
model : SVM - Count - 1 => 0.35074626865671643
model : SVM - Tfidf - 1 => 0.39552238805970147
model : SVM - Count - 2 => 0.35074626865671643
model : SVM - Tfidf - 2 => 0.39552238805970147
model : SVM - Count - 3 => 0.35074626865671643
model : SVM - Tfidf - 3 => 0.39552238805970147
model : SVM - Count - 4 => 0.35074626865671643
model : SVM - Tfidf - 4 => 0.39552238805970147
model : SVM - Count - 5 => 0.35074626865671643
model : SVM - Tfidf - 5 => 0.39552238805970147


In [31]:
# Logistic regression (liblinear solver) with max_iter swept from 25 to 125.
for i in range(1,6):
    lg = LogisticRegression(max_iter=25*i, solver='liblinear', random_state=42)
    print(f"model : Lg - Count - {i} => " + str(evaluate_model(X_train_vec_count, X_test_vec_count, lg)))
    print(f"model : Lg - Tfidf - {i} => " + str(evaluate_model(X_train_vec_tfidf, X_test_vec_tfidf, lg)))
# The results are identical across steps, so this model cannot be improved through these general hyperparameters.

# Same sweep on the scaled matrices built in the SVM cell.
print("----------------------- Normalized : ")
for i in range(1,6):
    lg = LogisticRegression(max_iter=25*i, solver='liblinear', random_state=42)
    print(f"model : Lg - Count - {i} => " + str(evaluate_model(X_train_normalized_count, X_test_normalized_count, lg)))
    print(f"model : Lg - Tfidf - {i} => " + str(evaluate_model(X_train_normalized_tfidf, X_test_normalized_tfidf, lg)))

model : Lg - Count - 1 => 0.6194029850746269
model : Lg - Tfidf - 1 => 0.5970149253731343
model : Lg - Count - 2 => 0.6194029850746269
model : Lg - Tfidf - 2 => 0.5970149253731343
model : Lg - Count - 3 => 0.6194029850746269
model : Lg - Tfidf - 3 => 0.5970149253731343
model : Lg - Count - 4 => 0.6194029850746269
model : Lg - Tfidf - 4 => 0.5970149253731343
model : Lg - Count - 5 => 0.6194029850746269
model : Lg - Tfidf - 5 => 0.5970149253731343
----------------------- Normalized : 
model : Lg - Count - 1 => 0.5074626865671642
model : Lg - Tfidf - 1 => 0.4925373134328358
model : Lg - Count - 2 => 0.5074626865671642
model : Lg - Tfidf - 2 => 0.4925373134328358
model : Lg - Count - 3 => 0.5074626865671642
model : Lg - Tfidf - 3 => 0.4925373134328358
model : Lg - Count - 4 => 0.5074626865671642
model : Lg - Tfidf - 4 => 0.4925373134328358
model : Lg - Count - 5 => 0.5074626865671642
model : Lg - Tfidf - 5 => 0.4925373134328358


In [33]:
# Multinomial Naive Bayes with the smoothing parameter alpha swept from 0.1 to 1.0.
# (A stray `MultinomialNB(alpha=0.7)` expression that built an estimator and
# discarded it was removed; it had no effect on the sweep.)
for i in range(1,11):
    nb = MultinomialNB(alpha=0.1 * i)
    print(f"model : Nb - Count - {i} => " + str(evaluate_model(X_train_vec_count, X_test_vec_count, nb)))
    print(f"model : Nb - Tfidf - {i} => " + str(evaluate_model(X_train_vec_tfidf, X_test_vec_tfidf, nb)))
# The impact of alpha is not significant.

# Same sweep on the scaled matrices built in the SVM cell.
print("----------------------- Normalized : ")
for i in range(1,11):
    nb = MultinomialNB(alpha=0.1 * i)
    print(f"model : Nb - Count - {i} => " + str(evaluate_model(X_train_normalized_count, X_test_normalized_count, nb)))
    print(f"model : Nb - Tfidf - {i} => " + str(evaluate_model(X_train_normalized_tfidf, X_test_normalized_tfidf, nb)))

# With the scaled data, performance is worse.


model : Nb - Count - 1 => 0.582089552238806
model : Nb - Tfidf - 1 => 0.5970149253731343
model : Nb - Count - 2 => 0.6044776119402985
model : Nb - Tfidf - 2 => 0.5970149253731343
model : Nb - Count - 3 => 0.5970149253731343
model : Nb - Tfidf - 3 => 0.5970149253731343
model : Nb - Count - 4 => 0.6044776119402985
model : Nb - Tfidf - 4 => 0.6119402985074627
model : Nb - Count - 5 => 0.6044776119402985
model : Nb - Tfidf - 5 => 0.6119402985074627
model : Nb - Count - 6 => 0.5970149253731343
model : Nb - Tfidf - 6 => 0.6044776119402985
model : Nb - Count - 7 => 0.5895522388059702
model : Nb - Tfidf - 7 => 0.6044776119402985
model : Nb - Count - 8 => 0.5895522388059702
model : Nb - Tfidf - 8 => 0.6119402985074627
model : Nb - Count - 9 => 0.5895522388059702
model : Nb - Tfidf - 9 => 0.6194029850746269
model : Nb - Count - 10 => 0.6044776119402985
model : Nb - Tfidf - 10 => 0.6044776119402985
----------------------- Normalized : 
model : Nb - Count - 1 => 0.4253731343283582
model : Nb - Tfi

In [None]:
''' 
Puisque rien n'est mieux, nous resterons sur le CountVectorizer -  Naive Bayes, sans normalisation, avec son 60% de performance.
'''

In [None]:
def get_naive_bayes_trained(df, test_size=0.3, random_state=42):
    """Train a CountVectorizer + MultinomialNB classifier for ``sub_type``.

    Args:
        df: DataFrame providing the ``tokens_str`` (features) and
            ``sub_type`` (target) columns.
        test_size: Fraction of rows held out for the accuracy estimate.
        random_state: Seed used for the train/test split.

    Returns:
        A ``(model, vectorizer, accuracy)`` tuple: the fitted classifier, the
        fitted vectorizer, and the accuracy measured on the held-out rows.
    """
    features, labels = df['tokens_str'], df['sub_type']
    train_text, test_text, train_labels, test_labels = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    # The vocabulary is learned from the training rows only.
    vectorizer = CountVectorizer()
    train_matrix = vectorizer.fit_transform(train_text)
    test_matrix = vectorizer.transform(test_text)

    model = MultinomialNB()
    model.fit(train_matrix, train_labels)

    predictions = model.predict(test_matrix)
    return model, vectorizer, accuracy_score(test_labels, predictions)

In [None]:
def test_prediction_sub(nom, model, vectorizer):
    """Predict the sub_type of a single name, printing classes and probabilities.

    Args:
        nom: The name to classify.
        model: A fitted classifier exposing ``predict_proba`` and ``classes_``.
        vectorizer: The fitted vectorizer that matches ``model``.

    Returns:
        A ``(pred, score)`` tuple: the most probable class and its probability.
    """
    # Same preprocessing as the training data: chunks, then one joined string.
    name_str = syllables_to_string(cut_name_to_syllabes(nom))
    probas = model.predict_proba(vectorizer.transform([name_str]))

    # Print the class labels first so the probability vector can be read against them.
    print(model.classes_)
    print(f"Proba pour chaque classe : {probas[0]}")

    # A single sample was passed, so the flat argmax is the class index.
    best_index = probas.argmax()
    return model.classes_[best_index], probas.max()

# Sample names to run through the sub_type classifier.
tests_name = [
    "Tartarus",
    "Mideimos",
    "Tethys"
]

# Keep the Naive Bayes objects under their own names: storing them in
# model_lr / vectorizer_lr would mislabel them and overwrite the
# logistic-regression model trained for main_type.
model_nb, vectorizer_nb, accuracy_nb = get_naive_bayes_trained(df)
print("-------------------")
for name in tests_name:
    classe, score = test_prediction_sub(name, model_nb, vectorizer_nb)
    print(f"Resultat : {classe}, Score : {score:.4f}")
    print("-------------------")


-------------------
['god' 'major' 'olympian' 'other' 'personification' 'primordial']
Probabilités pour chaque classe : [0.75004988 0.01623824 0.01442017 0.02301947 0.18419941 0.01207283]
Classe prédite : god, Score : 0.7500
-------------------
['god' 'major' 'olympian' 'other' 'personification' 'primordial']
Probabilités pour chaque classe : [0.37965052 0.04217801 0.01575799 0.02317947 0.52576654 0.01346747]
Classe prédite : personification, Score : 0.5258
-------------------
['god' 'major' 'olympian' 'other' 'personification' 'primordial']
Probabilités pour chaque classe : [0.47235996 0.11013754 0.03835099 0.02839645 0.33321338 0.01754167]
Classe prédite : god, Score : 0.4724
-------------------


In [None]:
# Les résultats ne sont vraiment pas très intéressants, mais nous pourrons difficilement faire mieux, au vu du peu de données dont on dispose et de leur répartition.

<h1>Generation NLP // Pas fonctionnel car trop peu de données , donc avorté</h1>

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Tokenize the model's inputs and outputs.
# Missing descriptions become empty strings so the tokenizer only sees text.
df["description"] = df["description"].fillna("")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["description"])

# Each description becomes a list of integer word ids.
X= tokenizer.texts_to_sequences(df["description"])

# Show one description next to its encoded form as a sanity check.
print(df["description"].iloc[0])
print(X[0])

King of the gods, ruler of Mount Olympus, and god of the sky, weather, thunder, lightning, law, order, and justice. He is the youngest son of Cronus and Rhea. He overthrew Cronus and gained the sovereignty of heaven for himself. In art he is depicted as a regal, mature man with a sturdy figure and dark beard. His usual attributes are the royal scepter and the lightning bolt. His sacred animals include the eagle and the bull. His Roman counterpart is Jupiter, also known as Jove.

[50, 1, 2, 25, 427, 1, 85, 214, 3, 4, 1, 2, 104, 428, 215, 131, 132, 216, 3, 86, 19, 8, 2, 429, 32, 1, 51, 3, 68, 19, 430, 51, 3, 431, 2, 432, 1, 217, 26, 218, 9, 52, 19, 8, 33, 12, 5, 219, 133, 87, 21, 5, 220, 433, 3, 221, 222, 10, 434, 88, 69, 2, 435, 436, 3, 2, 131, 437, 10, 34, 29, 22, 2, 438, 3, 2, 439, 10, 27, 36, 8, 440, 59, 105, 12, 441]


In [13]:
# Second tokenizer for the target side, fitted on the per-name syllable lists.
tokenizer_tokens = Tokenizer(char_level=False)  
tokenizer_tokens.fit_on_texts(df["tokens"]) 

# Each name becomes a list of integer ids, one per syllable chunk.
y = tokenizer_tokens.texts_to_sequences(df["tokens"])


# Show one tokenized name next to its encoded form as a sanity check.
print(df["tokens"].iloc[0])
print(y[0]) 


[' zeu', 's']
[165, 1]


In [14]:

# Use one common length for inputs and targets: the longest sequence on either side.
# NOTE(review): targets are only a few ids long, so padding them to the description
# length leaves them mostly zeros — this presumably inflates token-level accuracy.
maxlen = max(max(len(seq) for seq in X), max(len(seq) for seq in y))
# padding="post" appends the padding after the real ids.
y_normalized = pad_sequences(y, maxlen=maxlen, padding="post")
X_normalized = pad_sequences(X, maxlen=maxlen, padding="post")
print(X_normalized.shape) 
print(y_normalized.shape) 


(444, 188)
(444, 188)


In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
# Build the LSTM model.
# +1 because vocabulary size is taken as the number of known words plus one extra index.
vocab_size_desription = len(tokenizer.word_index) + 1 
vocab_size_tokens = len(tokenizer_tokens.word_index) + 1

embedding_dim = 300 # size of each word's embedding vector; larger values give more room to tell words apart

# return_sequences=True keeps one output per time step, so the final softmax
# predicts one syllable id for every input position.
model = Sequential([
    Embedding(input_dim=vocab_size_desription, output_dim=embedding_dim, input_length=maxlen),  
    LSTM(128, return_sequences=True), 
    Dense(64, activation="relu"),
    Dense(vocab_size_tokens, activation="softmax") 
])


# Sparse categorical cross-entropy: the targets are integer ids, not one-hot vectors.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 188, 300)          448200    
                                                                 
 lstm (LSTM)                 (None, 188, 128)          219648    
                                                                 
 dense (Dense)               (None, 188, 64)           8256      
                                                                 
 dense_1 (Dense)             (None, 188, 405)          26325     
                                                                 
Total params: 702,429
Trainable params: 702,429
Non-trainable params: 0
_________________________________________________________________


In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded sequences; this rebinds the X_train/X_test/y_train/y_test globals.
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y_normalized, test_size=0.2, random_state=42)

# The held-out split also serves as validation data during training.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f0efcbbdd0>

In [None]:

# Loss and accuracy on the held-out split.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")
# The results look excellent: 98% accuracy.
# NOTE(review): this accuracy is computed per position on heavily padded targets,
# so it is presumably dominated by correctly predicted padding — confirm before relying on it.


Test Loss: 0.11137121170759201
Test Accuracy: 0.9823093414306641


In [4]:
from tensorflow.keras.callbacks import Callback
import numpy as np

class PredictionLogger(Callback):
    """Callback that prints a few decoded validation predictions after each epoch.

    Args:
        X_val: Validation inputs passed to ``model.predict``.
        y_val: Expected id sequences, aligned with ``X_val``.
        tokenizer_tokens: Tokenizer whose ``index_word`` maps ids back to text.
    """

    def __init__(self, X_val, y_val, tokenizer_tokens):
        super().__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.tokenizer_tokens = tokenizer_tokens

    def _decode(self, indices):
        """Concatenate the text of every id in ``indices``, skipping id 0."""
        vocabulary = self.tokenizer_tokens.index_word
        return ''.join([vocabulary[idx] for idx in indices if idx != 0])

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print up to five decoded samples."""
        y_pred = self.model.predict(self.X_val)

        # Most probable id at every position of every sequence.
        predicted_indices = np.argmax(y_pred, axis=-1)

        print(f"\nEpoch {epoch + 1}:")

        sample_count = min(5, len(self.X_val))
        for row in range(sample_count):
            pred_word = self._decode(predicted_indices[row])
            true_word = self._decode(self.y_val[row])

            print(f"resultat : {pred_word} - attendue : {true_word}")


In [17]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# Second, self-contained run of the description -> name experiment, this time
# printing sample predictions after every epoch.
df["description"] = df["description"].fillna("")

# Input side: word-level ids for the descriptions.
tokenizer_desc = Tokenizer()
tokenizer_desc.fit_on_texts(df["description"])
X = tokenizer_desc.texts_to_sequences(df["description"])

# Target side: ids for the syllable lists.
# NOTE(review): the inputs are lists of chunks rather than strings — confirm how
# Tokenizer applies char_level=True to list inputs.
tokenizer_tokens = Tokenizer(char_level=True)
tokenizer_tokens.fit_on_texts(df["tokens"])
y = tokenizer_tokens.texts_to_sequences(df["tokens"])

# Pad both sides to the longest sequence found on either side.
maxlen = max(max(len(seq) for seq in X), max(len(seq) for seq in y))
X_normalized = pad_sequences(X, maxlen=maxlen, padding="post")
y_normalized = pad_sequences(y, maxlen=maxlen, padding="post")

# 80/20 split; the held-out part is named X_val / y_val in this cell.
X_train, X_val, y_train, y_val = train_test_split(X_normalized, y_normalized, test_size=0.2, random_state=42)

vocab_size_desc = len(tokenizer_desc.word_index) + 1
vocab_size_tokens = len(tokenizer_tokens.word_index) + 1 

# Much smaller embedding than in the first attempt (20 instead of 300).
embedding_dim = 20

# Same architecture as before: one prediction per input position.
model = Sequential([
    Embedding(input_dim=vocab_size_desc, output_dim=embedding_dim, input_length=maxlen),
    LSTM(128, return_sequences=True),
    Dense(64, activation="relu"),
    Dense(vocab_size_tokens, activation="softmax") 
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Prints decoded validation samples at the end of each epoch.
prediction_logger = PredictionLogger(X_val, y_val, tokenizer_tokens)

# NOTE(review): batch_size=1000 is larger than the 444 rows shown earlier, so each
# epoch is presumably a single batch — confirm this is intended.
model.fit(X_train, y_train, epochs=100, batch_size=1000, validation_data=(X_val, y_val), callbacks=[prediction_logger])


Epoch 1/100

Epoch 1:
resultat : rgiargiargiargiargiargiargiargiacocococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococo - attendue : philophrosyne
resultat : caiucococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococo - attendue : dike
resultat : cocococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococococo

<keras.callbacks.History at 0x1f0f28aa790>

In [20]:

# Evaluate on the held-out split created by the cell that trained this model
# (X_val / y_val). X_test / y_test are not defined by that cell; they would only
# exist as leftovers from an earlier experiment's kernel state.
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")
''' Théoriquement, les resultats sont bons , mais concrètement, c'est catastrophique. Les prédictions sont jugées correctes quand elles sont vides, alors que ce n'est pas le cas.
Puisque toutes les descriptions comportent trop de mots unique, le modèle n'arrive pas à généraliser. Il manque trop de données pour faire quoi que ce soit.
'''


Test Loss: 0.11486048996448517
Test Accuracy: 0.9823093414306641


" Théoriquement, les resultats sont bons , mais concrètement, c'est catastrophique. Les prédictions sont jugées correctes quand elles sont vides, alors que ce n'est pas le cas.\nPuisque toutes les descriptions comportent trop de mots unique, le modèle n'arrive pas à généraliser. Il manque trop de données pour faire quoi que ce soit.\n"

In [None]:

def predict_name(description):
    """Generate a name for a free-text description with the trained model.

    Relies on the notebook-level ``tokenizer_desc``, ``tokenizer_tokens``,
    ``maxlen`` and ``model`` objects.

    Args:
        description: The description text to encode and feed to the model.

    Returns:
        The concatenation of the decoded tokens for every predicted id other
        than 0; an empty string when only 0 is predicted.
    """
    encoded = tokenizer_desc.texts_to_sequences([description])
    padded = pad_sequences(encoded, maxlen=maxlen, padding="post")

    probabilities = model.predict(padded, verbose=0)

    # Most probable id at each position of the single input sequence.
    predicted_indices = np.argmax(probabilities, axis=-1)[0]
    print(predicted_indices)

    return ''.join(
        [tokenizer_tokens.index_word[i] for i in predicted_indices if i != 0]
    )

# Try the generator on a short hand-written description.
description = "god of the sky, thunder, and justice"
predicted_name = predict_name(description)
print(f"IA prédiction : {predicted_name}")
# The result counts as good by the model's metric, but not for the user.

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]
IA prédiction : 
