In [ ]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import ast
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [ ]:
# IPython shell escape: copy merged_data.csv from the Drive folder into /content/.
# NOTE(review): later cells open 'merged_data.csv' by bare name, so /content/ is
# presumably the working directory — confirm.
!cp /content/drive/MyDrive/AI/Data/merged_data.csv /content/

In [None]:
# Read the merged dataset from a path relative to the current working directory.
csv_file_path = 'merged_data.csv'
df_merged = pd.read_csv(csv_file_path)

# Order-preserving 80/20 split: the first 80% of the rows become the training
# set and the remaining rows become the test set.
# NOTE(review): rows are not shuffled before splitting; if the CSV is grouped
# by language the two subsets may have very different class balance — confirm.
split_index = int(0.8 * len(df_merged))
train_df, test_df = df_merged.iloc[:split_index], df_merged.iloc[split_index:]

print(f"Training set size: {len(train_df)}, Testing set size: {len(test_df)}")

In [ ]:
def convertir_en_liste(chaine):
    """Parse the textual form of a Python list into an actual list.

    Args:
        chaine: A string holding a Python literal, e.g. "['a', '[PAD]']".

    Returns:
        The parsed list (a tuple literal is converted to a list). An empty
        list is returned, after printing a diagnostic, when the input cannot
        be parsed or when it evaluates to something other than a list/tuple.
    """
    try:
        valeur = ast.literal_eval(chaine)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        # These are the failures ast.literal_eval is documented to raise on
        # malformed input; anything else is a real bug and should surface.
        print(f"Erreur de conversion : {e}")
        return []

    if isinstance(valeur, (list, tuple)):
        return list(valeur)

    # A valid literal that is not a sequence (e.g. "5") would otherwise be
    # handed to callers that iterate over the result and crash there.
    print(f"Erreur de conversion : type inattendu {type(valeur).__name__}")
    return []


def reduire_pad(liste):
    """Collapse every run of "[PAD]" tokens into a single "[PAD]".

    A run of "[PAD]" at the very start of the sequence is dropped entirely,
    because a pad is only kept when it follows a non-pad token.

    Args:
        liste: Iterable of tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    resultat = []
    # Start as if the previous token were a pad so that leading pads are skipped.
    precedent = "[PAD]"
    for jeton in liste:
        if jeton == "[PAD]" and precedent == "[PAD]":
            continue
        resultat.append(jeton)
        precedent = jeton
    return resultat


# Work on an independent copy, since train_df was obtained by slicing df_merged.
train_df = train_df.copy()

# Parse each serialized phoneme list and collapse its "[PAD]" runs,
# writing the result back into the 'phoneme' column through .loc.
train_df.loc[:, 'phoneme'] = train_df['phoneme'].apply(
    lambda brut: reduire_pad(convertir_en_liste(brut))
)

# Show the first rows of train_df to check the transformation.
print(train_df.head())

In [ ]:
# Work on an independent copy, since test_df was obtained by slicing df_merged.
test_df = test_df.copy()

# Parse each serialized phoneme list and collapse its "[PAD]" runs,
# writing the result back into the 'phoneme' column through .loc.
test_df.loc[:, 'phoneme'] = test_df['phoneme'].apply(
    lambda brut: reduire_pad(convertir_en_liste(brut))
)

# Show the first rows of test_df to check the transformation.
print(test_df.head())

In [ ]:
# Binary target: 0 for rows whose language is exactly 'French', 1 for anything else.
# NOTE: this mapping is not idempotent — re-running the cell after the column
# already holds 0/1 turns every label into 1 (no value equals 'French' anymore).
train_df['language'] = train_df['language'].apply(lambda x: 0 if x == 'French' else 1)
test_df['language'] = test_df['language'].apply(lambda x: 0 if x == 'French' else 1)

# Build the phoneme vocabulary from the training set only, then reuse it for
# the test set. Each entry of 'phoneme' is a list, which the Tokenizer treats
# as an already-tokenized sequence.
# lower=False: the Tokenizer lower-cases tokens by default, which would merge
# phoneme symbols that differ only by case (as in SAMPA-style notations).
phoneme_tokenizer = Tokenizer(char_level=False, lower=False)
phoneme_tokenizer.fit_on_texts(train_df['phoneme'])
X_train = phoneme_tokenizer.texts_to_sequences(train_df['phoneme'])
X_train = pad_sequences(X_train, padding='post')

# Pad the test sequences to the training width so both arrays share a shape.
# NOTE(review): longer test sequences are truncated with the pad_sequences
# default (truncating='pre', i.e. from the start) — confirm this is intended.
X_test = phoneme_tokenizer.texts_to_sequences(test_df['phoneme'])
X_test = pad_sequences(X_test, padding='post', maxlen=X_train.shape[1])

# Encode the language labels as consecutive integer class ids.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['language'])
y_test = label_encoder.transform(test_df['language'])

In [ ]:
# Preview the first rows of test_df (DataFrame.head() with its default row count).
print(test_df.head())

In [ ]:
# Preview the first rows of train_df (DataFrame.head() with its default row count).
print(train_df.head())

In [ ]:
# Hyper-parameters derived from the fitted tokenizer and the padded training data.
vocab_size = len(phoneme_tokenizer.word_index) + 1  # one more than the number of known tokens
embed_dim = 1024  # size of each embedding vector
max_length = X_train.shape[1]  # padded sequence length
output_dim = len(label_encoder.classes_)  # number of distinct labels

# Stacked-LSTM classifier: embedding -> LSTM (full sequence) -> LSTM (last state) -> softmax.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_length))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(output_dim, activation='softmax'))

# Integer class ids are used as targets, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the layer-by-layer summary.
model.summary()

In [ ]:
# Imports required by this cell: neither EarlyStopping nor plt is brought into
# scope by the notebook's import cell, so the cell would raise NameError.
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# Stop training once validation accuracy has not improved for 5 epochs and
# restore the weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, verbose=1, mode='max', restore_best_weights=True)

# Train the model.
# NOTE(review): the test set doubles as the validation set that drives early
# stopping, so a later evaluation on X_test is not an independent estimate.
history = model.fit(X_train, y_train, epochs=150, validation_data=(X_test, y_test), batch_size=32,
                    callbacks=[early_stopping])

# Epoch axis for the plots: one point per epoch actually run (1-based).
epochs_range = range(1, len(history.history['accuracy']) + 1)

# One figure with two side-by-side panels.
plt.figure(figsize=(12, 4))

# Left panel: training and validation loss.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, history.history['loss'], label='Training Loss')
plt.plot(epochs_range, history.history['val_loss'], label='Validation Loss')
plt.title('Loss')
plt.legend()

# Right panel: training and validation accuracy.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, history.history['accuracy'], label='Training Accuracy')
plt.plot(epochs_range, history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy')
plt.legend()

# Render the figure.
plt.tight_layout()
plt.show()

In [ ]:
# Compute the compiled loss and the accuracy metric on the held-out test arrays.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f'Test Accuracy: {accuracy}')