# --- Notebook: DNA Barcoding for plants

# 📦 Étape 1 : Transformer le fichier FASTA en CSV et choisir juste les plantes , supprimer les enregistrements qui ne portent pas d'informations

In [1]:
import csv
from Bio import SeqIO

fasta_path = r"C:\fasta_test\bold.fasta"
csv_path = r"C:\fasta_test\plants_cleaned.csv"

with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)

    headers = [
        "ID", "Gene_Region", "Country",
        "Kingdom", "Phylum", "Class", "Order", "Family", "Subfamily",
        "Genus", "Species", "Other",
        "Sequence"
    ]
    writer.writerow(headers)

    for record in SeqIO.parse(fasta_path, "fasta"):
        fields = record.description.split("|")

        if len(fields) > 3 and "Plantae" in fields[3]:
            taxonomy = fields[3].split(",")
            taxonomy += [None] * (9 - len(taxonomy))  # Fill missing

            species = taxonomy[8]  # Species
            sequence = str(record.seq).strip()

            # Skip if species is None or sequence is empty
            if not sequence or species is None or species.strip().lower() == "none":
                continue

            row = [
                fields[0],
                fields[1] if len(fields) > 1 else None,
                fields[2] if len(fields) > 2 else None
            ] + taxonomy[:9] + [sequence]

            writer.writerow(row)


# ✅ Étape 2 : Chargement et nettoyage du CSV

In [2]:
import pandas as pd
from collections import Counter

# Charger le fichier CSV généré

In [3]:
df = pd.read_csv(csv_path)

# Supprimer les lignes avec séquence vide ou courte

In [4]:
df = df[df["Sequence"].notna() & (df["Sequence"].str.len() > 100)]

# Supprimer les lignes avec "Species" manquant

In [5]:
df = df[df["Species"].notna() & (df["Species"].str.lower() != "none")]

# Supprimer les lignes où Gene_Region, Class, Order, Family, Genus sont vides ou None


In [6]:
df = df[
    df["Gene_Region"].notna() & (df["Gene_Region"].str.strip().str.lower() != "none") &
    df["Class"].notna() & (df["Class"].str.strip().str.lower() != "none") &
    df["Order"].notna() & (df["Order"].str.strip().str.lower() != "none") &
    df["Family"].notna() & (df["Family"].str.strip().str.lower() != "none") &
    df["Genus"].notna() & (df["Genus"].str.strip().str.lower() != "none")
]


# Supprimer les espèces trop rares (< 10 occurrences)

In [7]:
species_counts = Counter(df["Species"])
df = df[df["Species"].isin([sp for sp, count in species_counts.items() if count >= 10])]

# Sauvegarder le fichier nettoyé final

In [8]:
cleaned_path = r"C:\fasta_test\plants_cleaned_final.csv"
df.to_csv(cleaned_path, index=False)

# Visualiser quelques lignes

In [9]:
df.head()

Unnamed: 0,ID,Gene_Region,Country,Kingdom,Phylum,Class,Order,Family,Subfamily,Genus,Species,Other,Sequence
0,APOSA012-14,rbcLa,South Africa,Plantae,Tracheophyta,Magnoliopsida,Gentianales,Apocynaceae,,Schizoglossum,Schizoglossum atropurpureum,Schizoglossum atropurpureum subsp atropurpureum,AGTGTTGGATTCAAAGCCGGTGTTAAAGAGTACAAATTGACTTATT...
1,APOSA012-14,matK,South Africa,Plantae,Tracheophyta,Magnoliopsida,Gentianales,Apocynaceae,,Schizoglossum,Schizoglossum atropurpureum,Schizoglossum atropurpureum subsp atropurpureum,GATATACTAATACCCTACCCTGTTCATCTGGAAATCTTGGTTCAAA...
3,APOSA022-14,rbcLa,South Africa,Plantae,Tracheophyta,Magnoliopsida,Gentianales,Apocynaceae,,Schizoglossum,Schizoglossum atropurpureum,Schizoglossum atropurpureum subsp tridentatum,AGTGTTGGATTCAAAGCCGGTGTTAAAGAGTACAAATTGACTTATT...
4,APOSA022-14,matK,South Africa,Plantae,Tracheophyta,Magnoliopsida,Gentianales,Apocynaceae,,Schizoglossum,Schizoglossum atropurpureum,Schizoglossum atropurpureum subsp tridentatum,GATATACTAATACCCTACCCTGTTCATCTGGAAATCTTGGTTCAAA...
11,APOSA046-14,matK,South Africa,Plantae,Tracheophyta,Magnoliopsida,Gentianales,Apocynaceae,,Schizoglossum,Schizoglossum bidens,Schizoglossum bidens subsp bidens,TCTGGAAATCTTGGTTCAAACCCTTCGCTATTGGGTAAAGGATGCC...


# Supprimer les colonnes ID , Country , Kingdom , Phylum , subfamily et other.

In [10]:
df = df.drop(columns=["ID", "Country", "Kingdom", "Phylum", "Subfamily", "Other"])


In [11]:
df.head()

Unnamed: 0,Gene_Region,Class,Order,Family,Genus,Species,Sequence
0,rbcLa,Magnoliopsida,Gentianales,Apocynaceae,Schizoglossum,Schizoglossum atropurpureum,AGTGTTGGATTCAAAGCCGGTGTTAAAGAGTACAAATTGACTTATT...
1,matK,Magnoliopsida,Gentianales,Apocynaceae,Schizoglossum,Schizoglossum atropurpureum,GATATACTAATACCCTACCCTGTTCATCTGGAAATCTTGGTTCAAA...
3,rbcLa,Magnoliopsida,Gentianales,Apocynaceae,Schizoglossum,Schizoglossum atropurpureum,AGTGTTGGATTCAAAGCCGGTGTTAAAGAGTACAAATTGACTTATT...
4,matK,Magnoliopsida,Gentianales,Apocynaceae,Schizoglossum,Schizoglossum atropurpureum,GATATACTAATACCCTACCCTGTTCATCTGGAAATCTTGGTTCAAA...
11,matK,Magnoliopsida,Gentianales,Apocynaceae,Schizoglossum,Schizoglossum bidens,TCTGGAAATCTTGGTTCAAACCCTTCGCTATTGGGTAAAGGATGCC...


# Préparation des données 

## Vectorisation des séquences (k-mers)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='char', ngram_range=(6, 6))  
X_seq = vectorizer.fit_transform(df["Sequence"])

## Encodage de Gene_region

In [13]:
from sklearn.preprocessing import OneHotEncoder

region_encoder = OneHotEncoder()
X_region = region_encoder.fit_transform(df[["Gene_Region"]])


## Fusion des deux vecteurs 

In [14]:
from scipy.sparse import hstack

X = hstack([X_seq, X_region])  # Entrée finale pour le modèle

## Encodage de la variable cible Species

In [15]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df["Species"])


## Séparation des données de test et d'entrainement 

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


# Entrainement des modèles 
## Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)



## Naive Bayes

In [18]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)


## SVM 

In [19]:
from sklearn.linear_model import SGDClassifier

svm_model = SGDClassifier(loss='hinge', max_iter=5000, tol=1e-3, random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Evaluation des modèles

## Classification Report

In [20]:
from sklearn.metrics import classification_report , accuracy_score, f1_score, precision_score, recall_score

print("--- Random Forest ---")
print(classification_report(y_test, rf_model.predict(X_test), target_names=le.classes_,zero_division=0))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("F1 macro:", f1_score(y_test, y_pred_rf, average='macro',zero_division=0))
print("Precision macro:", precision_score(y_test, y_pred_rf, average='macro',zero_division=0))
print("Recall macro:", recall_score(y_test, y_pred_rf, average='macro',zero_division=0))

print("--- Naive Bayes ---")
print(classification_report(y_test, nb_model.predict(X_test), target_names=le.classes_,zero_division=0))
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("F1 macro:", f1_score(y_test, y_pred_nb, average='macro',zero_division=0))
print("Precision macro:", precision_score(y_test, y_pred_nb, average='macro',zero_division=0))
print("Recall macro:", recall_score(y_test, y_pred_nb, average='macro',zero_division=0))



print("--- SVM (SGDClassifier) ---")
print(classification_report(y_test, y_pred_svm, target_names=le.classes_,zero_division=0))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("F1 macro:", f1_score(y_test, y_pred_svm, average='macro',zero_division=0))
print("Precision macro:", precision_score(y_test, y_pred_svm, average='macro',zero_division=0))
print("Recall macro:", recall_score(y_test, y_pred_svm, average='macro',zero_division=0))

--- Random Forest ---
                             precision    recall  f1-score   support

           Abrus pulchellus       1.00      0.25      0.40         4
             Acacia robusta       1.00      1.00      1.00         2
       Achillea millefolium       1.00      1.00      1.00         4
    Achnatherum occidentale       1.00      1.00      1.00         3
            Acmispon glaber       1.00      1.00      1.00         4
              Alnus viridis       1.00      1.00      1.00         4
           Anchusa arvensis       1.00      1.00      1.00         2
      Androsace chamaejasme       1.00      1.00      1.00         4
       Antennaria friesiana       1.00      0.67      0.80         3
           Antennaria media       0.50      1.00      0.67         3
     Antennaria monocephala       0.00      0.00      0.00         2
     Anthoxanthum monticola       1.00      1.00      1.00         7
         Arabidopsis lyrata       0.75      1.00      0.86         3
     Arctag

# Exemple de prédiction d'un espèce

## Random forest

In [21]:

def predict_species_rf(sequence: str, gene_region: str):
    # Vectorisation de la séquence
    seq_vec = vectorizer.transform([sequence])
    
    # Encodage one-hot de la région
    import pandas as pd
    region_vec = region_encoder.transform(pd.DataFrame([[gene_region]], columns=["Gene_Region"]))

    
    # Fusion
    X_input = hstack([seq_vec, region_vec])
    
    # Prédiction
    pred = rf_model.predict(X_input)
    return le.inverse_transform(pred)[0]

sequence="GGTGTTGGATTTCAAGCTGGTGTTAAAGATTATAAATTGACTTACTACACCCCAGAGTATGAAACTAAGGATACTGATATCTTGGCAGCATTCCGAGTAAGTCCTCAGCCTGGGGTTCCGCCCGAAGAAGCAGGGGCTGCAGTAGCTGCCGAATCTTCTACTGGTACATGGACAACTGTTTGGACTGATGGACTTACCAGTCTTGATCGTTACAAAGGACGATGCTATCACATCGAGCCTGTTGCTGGGGAAGACAACCAATGGATCTGTTATGTAGCTTATCCATTAGACCTATTTGAGGAGGGTTCCGTTACTAACATGTTTACTTCCATTGTGGGTAACGTATTTGGGTTCAAAGCCCTACGTGCTCTACGTTTGGAGGATCTACGAATTCCCCCTACTTATTCAAAAACTTTCCAAGGCCCGCCTCATGGTATCCAAGTTGAAAGAGATAAGTTGAACAAGTATGGTCGTCCTTTATTGGGATGTACTATTAAACCAAAATTGGGATTATCCGCAAAAAATTATGGTAGAGCGTGTTATGAGTGTCTA"
gene_region="rbcLa"
predicted = predict_species_rf(sequence, gene_region)
print("Espèce prédite (Random Forest) :", predicted)



Espèce prédite (Random Forest) : Hordeum jubatum


## SVM

In [22]:
def predict_species_svm(sequence: str, gene_region: str):
    import pandas as pd
    seq_vec = vectorizer.transform([sequence])
    region_vec = region_encoder.transform(pd.DataFrame([[gene_region]], columns=["Gene_Region"]))
    X_input = hstack([seq_vec, region_vec])
    pred = svm_model.predict(X_input)
    return le.inverse_transform(pred)[0]

predicted_svm = predict_species_svm(sequence, gene_region)
print("Espèce prédite (SVM) :", predicted_svm)

Espèce prédite (SVM) : Hordeum jubatum


# Amélioration du model

## K-Fold Cross-Validation

In [23]:
from sklearn.model_selection import KFold
import numpy as np

for k in [3, 5, 7, 10]:
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    
    acc_scores = []
    for train_idx, test_idx in kf.split(X):
        rf_model.fit(X[train_idx], y[train_idx])
        y_pred = rf_model.predict(X[test_idx])
        acc_scores.append(accuracy_score(y[test_idx], y_pred))

    print(f"{k}-Fold -> Accuracy moyenne : {np.mean(acc_scores):.4f} | Écart-type : {np.std(acc_scores):.4f}")


3-Fold -> Accuracy moyenne : 0.8260 | Écart-type : 0.0069
5-Fold -> Accuracy moyenne : 0.8446 | Écart-type : 0.0047
7-Fold -> Accuracy moyenne : 0.8522 | Écart-type : 0.0123
10-Fold -> Accuracy moyenne : 0.8594 | Écart-type : 0.0104


In [24]:

# Configuration de la validation croisée
kf = KFold(n_splits=7, shuffle=True, random_state=42)

# Initialiser les listes de scores
acc_scores = []
f1_scores = []
precision_scores = []
recall_scores = []

# Boucle sur chaque fold
for train_index, test_index in kf.split(X):
    X_train_fold, X_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]
    
    # Entraîner le modèle
    rf_model.fit(X_train_fold, y_train_fold)
    y_pred = rf_model.predict(X_test_fold)
    
    # Calculer les scores avec gestion du zero_division
    acc_scores.append(accuracy_score(y_test_fold, y_pred))
    f1_scores.append(f1_score(y_test_fold, y_pred, average='macro', zero_division=0))
    precision_scores.append(precision_score(y_test_fold, y_pred, average='macro', zero_division=0))
    recall_scores.append(recall_score(y_test_fold, y_pred, average='macro', zero_division=0))

# Affichage des résultats
print("=== Résultats Cross-Validation (7 folds) ===")
print(f"Accuracy moyenne      : {np.mean(acc_scores):.4f}")
print(f"F1 macro moyenne      : {np.mean(f1_scores):.4f}")
print(f"Precision macro moy.  : {np.mean(precision_scores):.4f}")
print(f"Recall macro moyenne  : {np.mean(recall_scores):.4f}")
print(f"Écart-type Accuracy   : {np.std(acc_scores):.4f}")


=== Résultats Cross-Validation (7 folds) ===
Accuracy moyenne      : 0.8522
F1 macro moyenne      : 0.8117
Precision macro moy.  : 0.8297
Recall macro moyenne  : 0.8295
Écart-type Accuracy   : 0.0123


## StratifiedKFold 

In [25]:
from sklearn.model_selection import StratifiedKFold

for k in [3, 5, 7, 10]:
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    acc_scores = []

    for train_idx, test_idx in skf.split(X, y):
        rf_model.fit(X[train_idx], y[train_idx])
        y_pred = rf_model.predict(X[test_idx])
        acc_scores.append(accuracy_score(y[test_idx], y_pred))

    print(f"{k}-Fold Stratifié -> Accuracy moyenne : {np.mean(acc_scores):.4f} | Écart-type : {np.std(acc_scores):.4f}")


3-Fold Stratifié -> Accuracy moyenne : 0.8449 | Écart-type : 0.0043
5-Fold Stratifié -> Accuracy moyenne : 0.8642 | Écart-type : 0.0108
7-Fold Stratifié -> Accuracy moyenne : 0.8624 | Écart-type : 0.0098
10-Fold Stratifié -> Accuracy moyenne : 0.8627 | Écart-type : 0.0078


## K-mers  avec CountVectorizer

In [26]:
from sklearn.preprocessing import LabelEncoder

# Réencodage à l'intérieur de la boucle (important si df change)
le = LabelEncoder()
y = le.fit_transform(df["Species"])

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer

k_values = [3, 4, 5, 6, 7, 8]
accuracy_by_k = []

for k in k_values:
    print(f"\n🔎 Test avec k = {k}")
    
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(k, k))
    X_seq = vectorizer.fit_transform(df["Sequence"])
    
    X = hstack([X_seq, X_region])
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)  
    y_pred = rf_model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    accuracy_by_k.append(acc)

    print(f"✅ Accuracy pour k = {k} : {acc:.4f}")

best_k = k_values[np.argmax(accuracy_by_k)]
print(f"\n🏆 Meilleur k : {best_k} avec accuracy = {max(accuracy_by_k):.4f}")



🔎 Test avec k = 3
✅ Accuracy pour k = 3 : 0.8466

🔎 Test avec k = 4
✅ Accuracy pour k = 4 : 0.8571

🔎 Test avec k = 5
✅ Accuracy pour k = 5 : 0.8647

🔎 Test avec k = 6
✅ Accuracy pour k = 6 : 0.8707

🔎 Test avec k = 7
✅ Accuracy pour k = 7 : 0.8677

🔎 Test avec k = 8
✅ Accuracy pour k = 8 : 0.8677

🏆 Meilleur k : 6 avec accuracy = 0.8707



## kmers avec TdfidfVectorizer

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

k_values = [3, 4, 5, 6, 7, 8]
accuracy_by_k = []

for k in k_values:
    print(f"\n🔎 Test avec TF-IDF et k = {k}")
    
    # 1. TF-IDF vectorisation des k-mers
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(k, k))
    X_seq = vectorizer.fit_transform(df["Sequence"])
    
    # 2. Concaténation avec la région
    X = hstack([X_seq, X_region])
    
    # 3. Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # 4. Entraînement du modèle
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)  
    
    # 5. Évaluation
    y_pred = rf_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy_by_k.append(acc)

    print(f"✅ Accuracy pour k = {k} : {acc:.4f}")

# Résultat final
best_k = k_values[accuracy_by_k.index(max(accuracy_by_k))]
print(f"\n🏆 Meilleur k avec TF-IDF : {best_k} | Accuracy = {max(accuracy_by_k):.4f}")



🔎 Test avec TF-IDF et k = 3
✅ Accuracy pour k = 3 : 0.8331

🔎 Test avec TF-IDF et k = 4
✅ Accuracy pour k = 4 : 0.8316

🔎 Test avec TF-IDF et k = 5
✅ Accuracy pour k = 5 : 0.8286

🔎 Test avec TF-IDF et k = 6
✅ Accuracy pour k = 6 : 0.8346

🔎 Test avec TF-IDF et k = 7
✅ Accuracy pour k = 7 : 0.8361

🔎 Test avec TF-IDF et k = 8
✅ Accuracy pour k = 8 : 0.8316

🏆 Meilleur k avec TF-IDF : 7 | Accuracy = 0.8361


## Optimisation des hyperparamètres

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Grille d'hyperparamètres à tester
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 20, 40],
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2']
}

# Modèle de base
rf = RandomForestClassifier(random_state=42)

# Optimisation avec validation croisée
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Résultats
print("✅ Meilleurs paramètres :", grid_search.best_params_)
print("🎯 Meilleure score de cross-validation :", grid_search.best_score_)

# Tester sur test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))


Fitting 5 folds for each of 24 candidates, totalling 120 fits
✅ Meilleurs paramètres : {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
🎯 Meilleure score de cross-validation : 0.8008233861489883
                             precision    recall  f1-score   support

           Abrus pulchellus       1.00      0.25      0.40         4
             Acacia robusta       1.00      1.00      1.00         2
       Achillea millefolium       0.80      1.00      0.89         4
    Achnatherum occidentale       1.00      1.00      1.00         3
            Acmispon glaber       1.00      1.00      1.00         4
              Alnus viridis       1.00      1.00      1.00         4
           Anchusa arvensis       1.00      1.00      1.00         2
      Androsace chamaejasme       1.00      1.00      1.00         4
       Antennaria friesiana       0.00      0.00      0.00         3
           Antennaria media       0.43      1.00      0.60         3
     

## XGBoost

In [31]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

print("🎯 Résultats XGBoost")
print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))


🎯 Résultats XGBoost
                             precision    recall  f1-score   support

           Abrus pulchellus       1.00      0.25      0.40         4
             Acacia robusta       1.00      1.00      1.00         2
       Achillea millefolium       1.00      1.00      1.00         4
    Achnatherum occidentale       1.00      1.00      1.00         3
            Acmispon glaber       1.00      1.00      1.00         4
              Alnus viridis       1.00      0.75      0.86         4
           Anchusa arvensis       1.00      1.00      1.00         2
      Androsace chamaejasme       1.00      1.00      1.00         4
       Antennaria friesiana       0.33      0.33      0.33         3
           Antennaria media       0.40      0.67      0.50         3
     Antennaria monocephala       0.00      0.00      0.00         2
     Anthoxanthum monticola       1.00      1.00      1.00         7
         Arabidopsis lyrata       1.00      1.00      1.00         3
     Arctagro

# Model de deep learning
## Multilayer Perceptron

In [32]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# y = target
le = LabelEncoder()
y_encoded = le.fit_transform(df["Species"])
y_categorical = to_categorical(y_encoded)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y_categorical, test_size=0.2, random_state=42)


In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(y_train.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               28302848  
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 149)               38293     
                                                                 
Total params: 28,472,469
Trainable params: 28,472,469
Non-trainable params: 0
_________________________________________________________________


In [34]:
history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.1)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [35]:
loss, acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {acc:.4f}")

Test accuracy: 0.8165


## Preparation des données pour CNN et RNN 

In [36]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encoder la séquence comme une suite d'entiers (A, C, G, T)
tokenizer = Tokenizer(char_level=True)  # caractère par caractère
tokenizer.fit_on_texts(df["Sequence"])
X_seq_int = tokenizer.texts_to_sequences(df["Sequence"])

# Padding pour uniformiser la longueur
max_len = max(len(seq) for seq in X_seq_int)
X_seq_padded = pad_sequences(X_seq_int, maxlen=max_len, padding='post')

# Encodage de la target (Species)
le = LabelEncoder()
y = le.fit_transform(df["Species"])
num_classes = len(le.classes_)

# One-hot encoding de la cible
from tensorflow.keras.utils import to_categorical
y_cat = to_categorical(y, num_classes=num_classes)

# Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_seq_padded, y_cat, test_size=0.2, random_state=42)


## CNN

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

vocab_size = len(tokenizer.word_index) + 1

cnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    Conv1D(128, kernel_size=7, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.summary()
history_cnn = cnn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)
loss_cnn, acc_cnn = cnn_model.evaluate(X_test, y_test, verbose=0)
print(f"✅ CNN Accuracy: {acc_cnn:.4f}")

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 1626, 128)         1920      
                                                                 
 conv1d_2 (Conv1D)           (None, 1620, 128)         114816    
                                                                 
 global_max_pooling1d_2 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_9 (Dense)             (None, 256)               33024     
                                                                 
 dropout_5 (Dropout)         (None, 256)               0         
                                                                 
 dense_10 (Dense)            (None, 149)               38293     
                                                      

## RNN

In [41]:
from tensorflow.keras.layers import LSTM

rnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    LSTM(128),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

rnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
rnn_model.summary()
history_rnn = rnn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)
loss_rnn, acc_rnn = rnn_model.evaluate(X_test, y_test, verbose=0)
print(f"✅ RNN (LSTM) Accuracy: {acc_rnn:.4f}")



Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 1626, 128)         1920      
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense_11 (Dense)            (None, 256)               33024     
                                                                 
 dropout_6 (Dropout)         (None, 256)               0         
                                                                 
 dense_12 (Dense)            (None, 149)               38293     
                                                                 
Total params: 204,821
Trainable params: 204,821
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoc