In [5]:
# Etape 1 : Prétraitement des données ----------------------------------------------------------

import pandas as pd
from nltk.corpus import stopwords
import nltk
import stanza
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import os
import pickle

In [6]:
bdd = pd.read_excel("Projet_spam.xlsx")
emails = bdd['email'].astype(str)

In [7]:
nltk.download("stopwords")
stop_words_fr = stopwords.words('french')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
stanza.download("fr")
nlp = stanza.Pipeline("fr", processors="tokenize,mwt,pos,lemma")
def normalize(text):
    doc = nlp(text)
    lemmatized_filtered_text = [word.lemma.lower() for sentence in doc.sentences for word in sentence.words if word.lemma.lower() not in stop_words_fr]
    return " ".join(lemmatized_filtered_text)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 432kB [00:00, 9.91MB/s]                    
2025-06-14 08:30:28 INFO: Downloaded file to C:\Users\pc\stanza_resources\resources.json
2025-06-14 08:30:28 INFO: Downloading default packages for language: fr (French) ...
2025-06-14 08:30:30 INFO: File exists: C:\Users\pc\stanza_resources\fr\default.zip
2025-06-14 08:30:32 INFO: Finished downloading models and saved to C:\Users\pc\stanza_resources
2025-06-14 08:30:32 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 432kB [00:00, 7.75MB/s]                    
2025-06-14 08:30:32 INFO: Downloaded file to C:\Users\pc\stanza_resources\resources.json
2025-06-14 08:30:33 INFO: Loading these models for

In [9]:
if not os.path.exists("BDD_normalisé.xlsx"):
    X = emails.apply(normalize)

    y = bdd['type'].map({'Spam': 1, 'Non spam': 0})
    X = X.to_frame()
    X['type'] = y

    X.to_excel('BDD_normalisé.xlsx', index = False, engine = 'openpyxl')

In [10]:
# Etape 2 : Création de modele --------------------------------------------------------

bdd_norm = pd.read_excel("BDD_normalisé.xlsx")

In [11]:
vars = ['bit', 'contacter', 'fichier', 'ly', 'équipe', 'bonus', 'récompense', 'abonnement', 'gratuit', 'urgent', 'lien', 'virus', 'information', 'ici', 'cliquer', 'télécharger', 
             'cadeau', 'profiter', 'ouvrir', 'joindre', 'récupérer', 'sécurité', 'protéger', 'maintenant', 'prix', 'bancaire', 'suspendre', 
             'dernier', 'recevoir', 'exclusive', 'paiement', 'voici', 'seulement', 'limité', 'succés', 'compte', 'mise à jour', 'exceptionnel',
             'offre', 'instantané', 'félicitation', 'gagner', 'immédiat', 'rapide','!', 'appareil', 'éviter', 'spécial', 'argent', '€', '$', 'livraison',
             'paypal', 'virement', 'verrouiller', 'sécuriser', 'vite', 'choc', 'vérifier', 'attention', 'obtenir', 'uniquement', 'smartphone', 'rater', 'tentative']

vectorizer = TfidfVectorizer(stop_words=stop_words_fr, vocabulary=vars)
vect = vectorizer.fit_transform(bdd_norm["email"])
a = vect.toarray()
print(vars)

['bit', 'contacter', 'fichier', 'ly', 'équipe', 'bonus', 'récompense', 'abonnement', 'gratuit', 'urgent', 'lien', 'virus', 'information', 'ici', 'cliquer', 'télécharger', 'cadeau', 'profiter', 'ouvrir', 'joindre', 'récupérer', 'sécurité', 'protéger', 'maintenant', 'prix', 'bancaire', 'suspendre', 'dernier', 'recevoir', 'exclusive', 'paiement', 'voici', 'seulement', 'limité', 'succés', 'compte', 'mise à jour', 'exceptionnel', 'offre', 'instantané', 'félicitation', 'gagner', 'immédiat', 'rapide', '!', 'appareil', 'éviter', 'spécial', 'argent', '€', '$', 'livraison', 'paypal', 'virement', 'verrouiller', 'sécuriser', 'vite', 'choc', 'vérifier', 'attention', 'obtenir', 'uniquement', 'smartphone', 'rater', 'tentative']


In [12]:
X_train, X_test, y_train, y_test = train_test_split(vect, bdd_norm['type'], test_size = 0.2, random_state=42)

svm = SVC(kernel='poly')
svm.fit(X_train, y_train)
y_svm = svm.predict(X_test)

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_nb = nb.predict(X_test)

acc_svm = accuracy_score(y_test, y_svm)
acc_nb = accuracy_score(y_test, y_nb)

print(f'précision du modèle Naives Bayes : {round(acc_nb * 100, 2)} %')
print(f'précision du modèle SVM : {round(acc_svm * 100, 2)} %')

cm_svm = confusion_matrix(y_test, y_svm, labels=[0, 1])
cm_nb = confusion_matrix(y_test, y_nb, labels=[0, 1])

print(f'Matrice de confusion du modèle Naive Bayes :\n {cm_nb}')
print(f'Matrice de confusion du modèle SVM :\n {cm_svm}')



précision du modèle Naives Bayes : 75.61 %
précision du modèle SVM : 82.93 %
Matrice de confusion du modèle Naive Bayes :
 [[18  5]
 [ 5 13]]
Matrice de confusion du modèle SVM :
 [[20  3]
 [ 4 14]]


In [13]:
def vectorize(text):
    text = normalize(text)
    vect_text = vectorizer.transform([text])
    return vect_text


In [14]:
with open('naivebayes.pkl', 'wb') as file:
    pickle.dump(nb, file)

with open("vectorizer.pkl", "wb") as file:
    pickle.dump((vectorizer), file)
    
with open('svm.pkl', 'wb') as file:
    pickle.dump(svm, file)