In [2]:
import pandas as pd
from nltk.corpus import stopwords
import nltk
import stanza
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

import pickle

In [3]:
# Load the raw, labelled corpus from the workbook shipped with the notebook.
bdd = pd.read_excel("Projet_spam.xlsx")

# Work on the message column as plain text.
# NOTE(review): astype(str) also stringifies missing cells (presumably as the
# literal "nan") -- confirm the workbook has no empty emails.
emails = bdd.loc[:, 'email'].astype(str)

In [4]:
# Fetch the NLTK stop-word corpus (nltk reports when it is already present),
# then keep the French list for the normalisation and vectorisation steps.
nltk.download("stopwords")
stop_words_fr = stopwords.words(fileids='french')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# French Stanza pipeline shared by every call to normalize():
# tokenisation, multi-word-token expansion, POS tagging and lemmatisation.
nlp = stanza.Pipeline(
    "fr",
    processors="tokenize,mwt,pos,lemma",
)
def normalize(text):
    """Lemmatise a French text and drop stop words.

    The text is run through the module-level Stanza pipeline ``nlp``. Every
    word is replaced by its lower-cased lemma, and lemmas found in
    ``stop_words_fr`` are discarded.

    Args:
        text: Raw text to normalise.

    Returns:
        str: The kept lemmas joined by single spaces (empty string when
        nothing is kept).
    """
    # Set membership is O(1). The set is rebuilt on each call so that a
    # refreshed ``stop_words_fr`` list is always honoured.
    stop_words = set(stop_words_fr)

    kept_lemmas = []
    for sentence in nlp(text).sentences:
        for word in sentence.words:
            # A word's lemma can be missing (None); fall back to the surface
            # form instead of crashing on ``None.lower()``.
            lemma = (word.lemma or word.text).lower()
            if lemma not in stop_words:
                kept_lemmas.append(lemma)
    return " ".join(kept_lemmas)

2025-03-24 16:23:11 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 7.71MB/s]                    
2025-03-24 16:23:11 INFO: Downloaded file to C:\Users\pc\stanza_resources\resources.json
2025-03-24 16:23:12 INFO: Loading these models for language: fr (French):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2025-03-24 16:23:12 INFO: Using device: cpu
2025-03-24 16:23:12 INFO: Loading: tokenize
2025-03-24 16:23:14 INFO: Loading: mwt
2025-03-24 16:23:14 INFO: Loading: pos
2025-03-24 16:23:16 INFO: Loading: lemma
2025-03-24 16:23:17 INFO: Done loading processors!


In [None]:
# Lemmatise and filter every email. Each row goes through the full Stanza
# pipeline, so this is the slow step of the notebook.
normalized_emails = emails.apply(normalize)

# Encode the textual label as 1 (spam) / 0 (not spam).
y = bdd['type'].map({'Spam': 1, 'Non spam': 0})

# One frame holding the normalised text next to its encoded label.
X = normalized_emails.to_frame().assign(type=y)

# Persist the result so later cells can reload it from disk.
X.to_excel('BDD_normalisé.xlsx', index=False, engine='openpyxl')

In [15]:
# Reload the normalised corpus written by the export cell.
bdd_norm = pd.read_excel("BDD_normalisé.xlsx")

# Excel does not round-trip text faithfully: an email whose normalised form is
# empty is read back as NaN, and a purely numeric one as a number. Either
# value makes TfidfVectorizer fail, so force the column back to plain text.
bdd_norm["email"] = bdd_norm["email"].fillna("").astype(str)

In [16]:
# TF-IDF representation of the normalised emails, ignoring French stop words.
# NOTE(review): the vectorizer is fitted on the whole corpus, before the
# train/test split done further down, so the held-out emails contribute to the
# vocabulary and IDF weights.
vectorizer = TfidfVectorizer(stop_words=stop_words_fr)
vect = vectorizer.fit_transform(bdd_norm["email"])

# The sparse matrix exposes .shape directly; no need to densify it first.
print(vect.shape)

(201, 1349)


In [17]:
# Keep the 50 TF-IDF terms with the highest chi-squared score against the label.
# NOTE(review): labels are taken from the raw frame `bdd` (not `bdd_norm`), so
# this assumes both frames hold the same rows in the same order -- confirm.
# NOTE(review): the selector sees the labels of every row, including those
# that later land in the test split, which makes the reported accuracy
# optimistic.
selector = SelectKBest(chi2, k=50)
X_new = selector.fit_transform(vect, bdd['type'])

# Shape is available on the sparse result; avoid materialising a dense copy.
print(X_new.shape)

# Map the selected column indices back to their vocabulary terms.
features = vectorizer.get_feature_names_out()
indices = selector.get_support(indices=True)
best_words = [features[i] for i in indices]
print(best_words)

(201, 50)
['00', '10', '2025', '30', 'appareil', 'argent', 'attention', 'avril', 'bien', 'bit', 'bonjour', 'cadeau', 'cher', 'cliquer', 'concernant', 'consulter', 'contacter', 'cordialement', 'demande', 'donnée', 'détecter', 'entretien', 'fichier', 'gagner', 'https', 'hésiter', 'ici', 'immédiatement', 'joindre', 'livraison', 'ly', 'maintenant', 'mars', 'merci', 'monsieur', 'ouvrir', 'paris', 'prix', 'prévoir', 'question', 'réclamer', 'si', 'souhaiter', 'suspect', 'tenir', 'télécharger', 'université', 'urgent', 'équipe', 'éviter']


In [18]:
# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): X_new was produced by a vectorizer and a selector fitted on
# all rows, so this score is not a clean out-of-sample estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    bdd['type'],
    test_size=0.2,
    random_state=0,
)

# Multinomial naive Bayes on the selected TF-IDF features.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Share of test emails classified correctly, printed as a percentage.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'précision du modèle : {round(acc * 100, 2)} %')

précision du modèle : 97.56 %


In [19]:
def vectorize(text):
    """Turn one raw text into the feature row expected by ``model``.

    Applies, in order: ``normalize`` (lemmatisation and stop-word removal),
    the fitted ``vectorizer`` and the fitted ``selector``.

    Args:
        text: Raw text of a single message.

    Returns:
        The single-row matrix produced by ``selector.transform``.
    """
    cleaned = normalize(text)
    tfidf_row = vectorizer.transform([cleaned])
    return selector.transform(tfidf_row)
# Smoke test: classify one arbitrary string and show the predicted label.
text = "gfht th rgfththhdry "
predicted_labels = model.predict(vectorize(text))
print(predicted_labels[0])

Spam


In [13]:
# Serialise the classifier, and the (vectorizer, selector) pair needed to
# build its inputs, into two separate pickle files.
# NOTE(review): this cell's execution counter (13) is older than the training
# cell's (18); re-run it so the files reflect the current objects.
for filename, payload in (
    ('naivebayes.pkl', model),
    ("vectorizer_selector.pkl", (vectorizer, selector)),
):
    with open(filename, 'wb') as file:
        pickle.dump(payload, file)