In [None]:
# Step 1: Data preprocessing ----------------------------------------------------------

import pandas as pd
from nltk.corpus import stopwords
import nltk
import stanza
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

import pickle

In [None]:
# Load the labelled e-mail dataset from the Excel workbook.
bdd = pd.read_excel(io="Projet_spam.xlsx")
# Cast every message to text so each cell can be handed to the NLP pipeline as a string.
emails = bdd.loc[:, 'email'].astype(str)

In [None]:
# Fetch the NLTK stop-word corpus only when it is not installed yet, so the
# notebook can be re-run (including offline) without contacting the download
# server on every execution.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

# French stop words: filtered out during normalisation and passed to the TF-IDF vectorizer.
stop_words_fr = stopwords.words('french')

In [None]:
# French Stanza pipeline running the tokenize, mwt, pos and lemma processors.
nlp = stanza.Pipeline("fr", processors="tokenize,mwt,pos,lemma")


def normalize(text):
    """Lemmatise, lowercase and stop-word-filter a French text.

    Args:
        text: Raw text to run through the Stanza pipeline.

    Returns:
        The retained lowercase lemmas, in document order, joined by single
        spaces.
    """
    kept_lemmas = []
    for sentence in nlp(text).sentences:
        for word in sentence.words:
            # Stanza can leave `lemma` unset (None) for some words; fall back
            # to the surface form instead of crashing on `None.lower()`.
            lemma = (word.lemma or word.text).lower()
            if lemma not in stop_words_fr:
                kept_lemmas.append(lemma)
    return " ".join(kept_lemmas)

In [None]:
# Encode the labels as integers: 'Spam' -> 1, 'Non spam' -> 0
# (any other label value is mapped to NaN by `Series.map`).
y = bdd['type'].map({'Spam': 1, 'Non spam': 0})

# Normalise every e-mail, then attach the encoded label as a 'type' column.
X = emails.apply(normalize).to_frame().assign(type=y)

# Persist the preprocessed dataset so the modelling step can reload it from disk.
X.to_excel('BDD_normalisé.xlsx', index=False, engine='openpyxl')

In [None]:
# Step 2: Model building --------------------------------------------------------

# Reload the preprocessed dataset (normalised e-mail text and encoded label) from disk.
bdd_norm = pd.read_excel(io="BDD_normalisé.xlsx")

In [None]:
# Hand-picked terms used as the fixed TF-IDF vocabulary (one feature column per term).
# NOTE(review): the name `vars` shadows the Python builtin; it is kept so that any
# existing reference to it keeps working.
# NOTE(review): 'mise à jour' is a multi-word entry and cannot be matched by unigram
# tokenisation, and 'succés' looks like a misspelling of 'succès' — confirm whether
# these entries should be replaced.
vars = [
    'bit', 'contacter', 'fichier', 'ly', 'équipe', 'bonus', 'récompense', 'abonnement',
    'gratuit', 'urgent', 'lien', 'virus', 'information', 'ici', 'cliquer', 'télécharger',
    'cadeau', 'profiter', 'ouvrir', 'joindre', 'récupérer', 'sécurité', 'protéger',
    'maintenant', 'prix', 'bancaire', 'suspendre', 'dernier', 'recevoir', 'exclusive',
    'paiement', 'voici', 'seulement', 'limité', 'succés', 'compte', 'mise à jour',
    'exceptionnel', 'offre', 'instantané', 'félicitation', 'gagner', 'immédiat', 'rapide',
    '!', 'appareil', 'éviter', 'spécial', 'argent', '€', '$', 'livraison', 'paypal',
    'virement', 'verrouiller', 'sécuriser', 'vite', 'choc', 'vérifier', 'attention',
    'obtenir', 'uniquement', 'smartphone', 'rater', 'tentative',
]

vectorizer = TfidfVectorizer(
    stop_words=stop_words_fr,
    vocabulary=vars,
    # The default token pattern only keeps tokens of two or more word characters,
    # so the single-symbol vocabulary entries '!', '€' and '$' could never be
    # counted. Extend the pattern so those symbols are emitted as tokens too.
    token_pattern=r"(?u)\b\w\w+\b|[!€$]",
)

# After the Excel round-trip an empty normalised e-mail is read back as NaN and a
# purely numeric one as a number; TfidfVectorizer only accepts strings, so coerce
# the column to text (missing -> empty document) before fitting.
vect = vectorizer.fit_transform(bdd_norm["email"].fillna("").astype(str))
a = vect.toarray()
print(vars)

In [None]:
# Hold out 20 % of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(vect, bdd_norm['type'], test_size = 0.2, random_state = 0)

# Support-vector classifier with a polynomial kernel.
svm = SVC(kernel='poly')
svm.fit(X_train, y_train)
y_svm = svm.predict(X_test)

# Multinomial naive Bayes classifier trained on the same TF-IDF features.
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_nb = nb.predict(X_test)

# Step 3: Model evaluation and export ------------------------------------------------------------


def _spam_precision(cm):
    """Return the precision of the spam class from a confusion matrix.

    Args:
        cm: 2x2 confusion matrix built with ``labels=[1, 0]``, so that
            ``cm[0][0]`` counts true positives and ``cm[1][0]`` counts
            false positives.

    Returns:
        ``TP / (TP + FP)``, or ``0.0`` when the model predicted no spam at
        all (avoids a 0/0 division that would yield NaN and a warning).
    """
    true_positives = cm[0][0]
    false_positives = cm[1][0]
    predicted_spam = true_positives + false_positives
    return true_positives / predicted_spam if predicted_spam else 0.0


# Accuracy: share of test e-mails classified correctly. It is labelled
# "exactitude" so it cannot be confused with the precision printed further down.
acc_svm = accuracy_score(y_test, y_svm)
acc_nb = accuracy_score(y_test, y_nb)
print(f'exactitude du modèle Naives Bayes : {round(acc_nb * 100, 2)} %')
print(f'exactitude du modèle SVM : {round(acc_svm * 100, 2)} %')

# Confusion matrices with the spam class (1) listed first.
cm_svm = confusion_matrix(y_test, y_svm, labels=[1, 0])
cm_nb = confusion_matrix(y_test, y_nb, labels=[1, 0])
print(f'Matrice de confusion du modèle Naive Bayes :\n {cm_nb}')
print(f'Matrice de confusion du modèle SVM :\n {cm_svm}')

# Precision: among the e-mails flagged as spam, the share that really is spam.
prc_svm = _spam_precision(cm_svm)
prc_nb = _spam_precision(cm_nb)

print(f'precision du modele SVM : {round(prc_svm*100, 2)} %')
print(f'precision du modele Naives Bayes : {round(prc_nb*100, 2)} %')



In [None]:
def vectorize(text):
    """Turn a raw text into its TF-IDF representation.

    The text is first passed through `normalize`, then transformed by the
    already-fitted module-level `vectorizer`.

    Args:
        text: Raw text to vectorize.

    Returns:
        The result of `vectorizer.transform` applied to a one-element list
        holding the normalised text.
    """
    return vectorizer.transform([normalize(text)])


In [None]:
# Serialise the trained naive Bayes model, the fitted vectorizer and the trained
# SVM model to disk, one pickle file per object.
for filename, artefact in (
    ('naivebayes.pkl', nb),
    ("vectorizer.pkl", vectorizer),
    ('svm.pkl', svm),
):
    with open(filename, 'wb') as file:
        pickle.dump(artefact, file)