In [43]:
import pandas as pd
import plotly.express as px
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# Import des données


In [44]:
df = pd.read_csv(
    "../SMSSpamCollection.txt", delimiter="\t", header=None, names=["spam", "text"]
)
df

Unnamed: 0,spam,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


On cherche à modéliser la probabilité qu'un message soit un spam à partir de son contenu. Il nous faut donc un modèle de classification binaire. On peux donc utiliser un modèle linaire SVC, ou un naive bayes.

# Prétraitement des données

Je vais d'abord binariser la variable cible, sous forme d'un entier (0 ou 1).

In [45]:
df["spam"] = df["spam"].map({'spam': 1, 'ham': 0}).astype(int)


In [46]:
df

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


# Approche par constructions de feature descriptives des messages

## Création des features

On vérifie la présence d'un numéro de téléphone.

In [47]:
df['has_phone_number'] = df['text'].apply(lambda x: 1 if re.search(r'\b\d{10,}\b', x) else 0)
df

Unnamed: 0,spam,text,has_phone_number
0,0,"Go until jurong point, crazy.. Available only ...",0
1,0,Ok lar... Joking wif u oni...,0
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,0,U dun say so early hor... U c already then say...,0
4,0,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,1
5568,0,Will ü b going to esplanade fr home?,0
5569,0,"Pity, * was in mood for that. So...any other s...",0
5570,0,The guy did some bitching but I acted like i'd...,0


On vérifie la présence de monnaies.

In [48]:
df['has_currency_symbol'] = df['text'].apply(lambda x: 1 if re.search(r'[\$\€\£]', x) else 0)
df

Unnamed: 0,spam,text,has_phone_number,has_currency_symbol
0,0,"Go until jurong point, crazy.. Available only ...",0,0
1,0,Ok lar... Joking wif u oni...,0,0
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,0,0
3,0,U dun say so early hor... U c already then say...,0,0
4,0,"Nah I don't think he goes to usf, he lives aro...",0,0
...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,1,1
5568,0,Will ü b going to esplanade fr home?,0,0
5569,0,"Pity, * was in mood for that. So...any other s...",0,0
5570,0,The guy did some bitching but I acted like i'd...,0,0


In [49]:
df[df["spam"] == 1]["text"].iloc[0]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

On calcule la longueur des messages.

In [50]:
df['message_length'] = df['text'].apply(len)
df

Unnamed: 0,spam,text,has_phone_number,has_currency_symbol,message_length
0,0,"Go until jurong point, crazy.. Available only ...",0,0,111
1,0,Ok lar... Joking wif u oni...,0,0,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,0,0,155
3,0,U dun say so early hor... U c already then say...,0,0,49
4,0,"Nah I don't think he goes to usf, he lives aro...",0,0,61
...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,1,1,160
5568,0,Will ü b going to esplanade fr home?,0,0,36
5569,0,"Pity, * was in mood for that. So...any other s...",0,0,57
5570,0,The guy did some bitching but I acted like i'd...,0,0,125


## Entrainement et évaluation du modèle

In [51]:

X = df[['message_length', 'has_phone_number', 'has_currency_symbol']]
y = df['spam']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.94      1.00      0.97       966
           1       0.97      0.61      0.75       149

    accuracy                           0.95      1115
   macro avg       0.96      0.80      0.86      1115
weighted avg       0.95      0.95      0.94      1115



# Approche par Vectorisation de text (NLP)

Il faut donc convertir le texte en une représentation numérique. J'utilise pour cela TfidfVectorizer.  C'est une technique de pondération des mots utilisée dans la récupération d'informations et le traitement du langage naturel (NLP). Elle évalue l'importance d'un mot dans un document par rapport à une collection de documents (corpus).
- TF (Term Frequency) : Nombre de fois qu'un mot apparaît dans un document, divisé par le nombre total de mots dans ce document.
- IDF (Inverse Document Frequency) : Mesure l'importance d'un mot en tenant compte de sa fréquence dans tout le corpus. Moins un mot est fréquent dans les autres documents, plus sa valeur IDF est élevée.

In [52]:
vectorizer = CountVectorizer(stop_words='english')
test = ["hi hi juliette juliette juliette","hi juliette","koi"]
testVect = vectorizer.fit_transform(test)
pd.DataFrame.sparse.from_spmatrix(testVect)

Unnamed: 0,0,1,2
0,2,3,0
1,1,1,0
2,0,0,1


In [53]:
vectorizer = TfidfVectorizer(stop_words='english')
test = ["hi hi juliette juliette juliette","hi juliette","koi"]
testVect = vectorizer.fit_transform(test)
pd.DataFrame.sparse.from_spmatrix(testVect)

Unnamed: 0,0,1,2
0,0.5547,0.83205,0.0
1,0.707107,0.707107,0.0
2,0.0,0.0,1.0


In [54]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])
y = df['spam']


In [55]:
X_df = pd.DataFrame.sparse.from_spmatrix(X)

In [56]:
X_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8434,8435,8436,8437,8438,8439,8440,8441,8442,8443
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Utilisation du modèle naive bayes

On sépare les données d'entrainement et de tests.

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)


On entraine le modèle.

In [58]:
model = BernoulliNB()
model.fit(X_train, y_train)


On évalue le modèle sur les données de test.

In [59]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9669777458722182
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      2416
           1       0.99      0.76      0.86       370

    accuracy                           0.97      2786
   macro avg       0.98      0.88      0.92      2786
weighted avg       0.97      0.97      0.97      2786



Avec la méthode de cross validation : 

In [60]:
print(cross_val_score(model, X, y, cv=3))

[0.97308934 0.97092084 0.97415186]


Le modèle est assez performant mais 25% des vrais spams n'ont pas été détectés (faux négatifs), donc non suffisant.

In [61]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

documents = df["text"]
labels = df["spam"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

model = BernoulliNB()
model.fit(X, labels)
terms = vectorizer.get_feature_names_out()
log_probabilities = model.feature_log_prob_
term_weights = pd.DataFrame(
    log_probabilities.T,
    index=terms,
    columns=['spam', 'non spam']
)

term_weights_sorted = term_weights.sort_values(by='non spam', ascending=False)
term_weights_sorted.head(30)


Unnamed: 0,spam,non spam
to,-1.375374,-0.468136
call,-3.074809,-0.822681
you,-1.274121,-1.125678
your,-2.593102,-1.189393
now,-2.805227,-1.366466
for,-2.381661,-1.425782
or,-3.079303,-1.436955
free,-4.387636,-1.477075
the,-1.714637,-1.494775
txt,-5.842923,-1.588301


In [62]:
term_weights_sorted.reset_index()[term_weights_sorted.reset_index()["index"] == "earn"]

Unnamed: 0,index,spam,non spam
2940,earn,-7.383368,-6.618739


## Utilisation du modèle linéraire SVC

On sépare les données d'entrainement et de tests.

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


On entraine le modèle.

In [64]:
model = LinearSVC()
model.fit(X_train, y_train)


On évalue le modèle sur les données de test.

In [65]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9838565022421525
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.89      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



Avec la méthode de cross validation :

In [66]:
print(cross_val_score(model, X, y, cv=3))

[0.98116254 0.98061389 0.98007539]


On obtient une précision plutot élevée, avec une accuracy de 98%, même si certains spams sont toujours manqués.

In [71]:
new_messages = ['You won 200 billion dollars, call now rate', 'Hi','Someone has contacted our dating service and entered your phone because they fancy you! To find out who it is call from a landline 091110596124', 'Click here to confirm delivery www', 'join us today ! Flexible work without constainst. Earn between 50 and 3600 euros per day payement sent daily. To know more add : https:/wam.me/nawak5546431645']
X_new = vectorizer.transform(new_messages)
predictions = model.predict(X_new, )
predictions


array([1, 0, 1, 1, 0])

In [68]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

documents = df["text"]
labels = df["spam"] 
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)
model_ponderation = LinearSVC()
model_ponderation.fit(X, labels)
terms = vectorizer.get_feature_names_out()
coefficients = model_ponderation.coef_.flatten()
term_weights = pd.DataFrame({
    'term': terms,
    'weight': coefficients
})
term_weights_sorted = term_weights.sort_values(by='weight', ascending=False)
term_weights_sorted.head(30)


Unnamed: 0,term,weight
7986,txt,2.448137
8016,uk,2.023596
352,150p,1.831675
2067,claim,1.756522
8596,www,1.695623
616,50,1.65335
7644,text,1.625483
6794,service,1.619545
6525,ringtone,1.61721
5119,mobile,1.610383


In [69]:
term_weights_sorted[term_weights_sorted["term"] == "rate"]

Unnamed: 0,term,weight
6284,rate,0.926188


In [70]:
term_weights_sorted[term_weights_sorted["term"] == "earn"]

Unnamed: 0,term,weight
2837,earn,0.0
