In [2]:
import pandas as pd
import plotly.express as px
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

# Import des données


In [3]:
df = pd.read_csv(
    "SMSSpamCollection.txt", delimiter="\t", header=None, names=["spam", "text"]
)
df

Unnamed: 0,spam,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


On cherche à modéliser la probabilité qu'un message soit un spam à partir de son contenu. Il nous faut donc un modèle de classification binaire. On peux donc utiliser un modèle linaire SVC, ou un naive bayes.

# Prétraitement des données

Je vais d'abord binariser la variable cible, sous forme d'un entier (0 ou 1).

In [4]:
df["spam"] = df["spam"].map({'spam': 1, 'ham': 0}).astype(int)


In [5]:
df

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


# Approche par constructions de feature descriptives des messages

## Création des features

On vérifie la présence d'un numéro de téléphone.

In [6]:
df['has_phone_number'] = df['text'].apply(lambda x: 1 if re.search(r'\b\d{10,}\b', x) else 0)
df

Unnamed: 0,spam,text,has_phone_number
0,0,"Go until jurong point, crazy.. Available only ...",0
1,0,Ok lar... Joking wif u oni...,0
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,0,U dun say so early hor... U c already then say...,0
4,0,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,1
5568,0,Will ü b going to esplanade fr home?,0
5569,0,"Pity, * was in mood for that. So...any other s...",0
5570,0,The guy did some bitching but I acted like i'd...,0


On vérifie la présence de monnaies.

In [7]:
df['has_currency_symbol'] = df['text'].apply(lambda x: 1 if re.search(r'[\$\€\£]', x) else 0)
df

Unnamed: 0,spam,text,has_phone_number,has_currency_symbol
0,0,"Go until jurong point, crazy.. Available only ...",0,0
1,0,Ok lar... Joking wif u oni...,0,0
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,0,0
3,0,U dun say so early hor... U c already then say...,0,0
4,0,"Nah I don't think he goes to usf, he lives aro...",0,0
...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,1,1
5568,0,Will ü b going to esplanade fr home?,0,0
5569,0,"Pity, * was in mood for that. So...any other s...",0,0
5570,0,The guy did some bitching but I acted like i'd...,0,0


In [8]:
df[df["spam"] == 1]["text"].iloc[0]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

On calcule la longueur des messages.

In [9]:
df['message_length'] = df['text'].apply(len)
df

Unnamed: 0,spam,text,has_phone_number,has_currency_symbol,message_length
0,0,"Go until jurong point, crazy.. Available only ...",0,0,111
1,0,Ok lar... Joking wif u oni...,0,0,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,0,0,155
3,0,U dun say so early hor... U c already then say...,0,0,49
4,0,"Nah I don't think he goes to usf, he lives aro...",0,0,61
...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,1,1,160
5568,0,Will ü b going to esplanade fr home?,0,0,36
5569,0,"Pity, * was in mood for that. So...any other s...",0,0,57
5570,0,The guy did some bitching but I acted like i'd...,0,0,125


## Entrainement et évaluation du modèle

In [10]:

X = df[['message_length', 'has_phone_number', 'has_currency_symbol']]
y = df['spam']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.94      1.00      0.97       966
           1       0.97      0.61      0.75       149

    accuracy                           0.95      1115
   macro avg       0.96      0.80      0.86      1115
weighted avg       0.95      0.95      0.94      1115



# Approche par Vectorisation de text (NLP)

Il faut donc convertir le texte en une représentation numérique. J'utilise pour cela TfidfVectorizer.  C'est une technique de pondération des mots utilisée dans la récupération d'informations et le traitement du langage naturel (NLP). Elle évalue l'importance d'un mot dans un document par rapport à une collection de documents (corpus).
- TF (Term Frequency) : Nombre de fois qu'un mot apparaît dans un document, divisé par le nombre total de mots dans ce document.
- IDF (Inverse Document Frequency) : Mesure l'importance d'un mot en tenant compte de sa fréquence dans tout le corpus. Moins un mot est fréquent dans les autres documents, plus sa valeur IDF est élevée.

In [11]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])
y = df['spam']


In [12]:
df['has_phone_number'] = df['text'].apply(lambda x: 1 if re.search(r'\b\d{10,}\b', x) else 0)
df

Unnamed: 0,spam,text,has_phone_number,has_currency_symbol,message_length
0,0,"Go until jurong point, crazy.. Available only ...",0,0,111
1,0,Ok lar... Joking wif u oni...,0,0,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,0,0,155
3,0,U dun say so early hor... U c already then say...,0,0,49
4,0,"Nah I don't think he goes to usf, he lives aro...",0,0,61
...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,1,1,160
5568,0,Will ü b going to esplanade fr home?,0,0,36
5569,0,"Pity, * was in mood for that. So...any other s...",0,0,57
5570,0,The guy did some bitching but I acted like i'd...,0,0,125


In [31]:
pd.concat([pd.DataFrame(X.toarray()), df["message_length"]], axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8704,8705,8706,8707,8708,8709,8710,8711,8712,message_length
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,111
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,29
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,155
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,49
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,160
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,36
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,57
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,125


## Utilisation du modèle naive bayes

On sépare les données d'entrainement et de tests.

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


On entraine le modèle.

In [33]:
model = MultinomialNB()
model.fit(X_train, y_train)


On évalue le modèle sur les données de test.

In [34]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9856502242152466
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.94      0.95      0.95       149

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



Avec la méthode de cross validation : 

In [35]:
print(cross_val_score(model, X, y, cv=3))

[0.98385361 0.97469036 0.9811524 ]


Le modèle est assez performant mais 25% des vrais spams n'ont pas été détectés (faux négatifs), donc non suffisant.

## Utilisation du modèle linéraire SVC

On sépare les données d'entrainement et de tests.

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


On entraine le modèle.

In [37]:
model = LinearSVC()
model.fit(X_train, y_train)


On évalue le modèle sur les données de test.

In [38]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9901345291479821
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       1.00      0.93      0.96       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115



Avec la méthode de cross validation :

In [39]:
print(cross_val_score(model, X, y, cv=3))

[0.98385361 0.98384491 0.98276791]


On obtient une précision plutot élevée, avec une accuracy de 98%, même si certains spams sont toujours manqués.

In [40]:
new_messages = ['You won 200 billion dollars, call now!', 'Hi, how are you?']
X_new = vectorizer.transform(new_messages)
predictions = model.predict(X_new)
print(predictions)


[1 0]


In [41]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

documents = df["text"]
labels = df["spam"] 
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
model = LinearSVC()
model.fit(X, labels)
terms = vectorizer.get_feature_names_out()
coefficients = model.coef_.flatten()
term_weights = pd.DataFrame({
    'term': terms,
    'weight': coefficients
})
term_weights_sorted = term_weights.sort_values(by='weight', ascending=False)
term_weights_sorted.head(30)


Unnamed: 0,term,weight
346,146tf150p,1.438561
6526,ringtoneking,0.863136
776,84484,0.863136
6525,ringtone,0.846111
25,07090201529,0.701953
8016,uk,0.670269
802,88066,0.652444
7327,stories,0.616624
3219,filthy,0.616624
7986,txt,0.583219


In [42]:
import plotly.express as px
px.violin(term_weights_sorted[(term_weights_sorted["weight"] > -0.5) & (term_weights_sorted["weight"] < 0.5)], y="weight")

In [43]:
term_weights_sorted[term_weights_sorted["term"] == "rate"]

Unnamed: 0,term,weight
6284,rate,0.338979
