In [2]:
import re

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

In [23]:
class DataExtraction(BaseEstimator, TransformerMixin):
    """Stateless transformer deriving hand-crafted features from a ``text`` column.

    Parameters
    ----------
    features_names : iterable of str
        Names of the features to compute, in output-column order. Supported
        names are:

        * ``"taille_phrase"`` -- number of characters in the text;
        * ``"Nombre_mot"`` -- number of whitespace-separated tokens;
        * ``"email"`` -- ``True`` when the text contains a token starting
          with ``http`` or ``www`` at a word boundary (despite its name, the
          pattern matches URL-like tokens, not e-mail addresses).
    """

    def __init__(self, features_names):
        self.features_names = features_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a new DataFrame holding only the requested feature columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a ``"text"`` column of strings.

        Returns
        -------
        pandas.DataFrame
            One column per entry of ``features_names``; ``"text"`` is dropped.

        Raises
        ------
        KeyError
            If ``features_names`` contains an unsupported feature name.
        """
        features = {
            "taille_phrase": len,
            "Nombre_mot": lambda x: len(x.split()),
            "email": lambda x: re.search(r'\b(http|www)\S+', x) is not None,
        }
        # Work on a copy: assigning the new columns straight onto X would
        # modify the caller's DataFrame as a side effect of transform().
        retour = X.copy()
        for feature in self.features_names:
            retour[feature] = X["text"].apply(features[feature])
        return retour.drop(columns="text")


def GenerateModel(model, data, vectorizer=None, features_names=None):
    """Build and fit a text-classification pipeline.

    The pipeline feeds the ``"text"`` column through two parallel branches of
    a ``ColumnTransformer``: hand-crafted features (``DataExtraction`` followed
    by mean imputation and standard scaling) and a text vectorizer. The
    combined output is passed to ``model``.

    Parameters
    ----------
    model : estimator
        Final estimator of the pipeline; it is fitted by this call.
    data : pandas.DataFrame
        Training frame containing a ``"text"`` column and the ``"spam"``
        target column. The frame itself is not modified.
    vectorizer : transformer, optional
        Vectorizer applied to the ``"text"`` column. Defaults to a fresh
        ``CountVectorizer()`` created on each call.
    features_names : list of str, optional
        Feature names forwarded to ``DataExtraction``. Defaults to
        ``["taille_phrase", "Nombre_mot", "email"]``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline fitted on ``data``.
    """
    # Defaults are resolved per call so that no vectorizer instance or list
    # object is shared between invocations (mutable-default-argument pitfall).
    if vectorizer is None:
        vectorizer = CountVectorizer()
    if features_names is None:
        features_names = ["taille_phrase", "Nombre_mot", "email"]

    target = data["spam"]
    predictors = data.drop(columns=["spam"])

    featurePipe = Pipeline(steps=[
        ("extraction feature", DataExtraction(features_names)),
        ("inputing", SimpleImputer(strategy="mean")),
        ("scaling", StandardScaler()),
    ])
    preparation = ColumnTransformer(transformers=[
        # A list selector hands a one-column DataFrame to the feature branch,
        # a bare string hands the raw column to the vectorizer.
        ("features", featurePipe, ["text"]),
        ("vectorisation", vectorizer, "text"),
    ])

    modelPipe = Pipeline(steps=[("prep données", preparation), ("model", model)])
    modelPipe.fit(predictors, target)
    return modelPipe




In [24]:
# Load the tab-separated, header-less corpus: first column is the label,
# second column is the message text.
df = pd.read_csv(
    "BD1.txt",
    sep="\t",
    header=None,
    names=["spam", "text"],
)

# X1 deliberately keeps the "spam" column: GenerateModel reads the target
# from the frame it is given and drops the column itself.
X1 = df
y1 = df["spam"]

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)


In [25]:
# Fit the full pipeline with a linear SVM on the training split.
model = GenerateModel(model=LinearSVC(), data=X1_train)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y1_test, y_pred=model.predict(X1_test)))

# Sanity check on two hand-written messages.
print(
    model.predict(
        pd.DataFrame(
            {"text": ['You won 200 billion dollars, call now!', 'Hi, how are you?']}
        )
    )
)
model

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       0.97      0.93      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

['ham' 'ham']




In [27]:
# Same pipeline with a random forest, restricted to the sentence-length
# feature. The forest is seeded so that Restart & Run All reproduces the
# same scores (an unseeded forest gives different results on every run).
model = GenerateModel(
    RandomForestClassifier(random_state=42),
    X1_train,
    features_names=["taille_phrase"],
)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y1_test, model.predict(X1_test)))

# Sanity check on two hand-written messages.
print(model.predict(pd.DataFrame(['You won 200 billion dollars, call now!', 'Hi, how are you?'], columns=["text"])))
model

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.87      0.93       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

['ham' 'ham']
