In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import re
import pandas as pd
from sklearn.svm import LinearSVC

In [2]:
from math import e


class DataExtraction(BaseEstimator, TransformerMixin):
    def __init__(self, features_list):
        self.features_list = features_list
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        retour = X
        for feature_name, feature_function in self.features_list.items():
            retour[feature_name] = X["text"].apply(feature_function)
        retour = retour.drop(columns = "text")
        return retour

def get_dictionnaire(string_list):
    features = {
        "taille_phrase":lambda x: len(x),
        "Nombre_mot":lambda x: len(x.split()),
        "email":lambda x: len(re.findall(r'\b(http|www)\S+', x)) > 0,
        "presence_monnaie": lambda x: 1 if re.search(r'[\$\€\£]', x) else 0 ,
        "presence telephone": lambda x: 1 if re.search(r'\b\d{10,}\b', x) else 0,
        "presence_caratere_speciaux": lambda x: 1 if re.search(r'[!@#$%^&*(),.?":{}|<>]', x) else 0,
        "proportion_majuscule": lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0,
        "presence_lien": lambda x: 1 if re.search(r'\b(http|www)\S+', x) else 0
    }
    if string_list == "all":
        return features
    else:
        dict = {}
        try:
            for key in string_list:
                try:
                    dict[key] = features[key]
                except:
                    print(f"La fonction {key} n'existe pas dans le dictionnaire")
        except:
            print(f"La fonction demande une liste de string")
        return dict
            

def GenerateModel(features_names, model, data,vectorizer=CountVectorizer()) :
    target = data["spam"]
    data = data.drop(columns=["spam"])
    featurePipe = Pipeline(steps=[("extraction feature",DataExtraction(features_names)),("inputing",SimpleImputer(strategy="mean")),("scaling",StandardScaler())])
    preparation = ColumnTransformer(transformers=
                              [("features",featurePipe,["text"]),
                               ("vectorisation",vectorizer,"text")]
                              )

    modelPipe = Pipeline(steps=[("prep données",preparation),("model",model)])
    modelPipe.fit(data,target)
    return modelPipe




In [3]:
df = pd.read_csv("BD1.txt",sep="\t",header=None,names=["spam","text"])
y1 = df["spam"]
X1 = df
X1_train, X1_test, y1_train, y1_test = train_test_split(X1,y1 , test_size=0.2, random_state=42)


In [4]:
model = GenerateModel(model=LinearSVC(),data=X1_train, features_names=get_dictionnaire(["taille_phrase", "Nombre_mot", "email", "presence_monnaie", "presence telephone", "presence_caratere_speciaux", "proportion_majuscule", "presence_lien"]))
print(classification_report(y1_test,model.predict(X1_test)))
print(model.predict(pd.DataFrame(['You won 200 billion dollars, call now!', 'Hi, how are you?'], columns=["text"])))
model



              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       0.99      0.93      0.96       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115

['ham' 'ham']


In [5]:
from sklearn.ensemble import RandomForestClassifier


model = GenerateModel(model=RandomForestClassifier(),data=X1_train, features_names=get_dictionnaire(["taille_phrase"]))
print(classification_report(y1_test,model.predict(X1_test)))
print(model.predict(pd.DataFrame(['You won 200 billion dollars, call now!', 'Hi, how are you?'], columns=["text"])))
model

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

['ham' 'ham']


In [9]:
  from itertools import combinations

  features = {
      "taille_phrase": lambda x: len(x),
      "Nombre_mot": lambda x: len(x.split()),
      "email": lambda x: len(re.findall(r'\b(http|www)\S+', x)) > 0,
      "presence_monnaie": lambda x: 1 if re.search(r'[\$\€\£]', x) else 0,
      "presence telephone": lambda x: 1 if re.search(r'\b\d{10,}\b', x) else 0,
      "presence_caratere_speciaux": lambda x: 1 if re.search(r'[!@#$%^&*(),.?\":{}|<>]', x) else 0,
      "proportion_majuscule": lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0,
      "presence_lien": lambda x: 1 if re.search(r'\b(http|www)\S+', x) else 0
  }

  # Générer toutes les combinaisons possibles
  all_combinations = []
  for r in range(1, len(features) + 1):
      combinations_r = list(combinations(features.keys(), r))
      all_combinations.extend(combinations_r)

  # Créer un dictionnaire de toutes les combinaisons
  combinations_dict = {}
  for i, combo in enumerate(all_combinations, 1):
      combinations_dict[f"combination_{i}"] = {k: features[k] for k in combo}

  print(f"Nombre total de combinaisons: {len(all_combinations)}")
  print(combinations_dict)


Nombre total de combinaisons: 255
{'combination_1': {'taille_phrase': <function <lambda> at 0x0000029CE2F55080>}, 'combination_2': {'Nombre_mot': <function <lambda> at 0x0000029CE2F57880>}, 'combination_3': {'email': <function <lambda> at 0x0000029CE2F57A60>}, 'combination_4': {'presence_monnaie': <function <lambda> at 0x0000029CE2F56480>}, 'combination_5': {'presence telephone': <function <lambda> at 0x0000029CE2F56840>}, 'combination_6': {'presence_caratere_speciaux': <function <lambda> at 0x0000029CE2F563E0>}, 'combination_7': {'proportion_majuscule': <function <lambda> at 0x0000029CE2C854E0>}, 'combination_8': {'presence_lien': <function <lambda> at 0x0000029CE2C86CA0>}, 'combination_9': {'taille_phrase': <function <lambda> at 0x0000029CE2F55080>, 'Nombre_mot': <function <lambda> at 0x0000029CE2F57880>}, 'combination_10': {'taille_phrase': <function <lambda> at 0x0000029CE2F55080>, 'email': <function <lambda> at 0x0000029CE2F57A60>}, 'combination_11': {'taille_phrase': <function <l