In [104]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import re
import pandas as pd
from sklearn.svm import LinearSVC
from itertools import combinations

In [105]:
from math import e

from sklearn.preprocessing import MinMaxScaler


class DataExtraction(BaseEstimator, TransformerMixin):
    """Turn a ``text`` column into hand-crafted numeric features.

    Parameters
    ----------
    features_list : dict
        Mapping ``feature name -> callable``. Despite the parameter name this
        is a dict (``.items()`` is called on it); each callable receives one
        value of the ``text`` column and returns the feature value.
    """

    def __init__(self, features_list):
        self.features_list = features_list

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame with one column per feature and ``text`` removed.

        ``X`` must be a DataFrame with a ``text`` column. The input is left
        untouched: the features are written to a copy, so the caller's frame
        does not silently gain extra columns.
        """
        extracted = X.copy()
        for feature_name, feature_function in self.features_list.items():
            # Always read from the original X so that the features cannot
            # influence each other through the working copy.
            extracted[feature_name] = X["text"].apply(feature_function)
        return extracted.drop(columns="text")

def get_dictionnaire(string_list):
    """Return the text-feature extractors selected by ``string_list``.

    Parameters
    ----------
    string_list : str or iterable of str
        * ``"all"``: every available feature, as ``{name: callable}``.
        * ``"combination"``: every non-empty subset of the features, as
          ``{"combination_<i>": {name: callable, ...}}`` (``i`` starts at 1).
        * any other single string: treated as one feature name.
        * an iterable of names: the matching features, in the given order.

    Returns
    -------
    dict
        See above. Unknown names are reported with ``print`` and skipped; a
        non-iterable argument is reported and yields an empty dict.

    Notes
    -----
    Every callable takes one string and returns a number. Note that the key
    ``"presence telephone"`` contains a space, unlike the other keys.
    """
    features = {
        "taille_phrase": lambda x: len(x),
        "Nombre_mot": lambda x: len(x.split()),
        # The TLD class is [A-Za-z]; the former [A-Z|a-z] also accepted a literal '|'.
        # Returns 1/0 like the other presence flags instead of a bool.
        "email": lambda x: 1 if re.search(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}', x) else 0,
        "presence_monnaie": lambda x: 1 if re.search(r'[\$\€\£]', x) else 0,
        "presence telephone": lambda x: 1 if re.search(r'\b\d{10,}\b', x) else 0,
        "presence_caratere_speciaux": lambda x: 1 if re.search(r'[!@#$%^&*(),.?":{}|<>]', x) else 0,
        "proportion_majuscule": lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0,
        "presence_lien": lambda x: 1 if re.search(r'\b(http|www)\S+', x) else 0
    }

    if isinstance(string_list, str):
        if string_list == "all":
            return features
        if string_list == "combination":
            # Enumerate subsets by increasing size, keeping the dict's key order.
            all_combinations = (
                combo
                for size in range(1, len(features) + 1)
                for combo in combinations(features, size)
            )
            return {
                f"combination_{i}": {name: features[name] for name in combo}
                for i, combo in enumerate(all_combinations, 1)
            }
        # A bare string is a single feature name, not a sequence of characters.
        string_list = [string_list]

    selected = {}
    try:
        requested = iter(string_list)
    except TypeError:
        print("La fonction demande une liste de string")
        return selected

    for key in requested:
        try:
            selected[key] = features[key]
        except (KeyError, TypeError):
            # KeyError: unknown name; TypeError: unhashable entry (e.g. a nested list).
            print(f"La fonction {key} n'existe pas dans le dictionnaire")
    return selected
            

def GenerateModel(features_names, model, data, vectorizer=CountVectorizer(stop_words="english"), scaler=StandardScaler()):
    """Build and fit a text-classification pipeline on ``data``.

    Parameters
    ----------
    features_names : dict
        ``{feature name: callable}`` handed to ``DataExtraction``.
    model : estimator
        Final classifier of the pipeline.
    data : DataFrame
        Must contain a ``spam`` column (the target) and a ``text`` column.
    vectorizer : transformer
        Applied to the ``text`` column.
    scaler : transformer
        Last step of the hand-crafted-feature branch.

    Returns
    -------
    Pipeline
        The fitted pipeline (column preparation followed by ``model``).
    """
    labels = data["spam"]
    inputs = data.drop(columns=["spam"])

    # Branch 1: hand-crafted features -> mean imputation -> scaling.
    feature_steps = [
        ("extraction feature", DataExtraction(features_names)),
        ("inputing", SimpleImputer(strategy="mean")),
        ("scaling", scaler),
    ]

    # The feature branch is given ["text"] (list selector), the vectorizer
    # "text" (scalar selector); both read the same column.
    column_preparation = ColumnTransformer(
        transformers=[
            ("features", Pipeline(steps=feature_steps), ["text"]),
            ("vectorisation", vectorizer, "text"),
        ]
    )

    fitted_pipeline = Pipeline(
        steps=[("prep données", column_preparation), ("model", model)]
    )
    fitted_pipeline.fit(inputs, labels)
    return fitted_pipeline




In [106]:
# Load the tab-separated, header-less corpus: column 0 is the label ("spam"),
# column 1 the message ("text").
df = pd.read_csv("BD1.txt", sep="\t", header=None, names=["spam", "text"])

# X1 deliberately keeps the "spam" column: GenerateModel reads the target from
# it and drops it before fitting.
X1, y1 = df, df["spam"]
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)


In [107]:
# Train a LinearSVC on every hand-crafted feature plus the vectorized text.
# "all" yields the same eight features, in the same order, as naming each one.
all_features = get_dictionnaire("all")
model = GenerateModel(model=LinearSVC(), data=X1_train, features_names=all_features)

print(classification_report(y1_test, model.predict(X1_test)))

# Sanity check on two hand-written messages.
sample_messages = pd.DataFrame(
    ['You won 200 billion dollars, call now!', 'Hi, how are you?'],
    columns=["text"],
)
print(model.predict(sample_messages))
model

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       0.99      0.93      0.96       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115

['ham' 'ham']


In [108]:
from sklearn.ensemble import RandomForestClassifier


# Random forests are stochastic: fix the seed (same value as the train/test
# split) so the report printed by this cell is reproducible across runs.
model = GenerateModel(
    model=RandomForestClassifier(random_state=42),
    data=X1_train,
    features_names=get_dictionnaire(["taille_phrase"]),
)
print(classification_report(y1_test, model.predict(X1_test)))
print(model.predict(pd.DataFrame(['You won 200 billion dollars, call now!', 'Hi, how are you?'], columns=["text"])))
model

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

['ham' 'ham']


## Création d'un modèle pour chacune des 255 combinaisons de features

In [109]:
import time
from sklearn.base import clone

def extract_metrics(row, classes=("ham", "spam")):
    """Flatten the per-class part of a classification report into a Series.

    Parameters
    ----------
    row : dict
        Report as a dict: ``{class name: {metric: value}, ...}``. Entries whose
        value is not a dict (for example a scalar accuracy) are ignored.
    classes : collection of str, optional
        Class names to keep. Defaults to ``("ham", "spam")``.

    Returns
    -------
    pandas.Series
        One entry per kept (metric, class) pair, labelled ``"<metric>_<class>"``,
        in the order they appear in ``row``.
    """
    metrics = {}
    for class_name, values in row.items():
        # Skip averages/other classes, and any scalar entry (float *or* int):
        # only nested dicts carry per-class metrics.
        if class_name not in classes or not isinstance(values, dict):
            continue
        for metric, value in values.items():
            metrics[f'{metric}_{class_name}'] = value
    return pd.Series(metrics)

def calculate_precisions_for_all_combinations(X_train, X_test, y_test, model=LinearSVC(), scaler=StandardScaler()):
    """Train and evaluate one pipeline per feature combination.

    Parameters
    ----------
    X_train : DataFrame
        Training data passed to ``GenerateModel`` (which expects the ``spam``
        and ``text`` columns).
    X_test, y_test
        Held-out data and labels used for the classification report.
    model : estimator
        Template classifier; a fresh clone is fitted for every combination.
    scaler : transformer
        Scaler forwarded to ``GenerateModel``.

    Returns
    -------
    DataFrame
        One row per combination with the columns ``combination`` (list of
        feature names), ``accuracy`` (the full ``classification_report`` dict;
        the column name is kept for compatibility), ``time`` (seconds spent
        building and fitting the pipeline) and the flattened per-class metrics
        produced by ``extract_metrics``.
    """
    records = []
    for feature_functions in get_dictionnaire("combination").values():
        # perf_counter is monotonic, unlike time.time, so intervals are reliable.
        start_time = time.perf_counter()
        # Fit a fresh, unfitted copy so no state leaks between combinations.
        model_instance = clone(model)
        pipeline = GenerateModel(
            model=model_instance,
            data=X_train,
            features_names=feature_functions,
            scaler=scaler,
        )
        training_time = time.perf_counter() - start_time

        report = classification_report(y_test, pipeline.predict(X_test), output_dict=True)
        records.append({
            'combination': list(feature_functions.keys()),
            'accuracy': report,
            'time': training_time,
        })

    # Build the frame once: concatenating inside the loop is quadratic and
    # concatenating onto an empty frame triggers a pandas FutureWarning.
    results_df = pd.DataFrame(records, columns=['combination', 'accuracy', 'time'])
    new_columns = results_df["accuracy"].apply(extract_metrics)
    return pd.concat([results_df, new_columns], axis=1)



### Pour LinearSVC

In [110]:
# Evaluate LinearSVC on every feature combination (default StandardScaler).
model = LinearSVC()
df_linear = calculate_precisions_for_all_combinations(
    X1_train,
    X1_test,
    y1_test,
    model=model,
)

  results_df = pd.concat([


In [111]:
# Display the per-combination results table computed with LinearSVC.
df_linear

Unnamed: 0,combination,accuracy,time,precision_ham,recall_ham,f1-score_ham,support_ham,precision_spam,recall_spam,f1-score_spam,support_spam
0,[taille_phrase],"{'ham': {'precision': 0.9857142857142858, 'rec...",0.113523,0.985714,1.000000,0.992806,966.0,1.000000,0.906040,0.950704,149.0
1,[Nombre_mot],"{'ham': {'precision': 0.9857142857142858, 'rec...",0.100098,0.985714,1.000000,0.992806,966.0,1.000000,0.906040,0.950704,149.0
2,[email],"{'ham': {'precision': 0.9847094801223242, 'rec...",0.097777,0.984709,1.000000,0.992296,966.0,1.000000,0.899329,0.946996,149.0
3,[presence_monnaie],"{'ham': {'precision': 0.9857142857142858, 'rec...",0.083925,0.985714,1.000000,0.992806,966.0,1.000000,0.906040,0.950704,149.0
4,[presence telephone],"{'ham': {'precision': 0.9867075664621677, 'rec...",0.118563,0.986708,0.998965,0.992798,966.0,0.992701,0.912752,0.951049,149.0
...,...,...,...,...,...,...,...,...,...,...,...
250,"[taille_phrase, Nombre_mot, email, presence te...","{'ham': {'precision': 0.9867075664621677, 'rec...",0.172554,0.986708,0.998965,0.992798,966.0,0.992701,0.912752,0.951049,149.0
251,"[taille_phrase, Nombre_mot, presence_monnaie, ...","{'ham': {'precision': 0.9887295081967213, 'rec...",0.177465,0.988730,0.998965,0.993821,966.0,0.992806,0.926174,0.958333,149.0
252,"[taille_phrase, email, presence_monnaie, prese...","{'ham': {'precision': 0.9877175025588536, 'rec...",0.159956,0.987718,0.998965,0.993309,966.0,0.992754,0.919463,0.954704,149.0
253,"[Nombre_mot, email, presence_monnaie, presence...","{'ham': {'precision': 0.9887295081967213, 'rec...",0.179569,0.988730,0.998965,0.993821,966.0,0.992806,0.926174,0.958333,149.0


### Pour Naive Bayes

In [112]:
from pandas import DataFrame
from sklearn.naive_bayes import MultinomialNB


# Evaluate MultinomialNB on every feature combination. MinMaxScaler keeps the
# hand-crafted features non-negative, which MultinomialNB requires
# (StandardScaler would produce negative values).
model = MultinomialNB()
df_naive_bayes: DataFrame = calculate_precisions_for_all_combinations(
    X1_train,
    X1_test,
    y1_test,
    model=model,
    scaler=MinMaxScaler(),
)

  results_df = pd.concat([


In [113]:
# Display the per-combination results table computed with MultinomialNB.
df_naive_bayes

Unnamed: 0,combination,accuracy,time,precision_ham,recall_ham,f1-score_ham,support_ham,precision_spam,recall_spam,f1-score_spam,support_spam
0,[taille_phrase],"{'ham': {'precision': 0.9917440660474717, 'rec...",0.129971,0.991744,0.994824,0.993282,966.0,0.965753,0.946309,0.955932,149.0
1,[Nombre_mot],"{'ham': {'precision': 0.9917440660474717, 'rec...",0.078010,0.991744,0.994824,0.993282,966.0,0.965753,0.946309,0.955932,149.0
2,[email],"{'ham': {'precision': 0.9917440660474717, 'rec...",0.079775,0.991744,0.994824,0.993282,966.0,0.965753,0.946309,0.955932,149.0
3,[presence_monnaie],"{'ham': {'precision': 0.9927685950413223, 'rec...",0.079860,0.992769,0.994824,0.993795,966.0,0.965986,0.953020,0.959459,149.0
4,[presence telephone],"{'ham': {'precision': 0.9927611168562565, 'rec...",0.084781,0.992761,0.993789,0.993275,966.0,0.959459,0.953020,0.956229,149.0
...,...,...,...,...,...,...,...,...,...,...,...
250,"[taille_phrase, Nombre_mot, email, presence te...","{'ham': {'precision': 0.9927835051546392, 'rec...",0.152869,0.992784,0.996894,0.994835,966.0,0.979310,0.953020,0.965986,149.0
251,"[taille_phrase, Nombre_mot, presence_monnaie, ...","{'ham': {'precision': 0.9938080495356038, 'rec...",0.136157,0.993808,0.996894,0.995349,966.0,0.979452,0.959732,0.969492,149.0
252,"[taille_phrase, email, presence_monnaie, prese...","{'ham': {'precision': 0.9938080495356038, 'rec...",0.139192,0.993808,0.996894,0.995349,966.0,0.979452,0.959732,0.969492,149.0
253,"[Nombre_mot, email, presence_monnaie, presence...","{'ham': {'precision': 0.9938080495356038, 'rec...",0.134183,0.993808,0.996894,0.995349,966.0,0.979452,0.959732,0.969492,149.0
