In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import re
import pandas as pd
from sklearn.svm import LinearSVC
from itertools import combinations

In [115]:
from math import e

from sklearn.preprocessing import MinMaxScaler


class DataExtraction(BaseEstimator, TransformerMixin):
    def __init__(self, features_list):
        self.features_list = features_list
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        retour = X
        for feature_name, feature_function in self.features_list.items():
            retour[feature_name] = X["text"].apply(feature_function)
        retour = retour.drop(columns = "text")
        return retour

def get_dictionnaire(string_list):
    features = {
        "taille_phrase":lambda x: len(x),
        "Nombre_mot":lambda x: len(x.split()),
        "email": lambda x: len(re.findall(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}', x)) > 0,
        "presence_monnaie": lambda x: 1 if re.search(r'[\$\€\£]', x) else 0 ,
        "presence telephone": lambda x: 1 if re.search(r'\b\d{10,}\b', x) else 0,
        "presence_caratere_speciaux": lambda x: 1 if re.search(r'[!@#$%^&*(),.?":{}|<>]', x) else 0,
        "proportion_majuscule": lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0,
        "presence_lien": lambda x: 1 if re.search(r'\b(http|www)\S+', x) else 0
    }
    if string_list == "all":
        return features
    elif string_list == "combination":
        # Generate all possible combinations
        all_combinations = []
        for r in range(1, len(features) + 1):
            combinations_r = list(combinations(features.keys(), r))
            all_combinations.extend(combinations_r)

        # Create a dictionary of all combinations
        combinations_dict = {}
        for i, combo in enumerate(all_combinations, 1):
            combinations_dict[f"combination_{i}"] = {k: features[k] for k in combo}
        return combinations_dict
    else:
        dict = {}
        try:
            for key in string_list:
                try:
                    dict[key] = features[key]
                except:
                    print(f"La fonction {key} n'existe pas dans le dictionnaire")
        except:
            print(f"La fonction demande une liste de string")
        return dict
            

def GenerateModel(features_names, model, data,vectorizer=TfidfVectorizer(stop_words="english"), scaler=StandardScaler()) :
    target = data["spam"]
    data = data.drop(columns=["spam"])
    featurePipe = Pipeline(steps=[("extraction feature",DataExtraction(features_names)),("inputing",SimpleImputer(strategy="mean")),("scaling",scaler)])
    preparation = ColumnTransformer(transformers=
                              [("features",featurePipe,["text"]),
                               ("vectorisation",vectorizer,"text")]
                              )

    modelPipe = Pipeline(steps=[("prep données",preparation),("model",model)])
    modelPipe.fit(data,target)
    return modelPipe




In [4]:
df = pd.read_csv("./DataSetBrut/BD1.txt",sep="\t",header=None,names=["spam","text"])
y1 = df["spam"]
X1 = df
X1_train, X1_test, y1_train, y1_test = train_test_split(X1,y1 , test_size=0.2, random_state=42)


In [None]:
model = GenerateModel(model=LinearSVC(),data=X1_train, features_names=get_dictionnaire(["taille_phrase", "Nombre_mot", "email", "presence_monnaie", "presence telephone", "presence_caratere_speciaux", "proportion_majuscule", "presence_lien"]))
print(classification_report(y1_test,model.predict(X1_test)))
print(model.predict(pd.DataFrame(['You won 200 billion dollars, call now!', 'Hi, how are you?'], columns=["text"])))
model

In [None]:
from sklearn.ensemble import RandomForestClassifier


model = GenerateModel(model=RandomForestClassifier(),data=X1_train, features_names=get_dictionnaire("all"))
print(classification_report(y1_test,model.predict(X1_test)))
print(model.predict(pd.DataFrame(['You won 200 billion dollars, call now!', 'Hi, how are you?'], columns=["text"])))
model

## Création de modèle pour les 255 combinaisons de features

In [119]:
import time
from sklearn.base import clone

def extract_metrics(row):
    metrics = {}
    for class_name, values in row.items():
        if not isinstance(values, float):
            for metric, value in values.items():
                if class_name in ['ham', 'spam']:
                    metrics[f'{metric}_{class_name}'] = value
    return pd.Series(metrics)

def calculate_precisions_for_all_combinations(X_train, X_test, y_test, model=LinearSVC(), scaler=StandardScaler()):
    results_df = pd.DataFrame(columns=['combination', 'accuracy', 'time'])
    combinations_dict = get_dictionnaire("combination")
    for combination, dict in combinations_dict.items():
        start_time = time.time()
        # Create a fresh instance of the model for each iteration
        model_instance = clone(model)
        pipeline = GenerateModel(model=model_instance, data=X_train, features_names=dict, scaler=scaler)
        end_time = time.time()
        training_time = end_time - start_time
        accuracy = classification_report(y_test, pipeline.predict(X_test), output_dict=True)
        results_df = pd.concat([
            results_df, 
            pd.DataFrame({
                'combination': [list(dict.keys())], 
                'accuracy': [accuracy],
                'time': [training_time]
            })
        ])
    results_df.reset_index(drop=True, inplace=True)
    new_columns = results_df["accuracy"].apply(extract_metrics)
    df_precisions = pd.concat([results_df, new_columns], axis=1)
    return df_precisions[["combination", "accuracy", "time", "recall_ham", "recall_spam"]]



### Pour LinearSVC

In [2]:
from EvaluateModelsFeatures import calculate_precisions_for_all_combinations

In [7]:
model = LinearSVC()
df_linear = calculate_precisions_for_all_combinations(X1_train, y1_train,  X1_test, y1_test, model=model)

  results_df = pd.concat([


In [None]:
df_linear

### Pour Naive Bayes

In [None]:
from pandas import DataFrame
from sklearn.naive_bayes import MultinomialNB


model = MultinomialNB()
df_naive_bayes: DataFrame = calculate_precisions_for_all_combinations(X1_train,  X1_test, y1_test, model=model, scaler=MinMaxScaler())

In [None]:
df_naive_bayes

### Pour Regression logistique

In [None]:
from pandas import DataFrame
from sklearn.linear_model import LogisticRegression


model = LogisticRegression()
df_logistic_regression: DataFrame = calculate_precisions_for_all_combinations(X1_train,  X1_test, y1_test, model=model, scaler=MinMaxScaler())

In [None]:
df_logistic_regression

## Pour regression linéaire

In [12]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
# 91% de rappel spam avec ['taille_phrase', 'presence_monnaie', 'presence telephone', 'presence_caratere_speciaux', 'proportion_majuscule']
df_linear = calculate_precisions_for_all_combinations(X1_train, y1_train,  X1_test, y1_test, model=model)

  results_df = pd.concat([


In [13]:
df_linear

Unnamed: 0,combination,accuracy,time,recall_ham,recall_spam
0,[taille_phrase],"{'ham': {'precision': 0.9787234042553191, 'rec...",1.254804,1.000000,0.859060
1,[Nombre_mot],"{'ham': {'precision': 0.972809667673716, 'reca...",1.233293,1.000000,0.818792
2,[email],"{'ham': {'precision': 0.9777327935222672, 'rec...",1.266598,1.000000,0.852349
3,[presence_monnaie],"{'ham': {'precision': 0.9807106598984772, 'rec...",1.247652,1.000000,0.872483
4,[presence telephone],"{'ham': {'precision': 0.9836901121304791, 'rec...",1.183271,0.998965,0.892617
...,...,...,...,...,...
250,"[taille_phrase, Nombre_mot, email, presence te...","{'ham': {'precision': 0.9826883910386965, 'rec...",1.143664,0.998965,0.885906
251,"[taille_phrase, Nombre_mot, presence_monnaie, ...","{'ham': {'precision': 0.9817073170731707, 'rec...",1.051717,1.000000,0.879195
252,"[taille_phrase, email, presence_monnaie, prese...","{'ham': {'precision': 0.9826883910386965, 'rec...",1.099468,0.998965,0.885906
253,"[Nombre_mot, email, presence_monnaie, presence...","{'ham': {'precision': 0.9856996935648621, 'rec...",1.071999,0.998965,0.906040
