In [104]:
import pandas as pd

pathDonnees = "../DataSetBrut/SMSSpamCollection.txt"
trainProportion = 0.2
df = pd.read_csv(
    pathDonnees, delimiter="\t", header=None, names=["spam", "text"]
)

In [105]:
sizeData = len(df.index)
trainSize = int(trainProportion * sizeData)
trainDF = df.sample(trainSize)
testDF = df.drop(trainDF.index)


In [106]:
trainDFSpam = trainDF[trainDF["spam"] == "spam"]
trainDFSpam["text"] = trainDFSpam["text"].str.lower()
dfSplitSpam = trainDFSpam.text.str.split(expand=True).stack().value_counts().to_frame()

dfSplitSpam.columns = ["count_spam"]

trainDFHam = trainDF[trainDF["spam"] == "ham"]
trainDFHam["text"] = trainDFHam["text"].str.lower()
dfSplitHam = trainDFHam.text.str.split(expand=True).stack().value_counts().to_frame()

dfSplitHam.columns = ["count_ham"]

total = dfSplitSpam.join(dfSplitHam, lsuffix='_spam', rsuffix='_ham')
total.fillna(0.001, inplace=True)

spamTotal = total["count_spam"].sum()
hamTotal = total["count_ham"].sum()
total["vraisemblance"] = (total["count_spam"] / spamTotal) / (total["count_ham"] / hamTotal)

trainDF["char_length"] = trainDF["text"].apply(len)
testDF["char_length"] = testDF["text"].apply(len)

avg_spam_length = trainDFSpam["text"].apply(len).mean()
avg_ham_length = trainDFHam["text"].apply(len).mean()

char_length_likelihood_spam = avg_spam_length / (avg_spam_length + avg_ham_length)
char_length_likelihood_ham = avg_ham_length / (avg_spam_length + avg_ham_length)

spamProbability = spamTotal / (spamTotal + hamTotal)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [107]:
total

Unnamed: 0,count_spam,count_ham,vraisemblance
to,117,347.000,0.805144
a,70,233.000,0.717397
call,68,49.000,3.313828
your,48,86.000,1.332785
you,41,334.000,0.293126
...,...,...,...
mind.,1,1.000,2.387906
o2fwd,1,0.001,2387.905666
18p/txt,1,0.001,2387.905666
4882,1,0.001,2387.905666


In [108]:
df = total.sort_values("vraisemblance", ascending=False).head(40)

In [109]:
from matplotlib.axis import YAxis
import plotly.express as px
import numpy as np
violin = total[(total["vraisemblance"] > 0) & (total["vraisemblance"] < 10)]
px.violin(violin, y="vraisemblance")

In [110]:
def TestPhraseList(phraseList, char_length, spamProba=1):
    for word in phraseList:
        vraisemblance = total["vraisemblance"].get(word, "NotFound")
        if vraisemblance != "NotFound":
            spamProba *= vraisemblance
    if char_length > avg_spam_length:
        spamProba *= char_length_likelihood_spam
    else:
        spamProba *= char_length_likelihood_ham
    return spamProba

def TestPhrase(phrase: str, spamProba=1):
    phraseList = phrase.split()
    char_length = len(phrase)
    return TestPhraseList(phraseList, char_length, spamProba=spamProba)

print(TestPhrase("Hi how are you? I have a wonderful proposition for you! Gain your wonderful prize by clicking on this link"))


99.67824489207344


In [111]:
testDF["text"] = testDF["text"].str.split()

def calculate_metrics():
    testDF["Finalresult"] = testDF["text"].apply(lambda x: "spam" if TestPhrase(" ".join(x)) > spamProbability else "ham")
    wrong = len(testDF[testDF["Finalresult"] != testDF["spam"].index])
    FP = len(testDF[(testDF["Finalresult"] == "spam") & (testDF["spam"] == "ham")].index)
    FN = len(testDF[(testDF["Finalresult"] == "ham") & (testDF["spam"] == "spam")].index)
    VP = len(testDF[(testDF["Finalresult"] == "ham") & (testDF["spam"] == "ham")].index)
    VN = len(testDF[(testDF["Finalresult"] == "spam") & (testDF["spam"] == "spam")].index)
    exactitude = (VN + VP) / (VN + FN + FP + VP)
    print(f"Exactitude : {exactitude}")
    rappel = VP / (VP + FN)
    print(f"Rappel : {rappel}")
    print(f"Rappel négatif : {VN / (VN + FP)}")
    precision = VP / (VP + FP)
    print(f"Précision : {precision}")
    print(f"Précision négative : {VN / (VN + FN)}")
    F1Score = (2 * precision * rappel) / (precision + rappel)
    print(f"F1-Score : {F1Score}")
    print(f"F1-Score négatif : {(2 * VN / (VN + FN) * VN / (VN + FP)) / (VN / (VN + FN) + VN / (VN + FP))}")

calculate_metrics()

Exactitude : 0.8568864961866308
Rappel : 0.9834237492465341
Rappel négatif : 0.4885964912280702
Précision : 0.8484139365574623
Précision négative : 0.9101307189542484
F1-Score : 0.9109436069235064
F1-Score négatif : 0.6358447488584476


## Validation croisée

In [112]:
import pandas as pd
from sklearn.model_selection import KFold

pathDonnees = "../DataSetBrut/SMSSpamCollection.txt"
trainProportion = 0.2
df = pd.read_csv(
    pathDonnees, delimiter="\t", header=None, names=["spam", "text"]
)

def train_and_evaluate(trainDF, testDF):
    trainDFSpam = trainDF[trainDF["spam"] == "spam"]
    trainDFSpam["text"] = trainDFSpam["text"].str.lower()
    dfSplitSpam = trainDFSpam.text.str.split(expand=True).stack().value_counts().to_frame()
    dfSplitSpam.columns = ["count_spam"]

    trainDFHam = trainDF[trainDF["spam"] == "ham"]
    trainDFHam["text"] = trainDFHam["text"].str.lower()
    dfSplitHam = trainDFHam.text.str.split(expand=True).stack().value_counts().to_frame()
    dfSplitHam.columns = ["count_ham"]

    total = dfSplitSpam.join(dfSplitHam, lsuffix='_spam', rsuffix='_ham')
    total.fillna(0.001, inplace=True)

    spamTotal = total["count_spam"].sum()
    hamTotal = total["count_ham"].sum()
    total["vraisemblance"] = (total["count_spam"] / spamTotal) / (total["count_ham"] / hamTotal)

    trainDF["char_length"] = trainDF["text"].apply(len)
    testDF["char_length"] = testDF["text"].apply(len)

    avg_spam_length = trainDFSpam["text"].apply(len).mean()
    avg_ham_length = trainDFHam["text"].apply(len).mean()

    char_length_likelihood_spam = avg_spam_length / (avg_spam_length + avg_ham_length)
    char_length_likelihood_ham = avg_ham_length / (avg_spam_length + avg_ham_length)

    spamProbability = spamTotal / (spamTotal + hamTotal)

    def TestPhraseList(phraseList, char_length, spamProba=1):
        for word in phraseList:
            vraisemblance = total["vraisemblance"].get(word, "NotFound")
            if vraisemblance != "NotFound":
                spamProba *= vraisemblance
        if char_length > avg_spam_length:
            spamProba *= char_length_likelihood_spam
        else:
            spamProba *= char_length_likelihood_ham
        return spamProba

    def TestPhrase(phrase: str, spamProba=1):
        phraseList = phrase.split()
        char_length = len(phrase)
        return TestPhraseList(phraseList, char_length, spamProba=spamProba)

    testDF["text"] = testDF["text"].str.split()

    def calculate_metrics():
        testDF["Finalresult"] = testDF["text"].apply(lambda x: "spam" if TestPhrase(" ".join(x)) > spamProbability else "ham")
        FP = len(testDF[(testDF["Finalresult"] == "spam") & (testDF["spam"] == "ham")].index)
        FN = len(testDF[(testDF["Finalresult"] == "ham") & (testDF["spam"] == "spam")].index)
        VP = len(testDF[(testDF["Finalresult"] == "ham") & (testDF["spam"] == "ham")].index)
        VN = len(testDF[(testDF["Finalresult"] == "spam") & (testDF["spam"] == "spam")].index)
        exactitude = (VN + VP) / (VN + FN + FP + VP)
        rappel = VP / (VP + FN)
        rappel_negatif = VN / (VN + FP)
        precision = VP / (VP + FP)
        precision_negatif = VN / (VN + FN)
        F1Score = (2 * precision * rappel) / (precision + rappel)
        F1Score_negatif = (2 * precision_negatif * rappel_negatif) / (precision_negatif + rappel_negatif)
        return exactitude, rappel, precision, F1Score, rappel_negatif, F1Score_negatif, precision_negatif

    return calculate_metrics()

kf = KFold(n_splits=5)
metrics = []

for train_index, test_index in kf.split(df):
    trainDF = df.iloc[train_index]
    testDF = df.iloc[test_index]
    metrics.append(train_and_evaluate(trainDF, testDF))

average_metrics = pd.DataFrame(metrics, columns=["Exactitude", "Rappel", "Précision", "F1-Score", "rappel_negatif", "F1Score_negatif", "precision_negatif"]).mean()
print("Moyenne des indicateurs :")
print(average_metrics)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Moyenne des indicateurs :
Exactitude           0.877420
Rappel               0.989153
Précision            0.868018
F1-Score             0.924613
rappel_negatif       0.523438
F1Score_negatif      0.671377
precision_negatif    0.937791
dtype: float64




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

