In [24]:
import os
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import numpy as np
import pandas as pd
from pyorc import Reader
from pyorc.enums import StructRepr
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from unidecode import unidecode

nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/quentin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/quentin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [93]:
path = '/home/quentin/Dev/Spam-detector-pipeline/out'

files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(path):
    for file in f:
        if '.orc' in file:
            files.append(os.path.join(r, file))
messages = []
for f in files:
    file = open(f, "rb")
    reader = Reader(file, struct_repr=StructRepr.DICT)
    for message in reader:
        messages.append(message)
        
dataset = pd.DataFrame(messages)
dataset = dataset[dataset['language']=='fr']

# Remove documents of unknow languages
# dataset.drop(dataset[dataset['language']==''].index, axis=0, inplace=True)

dataset['text_with_lang'] = dataset[['language', 'text']].agg(' '.join, axis=1)

nb_messages = len(dataset)
print(f'Loaded {nb_messages} messages in the dataset')
nb_hams = len(dataset[dataset['isSpam']==0])
print(f'Number of non-spam messages: {nb_hams} ({nb_hams / nb_messages * 100}%)')
nb_spams = len(dataset[dataset['isSpam']==1])
print(f'Number of spam messages: {nb_spams} ({nb_spams / nb_messages * 100}%)')

Loaded 340 messages in the dataset
Number of non-spam messages: 208 (61.1764705882353%)
Number of spam messages: 132 (38.82352941176471%)


In [77]:
# We are going to stem the words of the most common languages of the dataset
languages = {
    'fr': 'french',
    'en': 'english',
    'ru': 'russian',
    'de': 'german'
}

def multi_language_tokenizer(string):
    tokens = word_tokenize(string)
    tokens = [t for t in tokens[1:] if len(t) > 2]
    
    language = languages[tokens[0]] if tokens[0] in languages else None
    
    if language:
        stop_words = stopwords.words(language)
        tokens = [t for t in tokens if t not in stop_words]
        
        stemmer = SnowballStemmer(language)
        tokens = [stemmer.stem(t) for t in tokens]
    
    return tokens
    

In [115]:
def save_my_url(string):
    return re.sub(r'(http://)?(www.)?petit-gite-langon.fr', ' petit gite langon ', string)

def replace_urls(string):
    return re.sub(r'(href=)?http\S*', 'URL', string)

def replace_prices(string):
    return re.sub(r'([\$]\s?[\d,.]*|[\d,.]*€)', 'PRICE', string)

def remove_special_characters(string):
    words = re.sub(r'\W+', ' ', string)
    return re.sub(r'(lt|gt)', '', words)

def replace_owner_name(string):
    return re.sub(r'(francoise|françoise|daniel|bontemps|dayot)', 'OWNER_NAME', string)

def replace_date(string):
    return re.sub(r'\d+/\d\d', 'DATE', string)

def replace_hours(string):
    return re.sub(r'\d+[h:](\d+)?', 'HOUR', string)

def replace_years(string):
    return re.sub(r'20\d\d', 'YEAR', string)

def replace_sex(string):
    return re.sub(r'sex', ' SEX ', string)

def clean_strings(string):
    return remove_special_characters(replace_date(replace_years(replace_hours(
        replace_urls(save_my_url(replace_prices(replace_owner_name(replace_sex(unidecode(string.lower()))))))))))

dataset['clean_words'] = dataset['text_with_lang'].apply(clean_strings)

In [130]:
tfidf = TfidfVectorizer(tokenizer=multi_language_tokenizer, max_df=0.5)
X = tfidf.fit_transform(dataset['clean_words'])
y = dataset['isSpam']

sum_words = X.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in tfidf.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
print(words_freq)

[('url', 22.38039851957462), ('nous', 21.204438841848305), ('sex', 20.85418348008098), ('votre', 19.203317178429977), ('vous', 19.100533287966915), ('dans', 17.38340805284847), ('ville', 15.194092526022718), ('les', 13.36017037676243), ('bonjour', 13.17388992110847), ('merci', 12.726843549797207), ('filles', 11.502778615054659), ('est', 10.871953597190812), ('cordialement', 10.210769986335649), ('owner_name', 10.036275112820572), ('une', 9.90840679123433), ('hour', 9.655407189427395), ('par', 9.565391155181041), ('gite', 8.164605375669975), ('que', 7.890731267650579), ('femmes', 7.202009625434649), ('bien', 7.0159666360047), ('des', 6.817361705375959), ('vers', 6.505463135339891), ('nuit', 6.385750955502104), ('3000', 6.082863088835974), ('gagner', 5.626699379752372), ('bonne', 5.610522376779133), ('sommes', 5.502745842406053), ('rencontrer', 5.459703102247516), ('comment', 5.429347390196353), ('meilleurs', 5.319309914630667), ('mon', 5.264234511425764), ('date', 5.167964771804834), ('

In [117]:
def train_model(model, X, y):
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.33, random_state=4)
    model.fit(Xtrain, Ytrain)
    print(f"Train score: {model.score(Xtrain, Ytrain)}.")
    print(f"Test score: {model.score(Xtest, Ytest)}.")

    return model
    
def analyze_model(model, dataset, vectorizer):
    X = tfidf.transform(dataset['clean_words'])
    dataset['prediction'] = model.predict(X)
    probabilities = model.predict_proba(X)
    dataset['proba_0'] = [proba[0] for proba in probabilities]
    dataset['proba_1'] = [proba[1] for proba in probabilities]
    
    false_positives = dataset[(dataset['prediction']==1) & (dataset['isSpam']==0)]
    predicted_spams = dataset[(dataset['prediction']==1)]
    nb_predicted_spam = len(predicted_spams)
    percentage_false_positives = len(false_positives) / nb_predicted_spam * 100 if nb_predicted_spam > 0 else 0
    print('Percentage of false positives: {} of {}.'.format(
        percentage_false_positives, 
        nb_predicted_spam
    ))
    
    false_negatives = dataset[(dataset['prediction']==0) & (dataset['isSpam']==1)]
    predicted_ham = dataset[(dataset['prediction']==0)]
    nb_predicted_ham = len(predicted_ham)
    percentage_false_negatives = len(false_negatives) / nb_predicted_ham * 100 if nb_predicted_ham > 0 else 0
    print('Percentage of false negatives: {} of {}.'.format(
        percentage_false_negatives,
        nb_predicted_ham
    ))

In [123]:
multinomialNB_model = train_model(MultinomialNB(), X, y)
analyze_model(multinomialNB_model, dataset, tfidf)

Train score: 0.9955947136563876.
Test score: 0.9734513274336283.
Percentage of false positives: 0.0 of 128.
Percentage of false negatives: 1.8867924528301887 of 212.


In [124]:
# ada_boost_model = train_model(AdaBoostClassifier(), X, y)
# analyze_model(ada_boost_model, dataset, tfidf)

In [131]:
cv = ShuffleSplit(n_splits=5, test_size=0.1, random_state=0)
scores = cross_val_score(MultinomialNB(), X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.99 (+/- 0.02)


In [126]:
false_positives = dataset[(dataset['prediction']==1) & (dataset['isSpam']==0)][['clean_words', 'proba_0', 'proba_1']]
print(false_positives)

Empty DataFrame
Columns: [clean_words, proba_0, proba_1]
Index: []


In [132]:
test = "Bonjour madame Bontemps. Je serais intéressé par un échange de bannière avec votre site. Le mien est http://www.example.fr"
cleaned_test = clean_strings(test)

X = tfidf.transform([cleaned_test])
print(multinomialNB_model.predict(X))

def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
 
# show_most_informative_features(tfidf, multinomialNB_model)

[False]


In [96]:
validation_set = pd.read_csv("/home/quentin/Dev/Spam-detector-pipeline/validation.csv")
validation_set['text_with_lang'] = validation_set[['language', 'text']].agg(' '.join, axis=1)
validation_set['clean_words'] = validation_set['text_with_lang'].apply(clean_strings)
X = tfidf.transform([validation_set['clean_words']])
y = validation_set['isSpam']

analyze_model(ada_boost_model, validation_set, tfidf)

false_positives = validation_set[(validation_set['prediction']==1) & (validation_set['isSpam']==0)]['clean_words']
print(false_positives.values)

KeyError: "['language'] not in index"