In [1]:
import re, glob, random, math
from collections import defaultdict, Counter

In [2]:
def tokenize(message):
    message = message.lower()                   #Zamień na małe litery.
    all_words = re.findall("[a-z0-9]+",message) #Wyciągnij slowa.
    return set(all_words)                       #Usuń duplikaty.

def count_words(training_set):
    """zbior treningowy to para (message,is_spam)"""
    counts = defaultdict(lambda: [0,0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1
    return counts

def word_probabilities(counts,total_spams,total_non_spams,k=0.5):
    """zwrocenie 3-elementowej listy zawierajace slowo, prawdopodobienstwo wystapienia w spamie i prawdopodobienstwa nie bycia spamem"""
 
    return [(w,(spam +k)/(total_spams + 2 *k),
            (non_spam + k)/(total_non_spams +2 * k))
            for w,(spam,non_spam) in counts.items()]

def spam_probability(word_probs, message):
    """prawdopodbienstwo wystapienia slow w celu przypisania prawdopodobienstw do wiadomosci"""
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [3]:

class NaiveBayesClassifier:
    def __init__(self,k=0.5):
        self.k = k
        self.word_probs = []
        
    def train(self,training_set):
        num_spams = len([is_spam for message, is_spam in training_set if is_spam])
        num_non_spams = len(training_set) - num_spams
        
        #Przetwórz treningowy zbiór danych
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts,num_spams,num_non_spams,self.k)
        
    def classify(self,message):
        return spam_probability(self.word_probs,message)

In [4]:
path = r"C:\Users\kamil\data_science\spam*\\*\\**"

data = []

for fn in glob.glob(path):
    is_spam = "ham" not in fn

    with open(fn,'r',encoding='ISO-8859-1') as file:
        for line in file:
            if line.startswith("Subject:"):
                subject = re.sub(r"^Subject: ","", line).strip()
                data.append((subject, is_spam))

#metoda z pliku machine_learning
def split_data(data, prob):
    """split data into fractions [prob, 1 - prob]"""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results

#dodatkowa metoda, ktora na podstawie twierdzenia bayesa oblicza prawdopodobienstwa spamu
def p_spam_given_word(word_prob):
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [5]:
#podzielenie zbioru na treningowy i testowy oraz zbudowanie klasyfikatora
random.seed(0)
train_data,test_data = split_data(data,0.8)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

classified = [(subject, is_spam,classifier.classify(subject)) for subject,is_spam in test_data]
counts = Counter((is_spam,spam_probability > 0.5) for _,is_spam,spam_probability in classified)

classified.sort(key=lambda row: row[2])
#Największe prawdopodbieństwo spamu uzyskane wsród wiadomości niebedacych spamem

spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]

#Najmniejsze spamu uzyskane wsród wiadomości niebedacych spamem

hammiest_spams = list(filter(lambda row: row[1], classified))[:5]

words = sorted(classifier.word_probs, key=p_spam_given_word)

#najwieksze prawdopodbieństwo że wiadomość jest spamem
spammiest_words = words[-5:]
#Najwieksze prawdopodbieństwo że wiadomość nie jest spamem
hammiest_words = words[:5]

print("spammiest_hams:",'\n', spammiest_hams,'\n')
print("hammiest_spams",'\n', hammiest_spams,'\n')
print("spammiest_words",'\n', spammiest_words,'\n')
print("hammiest_words",'\n', hammiest_words)

spammiest_hams: 
 [('Buy Ryanair Travel Insurance Today', False, 0.9388516349179015), ('Automated 30 day renewal reminder 2002-05-27', False, 0.9518481976548178), ('Are bigger notebooks better?', False, 0.9535929614593901), ('"I meditated in a cave for 12 years and now I\'m here to tell you', False, 0.9838874066074335), ('FREE SHIPPING! No Minimum Purchase* at Buy.com', False, 0.9909761026638615)] 

hammiest_spams 
 [('Place Your LTC Declines with Us', True, 0.0056621575326712865), ('The Flight to Safety is Upon Us', True, 0.006312066148419511), ('Lease Deal', True, 0.007456610072298607), ('Looking for property in SPAIN?', True, 0.01072369258736404), ('Outstanding Opportunities for "Premier Producers"', True, 0.01114127099442387)] 

spammiest_words 
 [('zzzz', 0.026785714285714284, 0.00021872265966754156), ('sale', 0.029336734693877552, 0.00021872265966754156), ('money', 0.029336734693877552, 0.00021872265966754156), ('systemworks', 0.03188775510204082, 0.00021872265966754156), ('adv',