<a href="https://colab.research.google.com/github/Vukhmt02/naive-bayes-project/blob/main/codethurac.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import math
import re

import pandas as pd
# Read the tab-separated corpus. header=None means the first line is treated
# as data, so the two columns are named explicitly.
sms = pd.read_csv(
    '/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
)

# Sanity check: dimensions, a short preview, and the share of each label.
print("Kích thước dữ liệu:", sms.shape)
print(sms.head())
label_share = sms['Label'].value_counts(normalize=True)
print(label_share)

Kích thước dữ liệu: (5572, 2)
  Label                                                SMS
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Label
ham     0.865937
spam    0.134063
Name: proportion, dtype: float64


In [None]:
# Shuffle all rows once; the fixed random_state makes the shuffle repeatable.
data_random = sms.sample(frac=1, random_state=1)

# Row position of the cut: the first 80% of the shuffled rows go to training.
index = round(len(data_random) * 0.8)

# Rows before the cut form the training set (huanluyen), the rest the test
# set; each part gets a fresh 0..n-1 index.
huanluyen = data_random.iloc[:index].reset_index(drop=True)
test = data_random.iloc[index:].reset_index(drop=True)

# Report the size of each part, then the label proportions of each part.
for part in (huanluyen, test):
    print(part.shape)
for part in (huanluyen, test):
    print(part['Label'].value_counts(normalize=True))


(4458, 2)
(1114, 2)
Label
ham     0.86541
spam    0.13459
Name: proportion, dtype: float64
Label
ham     0.868043
spam    0.131957
Name: proportion, dtype: float64


In [None]:
# Before cleaning
print(huanluyen.head(3))

# Turn every training message into a list of lowercase tokens:
#   1. replace each non-word character (anything other than letters, digits
#      and underscore) with a space,
#   2. lowercase the text,
#   3. split on whitespace.
# regex=True is passed explicitly: since pandas 2.0, Series.str.replace treats
# the pattern as a literal string by default, which would leave punctuation in
# place and make the training tokens disagree with the
# re.sub(r'\W', ' ', ...) tokenisation used when classifying a message.
# NOTE: this overwrites the 'SMS' column with lists, so re-create `huanluyen`
# (re-run the split cell) before running this cell a second time.
huanluyen['SMS'] = (
    huanluyen['SMS']
    .str.replace(r'\W', ' ', regex=True)
    .str.lower()
    .str.split()
)

  Label                                            SMS
0   ham                   Yep, by the pretty sculpture
1   ham  Yes, princess. Are you going to make me moan?
2   ham                     Welp apparently he retired


In [None]:
# Build the list of unique words that occur in the training messages.
# - The set comprehension deduplicates in a single pass instead of rebuilding
#   a set from the whole list after every message.
# - The loop variable is named `tokens`, not `sms`, so the DataFrame called
#   `sms` that was loaded at the top of the notebook is not overwritten.
# - Sorting gives the vocabulary a stable order, so anything built from it
#   (e.g. one column per word) is laid out the same way on every run.
vocab = sorted({word for tokens in huanluyen['SMS'] for word in tokens})
print('Số lượng từ vựng:', len(vocab))

Số lượng từ vựng: 11860


In [None]:
# number_appear[word][i] is the number of times `word` occurs in training
# message i; every counter starts at zero.
number_appear = {unique_word: [0] * len(huanluyen) for unique_word in vocab}

# The loop variables are named row_idx/tokens rather than index/sms so that
# the split position `index` and the raw `sms` DataFrame defined in earlier
# cells are not clobbered (re-running the split cell would otherwise use a
# wrong cut point).
for row_idx, tokens in enumerate(huanluyen['SMS']):
    for word in tokens:
        number_appear[word][row_idx] += 1

# One count column per vocabulary word, placed next to the Label/SMS columns.
word_counts = pd.DataFrame(number_appear)
train_extend = pd.concat([huanluyen, word_counts], axis=1)
train_extend.head()

Unnamed: 0,Label,SMS,maangalyam,bar,expect,madstini,incident,recharged,onwards,60,...,tonite,atm,steam,billed,stupid,want,px3748,term,pthis,anymore
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Split the training table by label.
spam_messages = train_extend.loc[train_extend['Label'] == 'spam']
ham_messages = train_extend.loc[train_extend['Label'] == 'ham']

# Class priors P(spam) and P(ham): share of training messages in each class.
n_train = len(train_extend)
p_spam = len(spam_messages) / n_train
p_ham = len(ham_messages) / n_train

# Total number of tokens in each class, vocabulary size, smoothing constant.
n_spam = spam_messages['SMS'].apply(len).sum()
n_ham = ham_messages['SMS'].apply(len).sum()
n_vocab = len(vocab)
alpha = 1  # Laplace smoothing

# Conditional probabilities with additive smoothing:
#   P(word | class) = (count of word in class + alpha)
#                     / (tokens in class + alpha * vocabulary size)
spam_denominator = n_spam + alpha * n_vocab
ham_denominator = n_ham + alpha * n_vocab

parameters_spam = {
    word: (spam_messages[word].sum() + alpha) / spam_denominator
    for word in vocab
}
parameters_ham = {
    word: (ham_messages[word].sum() + alpha) / ham_denominator
    for word in vocab
}


In [None]:
def classify(message, prior_spam=None, prior_ham=None,
             word_probs_spam=None, word_probs_ham=None):
    """Print the spam/ham scores of ``message`` and the resulting label.

    The message is tokenised by replacing non-word characters with spaces,
    lowercasing and splitting on whitespace. Each class score is the class
    prior multiplied by the per-word probability of every token found in that
    class's table; tokens missing from a table are skipped. The scores are
    unnormalised, although they are printed under the ``P(Spam|message)`` /
    ``P(Ham|message)`` captions.

    The scores are accumulated as sums of logarithms and the label is chosen
    by comparing those sums. Multiplying the raw probabilities underflows to
    0.0 for long messages, which made both scores compare equal.

    Args:
        message: Raw message text.
        prior_spam: P(spam). Defaults to the notebook-level ``p_spam``.
        prior_ham: P(ham). Defaults to the notebook-level ``p_ham``.
        word_probs_spam: Mapping word -> P(word | spam). Defaults to the
            notebook-level ``parameters_spam``.
        word_probs_ham: Mapping word -> P(word | ham). Defaults to the
            notebook-level ``parameters_ham``.

    Returns:
        None. The two scores and the label are written to stdout.
    """
    import math  # local import keeps the function usable on its own

    # Fall back to the model fitted earlier in the notebook.
    if prior_spam is None:
        prior_spam = p_spam
    if prior_ham is None:
        prior_ham = p_ham
    if word_probs_spam is None:
        word_probs_spam = parameters_spam
    if word_probs_ham is None:
        word_probs_ham = parameters_ham

    def _log(prob):
        # math.log(0) raises; an impossible event gets a score of -inf.
        return math.log(prob) if prob > 0 else float('-inf')

    tokens = re.sub(r'\W', ' ', message).lower().split()

    log_spam = _log(prior_spam)
    log_ham = _log(prior_ham)
    for word in tokens:
        if word in word_probs_spam:
            log_spam += _log(word_probs_spam[word])
        if word in word_probs_ham:
            log_ham += _log(word_probs_ham[word])

    # Shown as probabilities for readability; these may display as 0.0 for
    # very long messages, but the decision below uses the log scores.
    print('P(Spam|message):', math.exp(log_spam))
    print('P(Ham|message):', math.exp(log_ham))

    if log_ham > log_spam:
        print('Label: Ham')
    elif log_spam > log_ham:
        print('Label: Spam')
    else:
        print('Xác suất bằng nhau!')

In [None]:
# Try the classifier on two hand-written messages.
for demo_message in (
    "You won a motorbike! Claim now at http://abc.xyz",
    "Your account received $20. Balance: $90.",
):
    classify(demo_message)

P(Spam|message): 1.302097874551174e-18
P(Ham|message): 2.8771317212829296e-22
Label: Spam
P(Spam|message): 7.866056564070012e-18
P(Ham|message): 3.608121177544094e-19
Label: Spam


In [None]:
def classify_test(message, prior_spam=None, prior_ham=None,
                  word_probs_spam=None, word_probs_ham=None):
    """Return the predicted label for ``message``.

    The message is tokenised by replacing non-word characters with spaces,
    lowercasing and splitting on whitespace. Each class score is the class
    prior combined with the per-word probability of every token found in that
    class's table; tokens missing from a table are skipped.

    The scores are accumulated as sums of logarithms. Multiplying the raw
    probabilities underflows to 0.0 for long messages, which made both scores
    compare equal and sent the message to 'needs human classification'.

    Args:
        message: Raw message text.
        prior_spam: P(spam). Defaults to the notebook-level ``p_spam``.
        prior_ham: P(ham). Defaults to the notebook-level ``p_ham``.
        word_probs_spam: Mapping word -> P(word | spam). Defaults to the
            notebook-level ``parameters_spam``.
        word_probs_ham: Mapping word -> P(word | ham). Defaults to the
            notebook-level ``parameters_ham``.

    Returns:
        'ham' or 'spam' for the class with the higher score, or
        'needs human classification' when the two scores are equal.
    """
    import math  # local import keeps the function usable on its own

    # Fall back to the model fitted earlier in the notebook.
    if prior_spam is None:
        prior_spam = p_spam
    if prior_ham is None:
        prior_ham = p_ham
    if word_probs_spam is None:
        word_probs_spam = parameters_spam
    if word_probs_ham is None:
        word_probs_ham = parameters_ham

    def _log(prob):
        # math.log(0) raises; an impossible event gets a score of -inf.
        return math.log(prob) if prob > 0 else float('-inf')

    tokens = re.sub(r'\W', ' ', message).lower().split()

    log_spam = _log(prior_spam)
    log_ham = _log(prior_ham)
    for word in tokens:
        if word in word_probs_spam:
            log_spam += _log(word_probs_spam[word])
        if word in word_probs_ham:
            log_ham += _log(word_probs_ham[word])

    if log_ham > log_spam:
        return 'ham'
    if log_spam > log_ham:
        return 'spam'
    return 'needs human classification'

# Label every test message, then preview the first ten rows.
test['predicted'] = test['SMS'].map(classify_test)
test.head(10)


Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham
5,ham,But my family not responding for anything. Now...,ham
6,ham,U too...,ham
7,ham,Boo what time u get out? U were supposed to ta...,ham
8,ham,Genius what's up. How your brother. Pls send h...,ham
9,ham,I liked the new mobile,ham


In [None]:
# Compare the true and predicted labels column-wise; True counts as 1, so the
# sum is the number of correctly classified test messages.
total = len(test)
label_matches = test['Label'] == test['predicted']
correct = int(label_matches.sum())

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct / total)


Correct: 1100
Incorrect: 14
Accuracy: 0.9874326750448833
