# In [4]:
# Import the necessary modules
import math

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Sample data: every line of file.txt is expected to look like "<label>\t<message>".
def _parse_labelled_lines(raw_lines):
    """Turn raw "<label>\\t<message>" lines into a list of (message, label) tuples.

    Lines that are blank or contain no tab separator after stripping are
    skipped; previously such a line crashed the tuple unpacking with
    ``ValueError``.
    """
    parsed = []
    for raw_line in raw_lines:
        # Split on the first tab only, so tabs inside the message are preserved.
        label, separator, text = raw_line.strip().partition('\t')
        if not separator:
            continue
        parsed.append((text, label))
    return parsed


# Explicit encoding so decoding does not depend on the platform default.
# NOTE(review): assumes file.txt is UTF-8 encoded — confirm against the dataset.
with open("file.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# The first 5001 raw lines are the training set; the remainder is held out for testing.
messages = _parse_labelled_lines(lines[:5001])
test_data = _parse_labelled_lines(lines[5001:])

# English stop words from NLTK, kept in a set for fast membership checks
stop_words = set(stopwords.words('english'))


# Step 1: Tokenize words using NLTK
def tokenize(text):
    """Return the tokens of the lower-cased ``text`` that are purely
    alphanumeric and are not English stop words, in their original order."""
    candidates = word_tokenize(text.lower())
    return [
        token
        for token in candidates
        if token.isalnum() and token not in stop_words
    ]

# Step 2: Count word frequencies
def count_words(messages):
    """Tally per-class word frequencies and message counts.

    ``messages`` is an iterable of (message, label) pairs. A label equal to
    "spam" is counted as spam; every other label is counted as ham.

    Returns a tuple ``(spam_words, ham_words, spam_count, ham_count)`` where
    the first two items are dicts mapping each token to its number of
    occurrences in that class, and the last two are message counts.
    """
    word_tables = {"spam": {}, "ham": {}}
    message_totals = {"spam": 0, "ham": 0}

    for text, tag in messages:
        tokens = tokenize(text)
        bucket = "spam" if tag == "spam" else "ham"
        message_totals[bucket] += 1
        tally = word_tables[bucket]
        for token in tokens:
            tally[token] = tally.get(token, 0) + 1

    return (
        word_tables["spam"],
        word_tables["ham"],
        message_totals["spam"],
        message_totals["ham"],
    )

# Step 3: Calculate probabilities
def calculate_probabilities(spam_words, ham_words, spam_count, ham_count):
    """Derive per-class word totals and class priors from the training counts.

    Returns ``(total_spam_words, total_ham_words, p_spam, p_ham)``: the summed
    word frequencies of each class, followed by each class's share of all
    messages. Raises ``ZeroDivisionError`` when both message counts are zero.
    """
    spam_token_total, ham_token_total = (
        sum(frequencies.values()) for frequencies in (spam_words, ham_words)
    )

    # Priors are each class's fraction of the total number of messages
    n_messages = spam_count + ham_count
    return (
        spam_token_total,
        ham_token_total,
        spam_count / n_messages,
        ham_count / n_messages,
    )

# Step 4: Naive Bayes Classification with Laplace Smoothing
def classify(message, spam_words, ham_words, total_spam_words, total_ham_words, p_spam, p_ham):
    """Classify ``message`` as ham (1) or spam (0) using Naive Bayes.

    Class scores are accumulated as sums of log-probabilities instead of
    products of probabilities. Multiplying one small factor per word
    underflows to 0.0 on long messages, which made both scores tie at zero
    and the message fall through to spam regardless of its content.

    Args:
        message: Raw message text; it is split into words with ``tokenize``.
        spam_words: Mapping of word -> occurrence count in spam training data.
        ham_words: Mapping of word -> occurrence count in ham training data.
        total_spam_words: Sum of all word occurrences in spam training data.
        total_ham_words: Sum of all word occurrences in ham training data.
        p_spam: Prior probability of the spam class.
        p_ham: Prior probability of the ham class.

    Returns:
        1 when the ham score is strictly greater than the spam score,
        otherwise 0 (so ties are classified as spam).

    Raises:
        ZeroDivisionError: If the message has at least one token and a
            class's smoothed denominator is zero (no training words at all).
    """
    words = tokenize(message)

    # Laplace smoothing constant
    smoothing = 1.0

    # Number of distinct words seen in either class, used by the smoothing term
    vocab_size = len(set(spam_words).union(ham_words))

    # Loop-invariant denominators of the smoothed word likelihoods
    spam_denominator = total_spam_words + vocab_size * smoothing
    ham_denominator = total_ham_words + vocab_size * smoothing

    # Start from the log-priors; a zero prior maps to -inf so that class can
    # never win, matching a probability product that starts at zero.
    spam_score = math.log(p_spam) if p_spam > 0 else -math.inf
    ham_score = math.log(p_ham) if p_ham > 0 else -math.inf

    for word in words:
        # Smoothed counts are at least `smoothing`, so each ratio is positive
        # and its logarithm is always defined.
        spam_score += math.log((spam_words.get(word, 0) + smoothing) / spam_denominator)
        ham_score += math.log((ham_words.get(word, 0) + smoothing) / ham_denominator)

    # Return 1 for ham, 0 for spam
    return 1 if ham_score > spam_score else 0




# Training the classifier
spam_words, ham_words, spam_count, ham_count = count_words(messages)
total_spam_words, total_ham_words, p_spam, p_ham = calculate_probabilities(
    spam_words, ham_words, spam_count, ham_count
)

# Evaluate the classifier on the held-out test data.
# Each row of matrix is [actual, predicted], where 1 means ham and 0 means spam.
matrix = []
for mess, level in test_data:
    if level not in ('ham', 'spam'):
        # Rows carrying any other label are left out of the matrix
        continue
    classification = classify(
        mess, spam_words, ham_words, total_spam_words, total_ham_words, p_spam, p_ham
    )
    actual = 1 if level == 'ham' else 0
    predicted = 1 if classification == 1 else 0
    matrix.append([actual, predicted])

# Evaluation metrics for the model
def getAnalyticsOfTest(matrix):
    """Summarise [actual, predicted] pairs as confusion counts and percentages.

    A value of 1 is treated as the positive class and 0 as the negative
    class. Rows whose first two entries are not a 0/1 combination are ignored.

    Args:
        matrix: Iterable of rows whose first item is the actual class and
            whose second item is the predicted class.

    Returns:
        A dict with the keys "Accuracy %", "TP", "TN", "FP", "FN",
        "recall %", "precision %" and "F1_Score %". Percentages are rounded
        to one decimal place. A metric whose denominator is zero (for
        example, any metric on an empty matrix) is reported as 0.0 instead
        of raising ``ZeroDivisionError``.
    """

    def _percent(numerator, denominator):
        # An undefined ratio is reported as 0.0 rather than crashing the report.
        if denominator == 0:
            return 0.0
        return round(numerator / denominator * 100, 1)

    TP_count = 0
    TN_count = 0
    FP_count = 0
    FN_count = 0

    for row in matrix:
        actual, predicted = row[0], row[1]
        if actual == 1 and predicted == 1:
            TP_count += 1
        elif actual == 1 and predicted == 0:
            FN_count += 1
        elif actual == 0 and predicted == 1:
            FP_count += 1
        elif actual == 0 and predicted == 0:
            TN_count += 1

    total = TP_count + TN_count + FP_count + FN_count
    accuracy = _percent(TP_count + TN_count, total)
    recall = _percent(TP_count, TP_count + FN_count)
    precision = _percent(TP_count, TP_count + FP_count)
    # F1 comes straight from the raw counts (2TP / (2TP + FP + FN)) so it does
    # not inherit the rounding error of the already-rounded recall/precision.
    F1_score = _percent(2 * TP_count, 2 * TP_count + FP_count + FN_count)

    return {
        "Accuracy %": accuracy,
        "TP": TP_count,
        "TN": TN_count,
        "FP": FP_count,
        "FN": FN_count,
        "recall %": recall,
        "precision %": precision,
        "F1_Score %": F1_score,
    }

# Print the confusion counts and percentage metrics computed from matrix
print(getAnalyticsOfTest(matrix)) 



# Output:
# {'Accuracy %': 97.9, 'TP': 490, 'TN': 71, 'FP': 3, 'FN': 9, 'recall %': 98.2, 'precision %': 99.4, 'F1_Score %': 98.8}
