In [1]:
import os
import random
import numpy as np
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

dir= "D:/F/Machine Learning/MSML602/20_newsgroups"
check= "Subject"

data=[]
labels=[]

In [2]:
for folder_name in os.listdir(dir):
    folder_path = os.path.join(dir, folder_name)
    
    if os.path.isdir(folder_path):
        
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            
            with open(file_path, 'r') as f:
                
                lines = f.readlines()
                subject_index = next((i for i, line in enumerate(lines) if check in line), None)

                if subject_index is not None:
                    
                    content = "".join(lines[subject_index + 1:])
                    data.append(content)
                    labels.append(folder_name) 


In [3]:
train_data = []
train_labels = []
test_data = []
test_labels = []

class_data = defaultdict(list)

for content, label in zip(data, labels):
    class_data[label].append(content)
    
for label, contents in class_data.items():
    random.shuffle(contents)
    split_index = int(len(contents)) // 2

    train_data.extend(contents[:split_index])
    train_labels.extend([label] * split_index)
    test_data.extend(contents[split_index:])
    test_labels.extend([label] * (len(contents) - split_index))

In [4]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data)

#1D NumPy array containing frequency of each word
word_counts = X_train.sum(axis=0).A1
vocab = vectorizer.get_feature_names_out()
word_freq = list(zip(vocab, word_counts))
word_freq.sort(key=lambda x: x[1], reverse=True)

# Get the top 200 most frequent words
top_200_words = {word for word, _ in word_freq[:200]}
top_200_words_list=list(top_200_words)

In [5]:
filtered_vectorizer = TfidfVectorizer(stop_words=top_200_words_list)
X_train = filtered_vectorizer.fit_transform(train_data)
X_test = filtered_vectorizer.transform(test_data)

train_data_words = vectorizer.inverse_transform(X_train)
test_data_words = vectorizer.inverse_transform(X_test)

train_data_words = [list(doc) for doc in train_data_words]
test_data_words = [list(doc) for doc in test_data_words]

In [6]:
def train_naive_bayes(D, C):
    
    N_doc = len(D)  
    classes = set(C) 

    logprior = {}
    loglikelihood = defaultdict(lambda: defaultdict(float))

    bigdoc = defaultdict(list)  
    
    for doc, label in zip(D, C):
        bigdoc[label].extend(doc)

    V = set(word for words in bigdoc.values() for word in words)

    for c in classes:
        N_c = sum(1 for label in C if label == c)  # Documents in class c
        logprior[c] = np.log(N_c / N_doc)

    for c in classes:
        word_counts = Counter(bigdoc[c])  # Count words in class C
        total_count = sum(word_counts.values())  # Total words in class c

        # Calculate log likelihood with Laplace smoothing
        for w in V:
            loglikelihood[w][c] = np.log((word_counts[w] + 1) / (total_count + len(V)))

    return logprior, loglikelihood, V

def test_naive_bayes(test_data, logprior, loglikelihood, classes, V):
    predictions=[]
    
    for doc in test_data:
        sum_scores = defaultdict(float)  # Store scores for each class

        for c in classes:
            sum_scores[c] = logprior[c]

            for word in doc:
                if word in V: 
                    sum_scores[c] += loglikelihood[word][c]

        predictions.append(max(sum_scores, key=sum_scores.get))
    
    return predictions

In [8]:
classes=set(train_labels)
logprior, loglikelihood, vocab = train_naive_bayes(train_data_words, train_labels)

y_pred = test_naive_bayes(test_data_words, logprior, loglikelihood, classes, vocab)

correct_predictions = sum(1 for true, pred in zip(test_labels, y_pred) if true == pred)
accuracy = (correct_predictions / len(test_labels))*100

print(f"Average Prediction Accuracy: {accuracy:.2f}")

Average Prediction Accuracy: 84.13
