# Bayesian learning for classifying news text articles

In [1]:
from os import listdir
from os.path import isfile, join
import string
import nltk
#nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\varsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = '20_newsgroups'

#Creating a list of folder names from which valid pathnames can be generated
folders = [f for f in listdir(data)]

In [3]:
#2D list
files = []
for folder_name in folders:
    folder_path = join(data, folder_name)
    files.append([f for f in listdir(folder_path)])

In [4]:
pathname_list = []
for fo in range(len(folders)):
    for fi in files[fo]:
        pathname_list.append(join(data, join(folders[fo], fi)))

In [5]:
len(pathname_list)

19997

In [6]:
Y = []
for folder_name in folders:
    folder_path = join(data, folder_name)
    num_of_files= len(listdir(folder_path))
    for i in range(num_of_files):
        Y.append(folder_name)

In [7]:
len(Y)

19997

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
doc_train, doc_test, Y_train, Y_test = train_test_split(pathname_list, Y, random_state=0, test_size=0.50)

In [10]:
stopwords= stopwords.words('english')
print(stopwords)


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
import string

def preprocess(words):
    # Remove tab characters
    words = [word.replace('\t', '') for word in words]
    
    # Remove punctuation (except for single quotes)
    punctuations = string.punctuation.replace("'", "")
    trans_table = str.maketrans('', '', punctuations)
    stripped_words = [word.translate(trans_table) for word in words]
    
    # Process single quotes and remove empty strings
    processed_words = []
    for word in stripped_words:
        if word:
            if word[0] == "'":
                word = word[1:]
            if word and word[-1] == "'":  # Check if word is not empty before accessing its last character
                word = word[:-1]
            processed_words.append(word)
        
    
    # Remove words that are only digits or have length less than 3
    cleaned_words = [word.lower() for word in processed_words if not (word.isdigit() or len(word) <= 2)]

    return cleaned_words


In [12]:
#function to remove stopwords

def remove_stopwords(words):
    words = [word for word in words if not word in stopwords]
    return words

In [13]:
def tokenize_sentence(line):
    from nltk import word_tokenize
    words = word_tokenize(line)
    words = preprocess(words)
    words = remove_stopwords(words)
    
    return words

In [14]:
#Removing metadata

def remove_metadata(lines):
    for i in range(len(lines)):
        if(lines[i] == '\n'):
            start = i+1
            break
    new_lines = lines[start:]
    return new_lines

In [15]:
def tokenize(path):
    f = open(path, 'r')
    text_lines = f.readlines()
    text_lines = remove_metadata(text_lines)
    doc_words = []
    for line in text_lines:
        doc_words.append(tokenize_sentence(line))

    return doc_words

In [16]:
def flatten(list):
    new_list = []
    for i in list:
        for j in i:
            new_list.append(j)
    return new_list

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Vectorize the training data
train_docs = []
for document in doc_train:
    with open(document, "r") as f:
        train_docs.append(f.read())

In [19]:
list_of_words = []

for document in doc_train:
        list_of_words.append(flatten(tokenize(document)))

In [20]:
import numpy as np
np_list_of_words = np.asarray(flatten(list_of_words))

In [21]:

words, counts = np.unique(np_list_of_words, return_counts=True)
len(words)

132550

In [22]:

freq, wrds = (list(i) for i in zip(*(sorted(zip(counts, words), reverse=True))))

In [23]:
f_o_w = []
n_o_w = []
for f in sorted(np.unique(freq), reverse=True):
    f_o_w.append(f)
    n_o_w.append(freq.count(f))

In [62]:
n=25000
features=wrds[0:n]

In [63]:
dictionary = {}
doc_num = 1
for doc_words in list_of_words:
    np_doc_words = np.asarray(doc_words)
    w, c = np.unique(np_doc_words, return_counts=True)
    dictionary[doc_num] = {}
    for i in range(len(w)):
        dictionary[doc_num][w[i]] = c[i]
    doc_num = doc_num + 1

In [64]:
X_train = []
for k in dictionary.keys():
    row = []
    for f in features:
        if(f in dictionary[k].keys()):
            row.append(dictionary[k][f]) 
        else:
            row.append(0)
    X_train.append(row)

In [65]:
X_train = np.asarray(X_train)
Y_train = np.asarray(Y_train)

In [66]:
list_of_words_test = []

for document in doc_test:
        list_of_words_test.append(flatten(tokenize(document)))

In [67]:
dictionary_test = {}
doc_num = 1
for doc_words in list_of_words_test:
    np_doc_words = np.asarray(doc_words)
    w, c = np.unique(np_doc_words, return_counts=True)
    dictionary_test[doc_num] = {}
    for i in range(len(w)):
        dictionary_test[doc_num][w[i]] = c[i]
    doc_num = doc_num + 1

In [68]:
X_test = []
for k in dictionary_test.keys():
    row = []
    for f in features:
        if(f in dictionary_test[k].keys()):
            row.append(dictionary_test[k][f]) 
        else:
            row.append(0)
    X_test.append(row)

In [83]:
X_test = np.asarray(X_test)
Y_test = np.asarray(Y_test)

In [82]:
def fit(X_train, Y_train):
    result = {}
    classes, counts = np.unique(Y_train, return_counts=True)
    
    for i in range(len(classes)):
        curr_class = classes[i]
        
        result["TOTAL_DATA"] = len(Y_train)
        result[curr_class] = {}
        
        X_tr_curr = X_train[Y_train == curr_class]
        
        num_features = n
        
        for j in range(num_features):
            result[curr_class][features[j]] = X_tr_curr[:,j].sum() 
                
        result[curr_class]["TOTAL_COUNT"] = counts[i]
    
    return result

In [85]:
def log_probablity(dictionary_train, x, curr_class):
    output = np.log(dictionary_train[curr_class]["TOTAL_COUNT"]) - np.log(dictionary_train["TOTAL_DATA"])
    num_words = len(x)
    for j in range(num_words):
        if(x[j] in dictionary_train[curr_class].keys()):
            xj = x[j]
            count_curr_class_equal_xj = dictionary_train[curr_class][xj] + 1
            count_curr_class = dictionary_train[curr_class]["TOTAL_COUNT"] + len(dictionary_train[curr_class].keys())
            curr_xj_prob = np.log(count_curr_class_equal_xj) - np.log(count_curr_class)
            output = output + curr_xj_prob
        else:
            continue
    
    return output

In [86]:
def predictSinglePoint(dictionary_train, x):
    classes = dictionary_train.keys()
    best_p = -10000
    best_class = -1
    for curr_class in classes:
        if(curr_class == "TOTAL_DATA"):
            continue
        p_curr_class = log_probablity(dictionary_train, x, curr_class)
        if(p_curr_class > best_p):
            best_p = p_curr_class
            best_class = curr_class
            
    return best_class

In [87]:
def predict(dictionary_train, X_test):
    Y_pred = []
    for x in X_test:
        y_predicted = predictSinglePoint(dictionary_train, x)
        Y_pred.append(y_predicted)
    
    return Y_pred

In [88]:
train_dictionary = fit(X_train, Y_train)

In [89]:
X_test = []

for key in dictionary_test.keys():
    X_test.append(list(dictionary_test[key].keys()))

In [90]:
my_predictions = predict(train_dictionary, X_test)

In [91]:
my_predictions = np.asarray(my_predictions)

In [92]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [93]:
accuracy_score(Y_test, my_predictions)

0.6645664566456646

In [94]:
print(classification_report(Y_test, my_predictions))

                          precision    recall  f1-score   support

                      -1       0.00      0.00      0.00         0
             alt.atheism       0.73      0.67      0.70       484
           comp.graphics       0.55      0.71      0.62       502
 comp.os.ms-windows.misc       0.87      0.38      0.53       504
comp.sys.ibm.pc.hardware       0.64      0.58      0.61       473
   comp.sys.mac.hardware       0.91      0.47      0.62       479
          comp.windows.x       0.69      0.79      0.74       509
            misc.forsale       0.88      0.40      0.55       501
               rec.autos       0.89      0.49      0.64       528
         rec.motorcycles       0.99      0.60      0.75       501
      rec.sport.baseball       0.99      0.71      0.82       501
        rec.sport.hockey       0.94      0.91      0.92       501
               sci.crypt       0.63      0.88      0.74       488
         sci.electronics       0.85      0.47      0.60       519
         