## calculating the priori & conditionals probability

In [13]:


# Import the necessary modules
import os
import string
import collections
import math
import numpy as np
import collections

# Define the vocabulary size and the smoothing parameter
V = 27 # 26 letters and space
alpha = 0.5 # additive smoothing with parameter 1/2

# Define the class labels and the data directories
classes = ['e', 'j', 's'] # English, Japanese, and Spanish
data_dir = r'C:\Users\Subham Sabud\OneDrive\Desktop\python ece760\hw4_ece760_uw_madison\languageID'
#data_dir = 'C:\\Users\\Subham Sabud\\OneDrive\\Desktop\\python ece760hw4_ece760_uw_madison\\languageID\\languageID' # the directory where the data files are stored

# Define a function to preprocess a text file
def preprocess(file):
    # Open the file and read its content
    with open(file, 'r') as f:
        text = f.readline().rstrip("\n")
    
    # Convert all characters to lower case and remove non-printable characters
    text = text.lower()
    text = ''.join(c for c in text if c in string.printable)
    
    # Count the frequency of each character and return a counter object
    counter = collections.Counter(text)
    return counter

# Define a function to split the data into training and test sets
def split_data(data_dir, classes):
    # Initialize empty lists to store the training and test data
    train_data = []
    test_data = []
    
    # Loop through each class label and each data file
    for c in classes:
        # Get the list of files in the subdirectory for that class
        files = os.listdir(os.path.join(data_dir, c))
        
        # Loop through the first 10 files (0.txt to 9.txt) and append them to the training data list
        for i in range(10):
            file = os.path.join(data_dir, c, files[i])
            train_data.append((preprocess(file), c))
        
        # Loop through the remaining 10 files (10.txt to 19.txt) and append them to the test data list
        for i in range(10, 20):
            file = os.path.join(data_dir, c, files[i])
            test_data.append((preprocess(file), c))
    
    # Return the training and test data lists
    return train_data, test_data

# Define a function to estimate the prior probabilities for each class
def estimate_priors(train_data, classes, alpha):
    # Initialize an empty dictionary to store the prior probabilities
    priors = {}
    
    # Get the total number of documents in the training data
    N = len(train_data)
    
    # Get the number of classes
    C = len(classes)
    
    # Loop through each class label
    for c in classes:
        # Count the number of documents in that class
        N_c = sum(1 for x, y in train_data if y == c)
        
        # Apply additive smoothing and calculate the logarithm of the prior probability
        priors[c] = math.log((N_c + alpha) / (N + alpha * C))
        #priors[c] = ((N_c + alpha) / (N + alpha * C))
    # Return the prior probabilities dictionary
    return priors


# Define a function to estimate the conditional probabilities for each character given each class
def estimate_conditionals(train_data, classes, V, alpha):
    # Initialize an empty dictionary to store the conditional probabilities
    conditionals = {}
    
    # Loop through each class label
    for c in classes:
        # Initialize an empty dictionary to store the conditional probabilities for that class
        conditionals[c] = {}
        
        # Get the total number of characters in documents of that class
        n_c = sum(sum(x.values()) for x, y in train_data if y == c)
        #n_c = sum(1 for x, y in train_data if y == c)
        # Loop through each character in the vocabulary
        for x_i in string.ascii_lowercase + ' ' :
            # Count the number of times that character appears in documents of that class
            n_ic = sum(x[x_i] for x, y in train_data if y == c)
            
            # Apply additive smoothing and calculate the logarithm of the conditional probability
            conditionals[c][x_i] = math.log((n_ic + alpha) / (n_c + alpha * V))
    
    # Return the conditional probabilities dictionary
    return conditionals

# Define a function to classify a test document using the Naive Bayes rule
def classify(test_doc, priors, conditionals, classes):
    # Initialize an empty dictionary to store the scores for each class
    scores = {}
    
    # Loop through each class label
    for c in classes:
        # Initialize the score with the prior probability of that class
        scores[c] = 0
        
        # Loop through each character in the test document
        for x_i in test_doc.keys():
            # Add the logarithm of the conditional probability of that character given that class to the score
            scores[c] += priors[c]+ conditionals[c][x_i] * test_doc[x_i]
    
    # Find the class that maximizes the score
    pred_class = max(scores, key=scores.get)
    
    # Return the predicted class label
    return pred_class

# Split the data into training and test sets
train_data, test_data = split_data(data_dir, classes)

# Estimate the prior probabilities for each class
priors = estimate_priors(train_data, classes, alpha)

# Estimate the conditional probabilities for each character given each class
conditionals = estimate_conditionals(train_data, classes, V, alpha)

# Initialize an empty list to store the predictions
predictions = []

# Loop through each test document and its true class label
for test_doc, true_class in test_data:
    # Classify the test document using the Naive Bayes rule
    pred_class = classify(test_doc, priors, conditionals, classes)
    
    # Append the predicted class label to the predictions list
    predictions.append(pred_class)

# Calculate the accuracy of the predictions
accuracy = sum(1 for i in range(len(test_data)) if predictions[i] == test_data[i][1]) / len(test_data)

# Print the accuracy
#print("Accuracy of the character-based Naive Bayes classifier =", round(accuracy, 2))

# Print the prior probabilities
print("Prior probabilities:")
for c in classes:
    print(c, "=", round(math.exp(priors[c]), 4))

# Print the conditional probabilities for English
print("Conditional probabilities for English:")
theta_e = []
for x_i in string.ascii_lowercase + ' ':
    theta_e.append(math.exp(conditionals['e'][x_i]))
print(theta_e)

# Print the conditional probabilities for Spanish
print("Conditional probabilities for Spanish:")
theta_s = []
for x_i in string.ascii_lowercase + ' ':
    theta_s.append(math.exp(conditionals['s'][x_i]))
print(theta_s)

# Print the conditional probabilities for Japanese
print("Conditional probabilities for Japanese:")
theta_j = []
for x_i in string.ascii_lowercase + ' ':
    theta_j.append(math.exp(conditionals['j'][x_i]))
print(theta_j)

#file = "e10.txt"
#test_doc=file
# Classify the test document using the Naive Bayes rule
#pred_class = classify(test_doc, priors, conditionals, classes)


Prior probabilities:
e = 0.3333
j = 0.3333
s = 0.3333
Conditional probabilities for English:
[0.06765523632993511, 0.007599629286376275, 0.01723818350324375, 0.022428174235403147, 0.10620945319740499, 0.016867469879518066, 0.009823911028730302, 0.05690454124189064, 0.05060240963855422, 0.0005560704355885081, 0.0009267840593141799, 0.03021316033364226, 0.020203892493049123, 0.05430954587581093, 0.05542168674698795, 0.01723818350324375, 0.0005560704355885081, 0.045041705282669146, 0.06728452270620947, 0.09249304911955515, 0.025023169601482858, 0.008341056533827617, 0.013901760889712692, 0.002038924930491196, 0.016496756255792396, 0.0005560704355885081, 0.19406858202038926]
Conditional probabilities for Spanish:
[0.09113817723645527, 0.011339773204535913, 0.026459470810583793, 0.04157916841663167, 0.10625787484250314, 0.008819823603527928, 0.007139857202855945, 0.007979840403191937, 0.0499790004199916, 0.004619907601847965, 0.002099958000839984, 0.046619067618647626, 0.015539689206215871,

In [8]:
# Define the file name
file = "e10.txt"

# Define a function to preprocess a text file
def preprocess(file):
    # Open the file and read its content
    with open(file, "r") as f:
       text = f.readline().rstrip("\n")
    
    # Convert all characters to lower case and remove non-printable characters
    text = text.lower()
    text = "".join(c for c in text if c in string.printable)
    
    # Count the frequency of each character and return a counter object
    counter = collections.Counter(text)
    return counter

# Preprocess the text file and get the counter object
counter = preprocess(file)

# Print the counter object
print(counter)



#To create the bag-of-words vector x for e10.txt, we need to initialize a vector of length 27 with all zeros 
#and then assign the values from the counter object according to the vocabulary order. Here is a possible code that does that:

# Import numpy to create a vector
import numpy as np

# Define the vocabulary of 27 characters
vocabulary = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ']
# Initialize a vector of length 27 with all zeros
x = np.zeros(27)

# Loop through each character in the vocabulary
for i, c in enumerate(vocabulary):
    # Assign the value from the counter object to the corresponding position in the vector
    x[i] = counter[c]

# Print the vector x
print("The bag-of-words count vector x for e10.txt is:")
print(x)

Counter({' ': 107, 'e': 69, 't': 52, 's': 44, 'a': 34, 'o': 33, 'h': 32, 'n': 30, 'r': 28, 'i': 26, 'p': 14, 'w': 12, 'm': 12, 'd': 10, 'l': 10, 'u': 10, 'f': 10, 'c': 9, 'v': 8, 'y': 7, 'g': 6, 'x': 2, 'b': 2, 'q': 1, 'z': 1})
The bag-of-words count vector x for e10.txt is:
[ 34.   2.   9.  10.  69.  10.   6.  32.  26.   0.   0.  10.  12.  30.
  33.  14.   1.  28.  44.  52.  10.   8.  12.   2.   7.   1. 107.]


In [14]:
#for c in classes:
loglikelihood_e = loglikelihood_j=loglikelihood_s=0
for i in range (len(x)):
    loglikelihood_e += x[i]* math.log(theta_e[i]);
    loglikelihood_j += x[i]* math.log(theta_j[i]);
    loglikelihood_s += x[i]* math.log(theta_s[i]);


print("The loglikelihood probability for English is:")
print(loglikelihood_e)
print("The loglikelihood probability for Japanese is:")
print(loglikelihood_j)
print("The loglikelihood probability for Spanish is:")
print(loglikelihood_s)



The loglikelihood probability for English is:
-1577.3264186291008
The loglikelihood probability for Japanese is:
-1800.1283175527014
The loglikelihood probability for Spanish is:
-1710.1689804601124


In [97]:

 
    
 
   
  

# Import pandas to create a confusion matrix
import pandas as pd

# Create a data frame with the true and predicted class labels
df = pd.DataFrame({'y_actual': [y for x, y in test_data], 'y_predicted': predictions})

# Create a confusion matrix using pandas crosstab function
confusion_matrix = pd.crosstab(df['y_predicted'], df['y_actual'], rownames=['Predicted'], colnames=['actual'])

# Print the confusion matrix
print(confusion_matrix)



actual      e  j  s
Predicted          
e          10  4  5
j           0  6  0
s           0  0  5
