In [1]:
import tarfile

# Open the .tgz file
with tarfile.open('languageID.tgz', 'r:gz') as tar:
    # Extract all files in the archive
    tar.extractall()

In [1]:
import os
import numpy as np
from collections import Counter

# Define the vocabulary of characters
vocabulary = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
              'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ']

# Define the number of classes
num_classes = 3

# Define the smoothing parameter
alpha = 1/2

# Load the dataset
dataset_dir = 'languageID'
files = os.listdir(dataset_dir)
X_train = []
y_train = []
X_test = []
y_test = []
for filename in files:
    with open(os.path.join(dataset_dir, filename), 'r') as f:
        text = f.read().lower()
        
        #use the number 0-9 file as training data 
        if(len(filename) == 6):
            X_train.append(text)
            y_train.append(filename[0])
                
        #use the number 10-19 file as training data 
        else:
            X_test.append(text)
            y_test.append(filename[0])
        
# Preprocess the data
X_train_count = []
X_test_count = []
for doc in X_train:
    counts = Counter(doc)
    X_train_count.append([counts[char] for char in vocabulary])
    
for doc in X_test:
    counts = Counter(doc)
    X_test_count.append([counts[char] for char in vocabulary])

# Convert y to integer labels
label_map = {'e': 0, 'j': 1, 's': 2}
y_train = [label_map[label] for label in y_train]
y_test = [label_map[label] for label in y_test]

# Estimate the prior probabilities
prior_probs = np.zeros(num_classes)
for c in range(num_classes):
    prior_probs[c] = (np.sum([1 for label in y_train if label == c]) + alpha) / (len(y_train) + num_classes * alpha)

print("the prior for english is: ", prior_probs[0])
print("the prior for japanese is: ", prior_probs[1])
print("the prior for spanish is: ", prior_probs[2])

the prior for english is:  0.3333333333333333
the prior for japanese is:  0.3333333333333333
the prior for spanish is:  0.3333333333333333


In [2]:
# Estimate the conditional probabilities
cond_probs = np.zeros((num_classes, len(vocabulary)))
for c in range(num_classes):
    X_c = [X_train_count[i] for i in range(len(X_train_count)) if y_train[i] == c]
    total_counts = np.sum(X_c, axis=0)
    denom = np.sum(total_counts) + len(vocabulary) * alpha
    cond_probs[c, :] = (total_counts + alpha) / denom
    
print("the conditional probability for english is: \n", cond_probs[0, :])

the conditional probability for english is: 
 [0.06016851 0.01113497 0.02151    0.02197258 0.10536924 0.01893276
 0.01747894 0.04721626 0.05541054 0.00142078 0.00373369 0.02897737
 0.02051875 0.05792169 0.0644639  0.01675202 0.0005617  0.05382455
 0.06618206 0.08012556 0.02666446 0.00928465 0.01549645 0.00115645
 0.01384437 0.00062779 0.17924996]


In [3]:
print("the conditional probability for japanese is: \n", cond_probs[1, :])
print("the conditional probability for spanish is: \n", cond_probs[2, :])

the conditional probability for japanese is: 
 [1.31765610e-01 1.08669066e-02 5.48586603e-03 1.72263182e-02
 6.02047591e-02 3.87854223e-03 1.40116706e-02 3.17621161e-02
 9.70334393e-02 2.34110207e-03 5.74094133e-02 1.43261470e-03
 3.97987351e-02 5.67105769e-02 9.11632132e-02 8.73545547e-04
 1.04825466e-04 4.28037318e-02 4.21747790e-02 5.69901115e-02
 7.06174220e-02 2.44592753e-04 1.97421294e-02 3.49418219e-05
 1.41514379e-02 7.72214263e-03 1.23449457e-01]
the conditional probability for spanish is: 
 [1.04560451e-01 8.23286362e-03 3.75258241e-02 3.97459221e-02
 1.13810860e-01 8.60287996e-03 7.18448398e-03 4.53270019e-03
 4.98597021e-02 6.62945947e-03 2.77512257e-04 5.29431717e-02
 2.58086399e-02 5.41765595e-02 7.24923684e-02 2.42669051e-02
 7.67783910e-03 5.92951189e-02 6.57704049e-02 3.56140730e-02
 3.37023219e-02 5.88942678e-03 9.25040856e-05 2.49761031e-03
 7.86284728e-03 2.68261848e-03 1.68264932e-01]


In [4]:
x10 = []
y10 = []

with open(os.path.join(dataset_dir, 'e10.txt'), 'r') as f:
    text = f.read().lower()
    x10.append(text)
    y10.append(filename[0])
    
x10_count = []

for doc in x10:
    counts = Counter(doc)
    x10_count.append([counts[char] for char in vocabulary])

y10 = [label_map[label] for label in y10]

print(' the bag-of-words vector x for e10.txt is: \n', x10_count[0])

 the bag-of-words vector x for e10.txt is: 
 [164, 32, 53, 57, 311, 55, 51, 140, 140, 3, 6, 85, 64, 139, 182, 53, 3, 141, 186, 225, 65, 31, 47, 4, 38, 2, 498]


In [13]:
predict_proba = []

log_probs = np.zeros(3)
for c in range(num_classes):
    log_probs[c] = np.sum([np.log(cond_probs[c, j]) * doc[j] for j in range(len(vocabulary))])
predict_proba = log_probs
    
print('log(p(x | y = e)):', predict_proba[0])
print('log(p(x | y = j)):', predict_proba[1])
print('log(p(x | y = s)):', predict_proba[2])

log(p(x | y = e)): -7841.865447060635
log(p(x | y = j)): -8771.433079075032
log(p(x | y = s)): -8467.282044010557


In [14]:
posterior_proba = prior_probs + predict_proba
    
print('log(p(y = e | x)):', posterior_proba[0])
print('log(p(y = j | x)):', posterior_proba[1])
print('log(p(y = s | x)):', posterior_proba[2])

log(p(y = e | x)): -7841.532113727302
log(p(y = j | x)): -8771.099745741698
log(p(y = s | x)): -8466.948710677223


In [27]:
# Classify new documents
y_pred = []
for doc in X_test_count:
    log_probs = np.log(prior_probs)
    for c in range(num_classes):
        log_probs[c] += np.sum([np.log(cond_probs[c, j]) * doc[j] for j in range(len(vocabulary))])
    y_pred.append(np.argmax(log_probs))

print('the predicted results: ', y_pred)
print('the true results: ', y_test)

the predicted results:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
the true results:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
