In [1]:
import sys
import math
import string

In [2]:
def _read_letter_probabilities(path):
    '''
    Parse one parameter file into a list of 26 letter probabilities.

    Every non-blank line must contain exactly two whitespace-separated
    fields: an uppercase letter and its probability, e.g. "A 0.0812".

    Returns: list p of length 26 where p[0] belongs to 'A' and p[25] to 'Z';
             letters that never appear in the file keep the initial value 0
    Raises:  ValueError if a non-blank line does not have exactly two fields
             or the probability is not a valid float
    '''
    probabilities = [0] * 26
    with open(path, encoding='utf-8') as f:
        for line in f:
            #split() with no argument splits on any run of whitespace and
            #returns [] for a blank line, so a trailing empty line or a
            #doubled space no longer breaks the two-field unpacking below
            fields = line.split()
            if not fields:
                continue
            char, prob = fields
            #ord(char)-ord('A') maps 'A' to index 0 and 'Z' to index 25
            probabilities[ord(char) - ord('A')] = float(prob)
    #the with-block has already closed the file; no explicit close() needed
    return probabilities


def get_parameter_vectors():
    '''
    This function parses e.txt and s.txt to get the 26-dimensional multinomial
    parameter vectors (character probabilities of English and Spanish) as
    described in section 1.2 of the writeup

    Both files are opened relative to the current working directory.

    Returns: tuple of vectors (e, s), each a list of length 26 with index 0
             holding the probability of 'A' and index 25 that of 'Z'
    '''
    e = _read_letter_probabilities('e.txt')
    s = _read_letter_probabilities('s.txt')
    return (e, s)

In [3]:
def shred(filename):
    '''
    Count how often each of the letters A-Z occurs in a text file,
    ignoring case.

    Only the 52 ASCII letters are counted. Membership is tested on the
    original character, before upper-casing, because str.upper() can expand
    a non-ASCII character into ASCII letters (for example 'ß' becomes 'SS'),
    which would otherwise be counted as English letters.

    Returns: dict mapping every letter 'A'..'Z' (inserted in alphabetical
             order) to its count; letters that do not occur map to 0
    '''
    #Start every letter at zero, in A..Z order so the key order is the same
    #on every run
    X = dict.fromkeys(string.ascii_uppercase, 0)
    ascii_letters = set(string.ascii_letters)

    with open(filename, encoding='utf-8') as f:
        data = f.read()

    for character in data:
        if character in ascii_letters:
            X[character.upper()] += 1

    return X

In [4]:
#Q1 output
#Print the "Q1" header, then one "<letter> <count>" line per key of the
#counts returned by shred, in sorted key order
print("Q1")
X = shred("samples/letter0.txt")
for character in sorted(X):
    count = X[character]
    print(f"{character} {count}")

Q1
A 4
B 2
C 2
D 2
E 5
F 2
G 1
H 2
I 7
J 0
K 0
L 4
M 0
N 5
O 3
P 1
Q 0
R 2
S 3
T 4
U 1
V 0
W 0
X 0
Y 1
Z 0


In [5]:
#Q2
#Load both parameter vectors, then compute (count of the first letter in
#sorted order) * log(probability at index 0) under each vector.
e, s = get_parameter_vectors()
first_count = sorted(X.items())[0][1]
a = first_count * math.log(e[0])
b = first_count * math.log(s[0])
print("Q2")
#Format to four decimals, then convert back to float before printing
for value in (a, b):
    print(float("{:.4f}".format(value)))

Q2
-9.9344
-8.4265


In [6]:
#Q3
#Weights whose logs are added to the English / Spanish scores
#(presumably the class priors P(Y=English) and P(Y=Spanish) - confirm
#against the writeup)
PYE = 0.6
PYS = 0.4
#X is left as a list of (letter, count) pairs sorted by letter, exactly as
#before. dict(X) accepts both the dict produced by shred and that pair list,
#so running this cell a second time no longer raises AttributeError.
X = sorted(dict(X).items())
#Sum over EVERY letter: the previous range(25) stopped at 'Y' and silently
#dropped the 'Z' term. Each letter is mapped to its vector index with
#ord(letter)-ord('A'), the same indexing get_parameter_vectors uses.
FYE = math.log(PYE) + sum(
    count * math.log(e[ord(letter) - ord('A')]) for letter, count in X
)
FYS = math.log(PYS) + sum(
    count * math.log(s[ord(letter) - ord('A')]) for letter, count in X
)
print("Q3")
print(float("{:.4f}".format(FYE)))
print(float("{:.4f}".format(FYS)))

Q3
-147.8619
-152.4015


In [7]:
#Q4
def get_prob(FYE, FYS):
    '''
    Combine two scores into the value 1 / (1 + exp(FYS - FYE)).

    When the two scores differ by 100 or more, the result is clamped
    instead of evaluated: the int 0 if FYS - FYE >= 100, the int 1 if
    FYS - FYE <= -100.

    Returns: int 0 or 1 in the clamped cases, otherwise a float
    '''
    gap = FYS - FYE
    if gap >= 100:
        return 0
    if gap <= -100:
        return 1
    return 1 / (1 + math.exp(gap))
#Print the "Q4" header, then the unrounded value get_prob returns for the
#two Q3 scores
print("Q4")
q4_value = get_prob(FYE, FYS)
print(q4_value)

Q4
0.989434609215349
