In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import csv
import math
import pickle
import random
from collections import defaultdict

# **Read Inputs**

In [26]:
# Read user data
lastWord = ""
words = defaultdict(dict)
with open('Data/data.txt', 'r') as input:
    for line in input:
        touchpoints = [0,0, '']
        line.strip
        if line.startswith("==="):
            words[str(line)[3:len(str(line))-4]] = []
            lastWord = str(line)[3:len(str(line))-4]
        else:
            tempPoints = str(line)[0:len(str(line))-2].split(" ")
            touchpoints[0] = float(tempPoints[1])
            touchpoints[1] = float(tempPoints[2])
            touchpoints[2] = tempPoints[0]
            words[lastWord].append(touchpoints)

In [27]:
# Read dictionary
with open('Data/unigram.dict', 'rb') as unigramModelFile:
    unigramModel = pickle.load(unigramModelFile)
unigramModelFile.close()

# Read keyboard data
keyboard_raw = pd.read_csv("Data/keyboard.csv")
keyboard = keyboard_raw[['key', 'x_mm', 'y_mm']]
keyboard

Unnamed: 0,key,x_mm,y_mm
0,a,4.02501,9.625024
1,b,18.900047,13.650034
2,c,12.950032,13.650034
3,d,9.975025,9.625024
4,e,8.487521,5.600014
5,f,12.950032,9.625024
6,g,15.925039,9.625024
7,h,18.900047,9.625024
8,i,23.362558,5.600014
9,j,21.875053,9.625024


# **Unigram Language Model Decoder**

In [28]:
# Keyboard size and dual Gaussian model parameters
key_width = 3
key_height = 4
a = 2.403
b = 0.017
c = 2.295
d = 0.016

def get_likelihood(p, mu, sigma):
    """
    Calculate the likelihood that a touch point p is from the 2D Gaussian distribution N(mu, sigma)
    """  
    lik = a * math.exp(-(math.pow((p[0]-mu[0]),2)/(2*(a+b*math.pow(key_width,2)))+(math.pow(p[1]-mu[1],2)/(2*(c+d*math.pow(key_height,2))))))
    return lik          

def is_letter(p, letter):
    """
    Determine if touch point p is located inside the boundary of the key: letter
    """
    tmp = keyboard.loc[keyboard['key'] == letter]
    x_mm = tmp.iloc[0]['x_mm']
    y_mm = tmp.iloc[0]['y_mm']
    if p[0] < (x_mm + key_width) and p[0] > x_mm and p[1] < (y_mm + key_height) and p[1] > y_mm:
        return True
    return False 

def get_literal_string(touchpoints):
    """ 
    Compute the literal string using is_letter(p, letter) method for a collection of touch points that represents a word. 
          If a touch point does not fall inside any key boundary, use '?' to represent the corresponding character.
    """
    literal_string = ""
    tmp = keyboard['key']
    for touchpoint in touchpoints:
        found = False
        for i in range(25):
            letter = tmp.values[i]
            if is_letter(touchpoint, letter):
                literal_string += letter
                found = True
        if found == False:
            literal_string+= "?"
    return literal_string

In [29]:
def unigram_lm_decoder(touchpoints):
    """
    A language decoder that uses the dual Gaussian touch point spatial disrtibution model and a unigram language model.
    Input: a list/collection of touch points that represents a certain word
    Output: the decoded word for the input
    
    Step a --- Get all possible words and their corresponding probabilities from the dictionary. 
          Use the length of the correct word to filter possible words
          You may also use the first and/or the last touchpoint to further narrow down possible words
    """
    possible_words = []
    length = len(touchpoints)
    firstTP = touchpoints[0]
    lastTP = touchpoints[len(touchpoints)-1]
    possible_words = dict(filter(lambda x: len(str(x[0])) == length, unigramModel.items()))
    #possible_words = dict(filter(lambda x: is_letter(firstTP, x[0][:1]), unigramModel.items()))
    
    # Calculate p(w|s_1, s_2, ... s_n) ~ p(s_1, s_2, ..., s_n|w)*p(w) = \Pi(p(s_i|c_i))p(w) for each possible word
    p_w_s = []                  # Holds p(w|s_1, s_2, ..., s_n) for all possible words
    for item in possible_words:
        word = item                  # The current possible word
        p_w =  possible_words[item]  # Probability of the current possible word in the unigram language model
        p_s_w = 1                 # Holds p(s_1, s_2, ..., s_n|w) for the current possible word

        for j, letter in enumerate(list(word)):
            currentPoints = touchpoints[j]
            # Step b --- Apply the spatial model to get p(s_i|c_i)
            tmp = keyboard.loc[keyboard['key'] == letter]
            x_mm = tmp.iloc[0]['x_mm']
            y_mm = tmp.iloc[0]['y_mm']
            mu = [x_mm + (0.5*key_width), y_mm + (0.5*key_height)] # mu = center of key ci

            sigma =[a+b*math.pow(key_width,2),c+d*math.pow(key_height,2)] # [sigma_X, sigma_Y]

            p_s_c = get_likelihood([currentPoints[0], currentPoints[1]], mu, sigma)

            # Step b --- Multiply the current p(s_i|c_i) to p(s_1, s_2, ..., s_n|W)
            p_s_w *= p_s_c
            #print("current Psw for letter",letter,":", p_s_w)
        
        # Step c --- Calculate p(w|s_1, s_2, ... s_n) from p(s_1, s_2, ..., s_n|w) and p(w). Append the result to list
        p_w_s.append([word,p_w * p_s_w])
        #print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    # Step d ---- Choose word by the maximum of p(w|s_1, s_2, ..., s_n)
    decoded_word = [0,0]
    for i in p_w_s:
        if(i[1] > decoded_word[1]):
            decoded_word = i
    print("decoded word:",decoded_word)
    return decoded_word

In [30]:
decoded_success_count = 0
literal_success_count = 0
decoded_words = []
literal_strings = []
correct_words = []
for word in words:
    touchpoints = words[word]
    """
    Use above methods to compute the correct word, decoded word, and the literal string for each touch point collection
          Append results to the corresponding list
          Update the decoded words/literal strings success count. 
              --- If the decoded word/literal string is the same as correct word, increase 1 to decoded words/literal strings success count
    """
    decoded_words.append(unigram_lm_decoder(touchpoints))
    literal_strings.append(get_literal_string(touchpoints))

    correct_word = ''
    for points in touchpoints:
        correct_word+= points[2]

    correct_words.append(correct_word)

# calculate the success rate for both the decoded words and the literal strings using the docoded word/literal string success count
for i in range(len(decoded_words)):
    if(correct_words[i] == decoded_words[i][0]):
        decoded_success_count+=1
    if(correct_words[i] == literal_strings[i]):
        literal_success_count +=1
# Write to results.txt
with open("results.txt", 'w') as output:
    # The first line: success_rate(decoded_words), success_rate(literal_strings)
    tmp = str(decoded_success_count/len(correct_words)) + "," + str(literal_success_count/len(correct_words)) + "\n"
    output.write(tmp)
    for i in range(len(decoded_words)):
        tmp = str(correct_words[i]) + ", "+ str(decoded_words[i][0])+ ", " + str(literal_strings[i])+ "\n"
        output.write(tmp)
        # Each line after: correct_word, decoded_word, literal_string
    output.close()

decoded word: ['a', 0.014206173329411445]
decoded word: ['problem', 0.000349393826075834]
decoded word: ['with', 0.005784631131583281]
decoded word: ['the', 0.054764326296966405]
decoded word: ['engine', 0.000257729463224217]
decoded word: ['please', 0.002945486717661759]
decoded word: ['provide', 0.00016935573000656184]
decoded word: ['your', 0.003577660256003527]
decoded word: ['date', 0.005067378771462644]
decoded word: ['we', 0.00490150821021408]
decoded word: ['run', 0.00021023232455039467]
decoded word: ['the', 0.05255184998822622]
decoded word: ['risk', 9.58565929246257e-05]
decoded word: ['of', 0.014930354593316276]
decoded word: ['failure', 0.00020223267657934883]
decoded word: ['my', 0.0024831199889199466]
decoded word: ['favorite', 0.00022005881555797779]
decoded word: ['place', 0.000537721797096096]
decoded word: ['to', 0.015444652579832812]
decoded word: ['visit', 0.0015833389170577716]
decoded word: ['circumstances', 0.00012797950738618208]
decoded word: ['are', 0.0244440