In [36]:

#reading file

def read_labeled_file(filename):
    '''
    Read a labeled training file.

    The file is expected to hold one space-separated "word tag" entry
    per line, with a blank line closing each tweet.

    Takes the path to the file.
    Returns a list of tweets; each tweet is a list of tuples built by
    splitting the line on single spaces (normally (word, tag)).

    A final tweet that is not followed by a blank line is still
    returned, and runs of blank lines never produce empty tweets.
    '''
    result = []
    singletweet = []
    with open(filename, "r") as f:
        for line in f:
            if line == "\n":
                # a blank line closes the current tweet; ignore it when
                # nothing has been collected (leading/repeated blanks)
                if singletweet:
                    result.append(singletweet)
                    singletweet = []
            else:
                linelist = line.strip("\n").split(" ")
                singletweet.append(tuple(linelist))
    # flush the last tweet when the file has no trailing blank line
    if singletweet:
        result.append(singletweet)
    return result



def estimate_emission_param(data):
    '''
    Estimate emission parameters from labeled tweets.

    data is a list of tweets; each tweet is a list of tuples whose
    first element is the word and whose second element is the tag.

    Returns a 3-tuple:
        - a keys view of every word seen
        - a keys view of every tag seen
        - a dictionary mapping each tagged-word tuple to
          count(tagged word) / count(tag)
    '''
    pair_totals = {}
    word_totals = {}
    tag_totals = {}

    def bump(table, key):
        # add one to table[key], starting from zero for unseen keys
        table[key] = table.get(key, 0) + 1

    for tweet in data:
        for pair in tweet:
            token, label = pair[0], pair[1]
            bump(word_totals, token)
            bump(tag_totals, label)
            bump(pair_totals, pair)

    # every pair count is normalised by the total count of its tag
    emission = {}
    for pair, seen in pair_totals.items():
        emission[pair] = seen / tag_totals[pair[1]]

    return word_totals.keys(), tag_totals.keys(), emission
def supress_infrequent_words(data, k=3):
    '''
    Replace rare words with the #UNK# placeholder.

    data is a list of tweets, each a list of (word, tag) tuples.
    A word is considered known when it occurs at least k times across
    all of data; any other word is swapped for "#UNK#" while its tag
    is kept.

    Returns a new list of tweets; data itself is left untouched.
    '''
    # first pass: tally how often each word appears
    frequency = {}
    for tweet in data:
        for pair in tweet:
            token = pair[0]
            if token in frequency:
                frequency[token] += 1
            else:
                frequency[token] = 1

    # second pass: rebuild every tweet, masking the rare words
    return [
        [pair if frequency[pair[0]] >= k else ("#UNK#", pair[1])
         for pair in tweet]
        for tweet in data
    ]
def single_sentiment_analysis(tags, param, word):
    '''
    Pick the tag with the highest emission parameter for one word.

    Takes:
        - an iterable of candidate tags
        - a dictionary of emission parameters keyed by (word, tag)
        - the word to be tagged
    Returns:
        - a (word, predicted_tag) tuple; the tag is "O" when no
          (word, tag) entry has a value above zero
    '''
    best_tag = "O"   # fallback for words with no usable entry
    best_score = 0
    for candidate in tags:
        key = (word, candidate)
        if key not in param:
            continue
        score = param[key]
        # strict comparison: the first tag to reach the top score wins ties
        if score > best_score:
            best_tag, best_score = candidate, score
    return (word, best_tag)
def write_simple_prediction(country, part, words, tags, param):
    '''
    Tag every word of <country>/dev.in using the emission parameters
    only and write the result to <country>/dev.p<part>.out.

    takes:
        - country string ("CN","EN" etc), used as the directory name
        - part string (for question part 1, part 2 etc)
        - an iterable of discovered (known) words
        - a list of discovered tags
        - a dictionary of emission parameter keyed by (word, tag)

    Each output line is "<word> <tag>"; blank lines (tweet separators)
    are copied through. A word that is not in `words` is scored as
    "#UNK#" but is written under its original spelling, so the output
    stays aligned with the input line by line.
    '''
    input_filename = country + "/dev.in"
    output_filename = country + "/dev.p"+part+".out"
    # set membership stays O(1) even when the caller passes a list
    known_words = set(words)
    with open(input_filename, "r") as inputfile:
        with open(output_filename, "w") as outputfile:
            for line in inputfile:
                if line == "\n":
                    outputfile.write("\n")
                    continue
                word = line.strip("\n")
                lookup = word if word in known_words else "#UNK#"
                # single_sentiment_analysis falls back to tag "O" when
                # the lookup word has no emission entry above zero
                predicted_tag = single_sentiment_analysis(tags, param, lookup)[1]
                outputfile.write(word + " " + predicted_tag + "\n")

from datetime import datetime
# Part 2: emission-only tagging for all 4 countries, with timing.
for c in ["CN (1)", "EN", "SG(1)", "FR"]:
    start = datetime.now()
    data = read_labeled_file(c+"/train")
    # Rare words are replaced by #UNK# *before* estimating parameters,
    # so that the model has an emission entry for unseen words.
    sdata = supress_infrequent_words(data)
    words, tags, em_param = estimate_emission_param(sdata)
    write_simple_prediction(c,"2",
                    words, tags, em_param)
    end = datetime.now()
    delt = end - start
    # total_seconds() keeps the fractional part zero-padded; joining
    # .seconds and .microseconds with "." would print 0.046306s as 0.46306s
    print("{} part 2 done in {:.6f}s"
          .format(c, delt.total_seconds()))


            
    

CN (1) part 2 done in 0.425881s
EN part 2 done in 0.46306s
SG(1) part 2 done in 1.285997s
FR part 2 done in 0.159711s


In [39]:
def create_transition_parameter(data):
    '''
    Estimate tag-to-tag transition parameters from labeled tweets.

    data is a list of tweets; each tweet is a list of tuples whose
    second element is the tag.

    Every non-empty tweet contributes the transitions
        START -> first tag, tag[i] -> tag[i+1], ..., last tag -> STOP
    Empty tweets are skipped.

    Returns a dictionary keyed by (tag0, tag1) holding
        count(tag0 -> tag1) / count(transitions leaving tag0)
    '''
    tag_transition_count = {}
    # (tag0, tag1): count
    tag_count = {}
    # tag0: number of transitions leaving tag0
    for tweet in data:
        if not tweet:
            continue
        # pad the tag sequence so the first and last tags get their
        # START / STOP transitions like any other pair
        tag_sequence = ["START"] + [tagged_word[1] for tagged_word in tweet] + ["STOP"]
        for tag0, tag1 in zip(tag_sequence, tag_sequence[1:]):
            tag_count[tag0] = tag_count.get(tag0, 0) + 1
            tag_transition = (tag0, tag1)
            tag_transition_count[tag_transition] = tag_transition_count.get(tag_transition, 0) + 1
    transition_parameter = {k: tag_transition_count[k]/tag_count[k[0]]
                            for k in tag_transition_count}
    return transition_parameter


In [90]:
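# Takes the emission parameters, the transition parameters and an input file,
# and predicts the tag sequence of every tweet.
# Observations = words, hidden states = tags.
# The score of the first word is start_p * emission_p, where
#   start_p    = count(tweets starting with the tag) / count(all tweets)
#   emission_p = count(word emitted by the tag) / count(the tag)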

def create_start_parameter(data):
    '''
    Estimate how likely each tag is to open a tweet.

    data is a list of tweets; each tweet is a list of tuples whose
    second element is the tag. Empty tweets have no first tag and are
    ignored (they are not counted in the total either).

    Returns a dictionary mapping tag to
        count(tweets starting with tag) / count(non-empty tweets)
    '''
    start_tag_count = {}
    # tag: count
    total_tweet_count = 0
    for tweet in data:
        if not tweet:
            # nothing to read a start tag from
            continue
        start_tag = tweet[0][1]
        start_tag_count[start_tag] = start_tag_count.get(start_tag, 0) + 1
        total_tweet_count += 1
    start_parameter = {k: start_tag_count[k]/total_tweet_count
                       for k in start_tag_count}
    return start_parameter

    
        
def viterbi(words, tags, start_p, trans_p, emit_p):
    '''
    Find the most probable tag sequence for a list of words.

    Takes:
        - words:   list of observed words
        - tags:    iterable of candidate tags (hidden states)
        - start_p: dictionary tag -> probability of opening a sequence
        - trans_p: dictionary (previous_tag, tag) -> transition probability
        - emit_p:  dictionary (word, tag) -> emission probability
      Entries missing from any dictionary count as probability 0.
    Returns:
        - a (probability, tag_list) tuple for the best path; ties are
          broken in favour of the greater tag, because candidates are
          compared as (probability, tag) tuples.
          An empty word list yields (1.0, []) (the empty product).
    '''
    if not words:
        return (1.0, [])

    # materialise once: tags is walked many times below
    tags = list(tags)

    # best_prob[tag] / best_path[tag]: score and tag list of the best
    # path for the words seen so far that ends in `tag`
    best_prob = {}
    best_path = {}
    for tag in tags:
        best_prob[tag] = start_p.get(tag, 0) * emit_p.get((words[0], tag), 0)
        best_path[tag] = [tag]

    for word in words[1:]:
        next_prob = {}
        next_path = {}
        for tag in tags:
            prob, previous = max(
                [(best_prob[prev] * trans_p.get((prev, tag), 0) * emit_p.get((word, tag), 0), prev)
                 for prev in tags])
            next_prob[tag] = prob
            next_path[tag] = best_path[previous] + [tag]
        # only the paths ending at the current word are needed from here on
        best_prob, best_path = next_prob, next_path

    prob, last = max([(best_prob[tag], tag) for tag in tags])
    return (prob, best_path[last])




def write_hmm_prediction(country, part, prediction_function,
                         word_sequence,tags, start_param, emit_param, trans_param):
    '''
    Function to write HMM prediction

    Reads <country>/dev.in (one word per line, tweets separated by a
    blank line), predicts a tag sequence for every tweet and writes
    "<word> <tag>" lines to <country>/dev.p<part>.out, with a blank line
    after each tweet.

    takes:
        - country string, used as the directory name
        - part string, used in the output file name
        - prediction_function(words, tags, start_param, trans_param,
          emit_param); element [1] of its result is used as the tag list
        - word_sequence: the known (training) words. Input words that
          are not in it are passed to the prediction function as
          "#UNK#"; the output always carries the original word.
          When it is None or empty no substitution is done.
        - tags, start_param, emit_param, trans_param: forwarded to the
          prediction function
    '''
    input_filename = country + "/dev.in"
    output_filename = country + "/dev.p"+part+".out"
    known_words = set(word_sequence) if word_sequence else None

    #read and separate tweets
    with open(input_filename, "r") as infile:
        indata = infile.read().strip('\n').split('\n\n')

    with open(output_filename, "w") as outfile:
        for tweet in indata:
            tweet_words = tweet.split('\n')
            if known_words is None:
                model_words = tweet_words
            else:
                # unseen words have no emission entry of their own, so
                # they are scored through the #UNK# placeholder
                model_words = [w if w in known_words else "#UNK#"
                               for w in tweet_words]
            predicted_tag_sequence = prediction_function(model_words,
                                                tags, start_param, trans_param, emit_param)[1]
            if len(tweet_words) != len(predicted_tag_sequence):
                print("WARNING!! Different length {} / {}"
                      .format(tweet_words, predicted_tag_sequence))
            for i in range(len(tweet_words)):
                line = "{} {}\n".format(tweet_words[i],
                                        predicted_tag_sequence[i])
                outfile.write(line)
            outfile.write("\n")


from datetime import datetime
# Part 3: Viterbi tagging for all 4 countries, with timing.
for c in ["CN (1)", "EN", "SG(1)", "FR"]:
    start = datetime.now()
    data = read_labeled_file(c+"/train")
    supress_data = supress_infrequent_words(data)
    words, tags, emit_param = estimate_emission_param(supress_data)
    trans_param = create_transition_parameter(supress_data)
    start_param = create_start_parameter(supress_data)
    write_hmm_prediction(c,"3", viterbi,
                        words, tags, start_param, emit_param, trans_param)
    end = datetime.now()
    delt = end - start
    # total_seconds() keeps the fractional part zero-padded; joining
    # .seconds and .microseconds with "." would print 0.046306s as 0.46306s
    print("{} part 3 done in {:.6f}s"
          .format(c, delt.total_seconds()))








CN (1) part 3 done in 1.282776s
EN part 3 done in 0.338971s
SG(1) part 3 done in 5.590018s
FR part 3 done in 0.267118s
