**Kaggle Group: Tim Hortons**

*   Jiayi Bao, jb2578
*   Siqi Dai, sd854
*   Guanyunbo Yang, gy92

# Model 1: Hidden Markov Model (HMM)

### <font color='navy'> Mount Google Drive and Retrieve Dataset </font>

In [0]:
# mount google drive
from google.colab import drive
drive.mount("/content/gdrive")
dataset = "/content/gdrive/My Drive/P2/data_release/"  # path of the dataset folder
# Derive the three split paths (training, validation, unlabeled test) from the
# shared dataset folder in one step.
train, validation, test = (
    dataset + file_name
    for file_name in ('train.csv', 'val.csv', 'test_no_label.csv')
)

Mounted at /content/gdrive


### <font color='navy'> Data Preprocessing </font>

In [0]:
import csv
def readCSV(fn):
  """Read a labelled CSV file and flatten it into token, tag and label lists.

  Every data row is expected to carry the sentence in column 0, a stringified
  list of POS tags in column 1 (e.g. "['DT', 'NN']") and a stringified list of
  labels in column 2 (e.g. "[0, 1]").

  Args:
    fn: path of the CSV file; it is decoded as latin-1.

  Returns:
    A list [sentences, tags, labels]. Each element is one flat list covering
    all rows in file order: lower-cased, whitespace-split tokens; POS tag
    strings; and label strings such as '0' / '1'.

  Raises:
    IndexError: if a non-empty row has fewer than three columns.
  """
  sentences, tags, labels = [], [], []
  # newline='' leaves line-ending handling to the csv module, so quoted fields
  # that contain line breaks are parsed correctly.
  with open(fn, encoding='latin-1', newline='') as f:
    # The reader is deliberately not called `readCSV`, which would shadow
    # this function's own name.
    reader = csv.reader(f, delimiter=',')
    for row in reader:
      if not row: continue  # blank line in the file
      if row[0] == 'sentence': continue  # ignore header line

      sentence, tag, label = row[0], row[1], row[2]

      # normalization: lower-case, then split on whitespace
      sentences.extend(sentence.lower().split())

      # "['DT', 'NN']" -> ['DT', 'NN']
      tags.extend(tag[1:-1].replace("'", "").split(", "))

      # "[0, 1]" -> ['0', '1']
      labels.extend(label[1:-1].split(", "))
  return [sentences, tags, labels]
  
# Parse the training CSV once; train_set is [tokens, POS tags, labels], each
# flattened over all sentences in file order.
train_set = readCSV(train)

### <font color='navy'> Implement HMM </font>

In [0]:
import collections
import math

# compute P(x2|x1)
def computeBigram(unigram, bigram, x1, x2):
  """Return the add-one smoothed estimate of P(x2 | x1).

  Args:
    unigram: mapping from item to its count.
    bigram: mapping from (first item, second item) to the pair's count.
    x1: the conditioning (previous) item.
    x2: the item whose conditional probability is wanted.

  Returns:
    (count(x1, x2) + 1) / (count(x1) + number of distinct unigram entries).
  """
  pair_count = bigram[(x1, x2)]
  context_count = unigram[x1]
  vocabulary_size = len(unigram)
  smoothed_numerator = pair_count + 1
  smoothed_denominator = context_count + vocabulary_size
  return smoothed_numerator / smoothed_denominator


# implement HMM using viterbi
def HMM(hidden_unigram, hidden_bigram, visible_unigram, combined_unigram, to_predict):
  """Decode the most likely 0/1 label sequence for one sentence with Viterbi.

  Scores are accumulated as sums of log-probabilities. Multiplying the raw
  probabilities underflows to 0.0 on long sentences, which leaves every
  back-pointer at its initial value and produces invalid labels.

  Args:
    hidden_unigram: mapping from label string ('0' / '1') to its count;
      missing keys must read as 0 (e.g. a collections.Counter).
    hidden_bigram: mapping from (previous label, label) to the pair's count.
    visible_unigram: token counts; kept for interface compatibility, not used.
    combined_unigram: mapping from (token, label) to the pair's count.
    to_predict: list of tokens forming one sentence.

  Returns:
    A list of ints (0 or 1), one per token; an empty list for an empty sentence.
  """
  NEG_INF = float('-inf')

  def log_ratio(numerator, denominator):
    # log(numerator / denominator); -inf stands in for a zero or undefined ratio
    if numerator <= 0 or denominator <= 0:
      return NEG_INF
    return math.log(numerator) - math.log(denominator)

  def log_emission(word, tag):
    # P(word | tag) estimated as (count(word, tag) + 1) / count(tag).
    # NOTE(review): the denominator has no vocabulary-size term, so this is not
    # a normalised add-one estimate - confirm whether that is intended.
    return log_ratio(combined_unigram[(word, tag)] + 1, hidden_unigram[tag])

  c = 2  # number of hidden states (labels '0' and '1')
  n = len(to_predict)
  if n == 0:
    return []

  tag_names = [str(k) for k in range(c)]
  # log P(tj | tm), computed once instead of once per token
  log_transition = [
      [log_ratio(hidden_bigram[(prev, cur)], hidden_unigram[prev]) for cur in tag_names]
      for prev in tag_names
  ]

  SCORE = [[NEG_INF] * n for _ in range(c)]
  BPTR = [[0] * n for _ in range(c)]

  # initialization: unnormalised prior count(ti) times the emission of w1
  for i, tag in enumerate(tag_names):
    SCORE[i][0] = log_ratio(hidden_unigram[tag], 1) + log_emission(to_predict[0], tag)

  # recursion
  for i in range(1, n):
    for j in range(c):
      emit = log_emission(to_predict[i], tag_names[j])
      best_score = NEG_INF
      best_prev = 0  # always a valid state index, even if every path is impossible
      for m in range(c):
        s = SCORE[m][i-1] + log_transition[m][j] + emit
        if s > best_score:  # strict comparison: ties keep the lower state
          best_score = s
          best_prev = m
      SCORE[j][i] = best_score
      BPTR[j][i] = best_prev

  # identify sequence by following the back-pointers from the best final state
  T = [0] * n
  T[-1] = 0 if SCORE[0][n-1] > SCORE[1][n-1] else 1
  for i in range(n-2, -1, -1):
    T[i] = BPTR[T[i+1]][i+1]
  return T
  

# Unpack the flattened training lists produced by readCSV.
sentences, tags, labels = train_set
visible_val = sentences
# (token, label) pairs, aligned by position in the flattened lists
combined = [(sentences[position], label) for position, label in enumerate(labels)]

# Count tables consumed by the decoders below.
visible_unigram = collections.Counter(visible_val)  # token counts
hidden_unigram = collections.Counter(labels)  # unigram for labels
# bigram for labels; the label list is flat, so pairs also span sentence boundaries
hidden_bigram = collections.Counter(zip(labels[:-1], labels[1:]))
combined_unigram = collections.Counter(combined)  # (token, label) counts

### <font color='navy'> Validation </font>

In [0]:

# s = ['four', 'alternative', 'approaches', 'have', 'been', 'described', ',', 'and', 'many', 'others', 'could', 'be', 'listed', '.']
# t = HMM(hidden_unigram, hidden_bigram, visible_unigram, combined_unigram, s)
# print(t)

res = []  # predicted labels for every validation token, in file order
with open(validation, encoding='latin-1', newline='') as f:
    # The reader gets its own name: binding it to `readCSV` would overwrite the
    # readCSV() function and break any later call to it (e.g. on a cell re-run).
    val_reader = csv.reader(f, delimiter=',')
    for row in val_reader:
      if not row: continue  # skip blank lines
      if row[0] == 'sentence': continue  # ignore header line
      sentence, tag, label = row[0], row[1], row[2]
      sentence = sentence.lower()  # normalization
      sentence = sentence.split()

      # POS tags are parsed but not used by the HMM decoder.
      tag = tag[1:-1]
      tag = tag.replace("'", "")
      tag = tag.split(", ")

      to_predict = sentence
      predict_label = HMM(hidden_unigram, hidden_bigram, visible_unigram, combined_unigram, to_predict)
      res += predict_label
      #print(predict_label)
      # Gold labels are parsed here but not compared with the prediction in this cell.
      label = label[1:-1]
      true_label = label.split(", ")
     

### <font color='navy'> Output Result to CSV </font>

In [0]:
def outputCSV(fn, res):
  """Write predictions to a two-column CSV file with the header idx,label.

  Args:
    fn: path of the output file; an existing file is overwritten.
    res: iterable of predicted labels; the k-th label is written with
      1-based index k.
  """
  # newline='' stops the text layer from translating the line endings that the
  # csv writer emits itself (otherwise blank rows appear on some platforms).
  with open(fn, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['idx', 'label'])
    writer.writerows(enumerate(res, start=1))
    
    
pred = []  # predicted labels for every test token, in file order
with open(test, encoding='latin-1', newline='') as f:
    # The reader gets its own name: binding it to `readCSV` would overwrite the
    # readCSV() function and break any later call to it (e.g. on a cell re-run).
    test_reader = csv.reader(f, delimiter=',')
    for row in test_reader:
      if not row: continue  # skip blank lines
      if row[0] == 'sentence': continue  # ignore header line
      sentence, tag = row[0], row[1]
      sentence = sentence.lower()  # normalization
      sentence = sentence.split()

      # POS tags are parsed but not used by the HMM decoder.
      tag = tag[1:-1]
      tag = tag.replace("'", "")
      tag = tag.split(", ")

      to_predict = sentence
      predict_label = HMM(hidden_unigram, hidden_bigram, visible_unigram, combined_unigram, to_predict)
      pred += predict_label

# Test-set predictions, plus the validation predictions collected in `res`.
outputCSV('output.csv', pred)
outputCSV('validation.csv', res)

# Model 2

### <font color='navy'> Feature Engineering </font>

In [0]:
# Install spaCy and download the medium English model, which supplies the word
# vectors used for feature extraction further down.
# NOTE(review): no versions are pinned here, so results may differ between
# runs - consider pinning the package and model versions.
!pip install spacy
!python -m spacy download en_core_web_md

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_md')


In [0]:
import numpy as np
import pandas as pd
import spacy
import en_core_web_md
# Load the spaCy pipeline once; nlp.vocab provides the word vectors looked up
# during feature extraction.
nlp = en_core_web_md.load()

#### Extract labels from dataset

In [0]:
# retrieve labels for all word tokens and save as integer list from a dataframe
def retrieve_label(df):
    """Collect every 0/1 label of a dataset into one flat integer array.

    Each entry of df["label_seq"] is scanned item by item; items equal to
    '0' or '1' become the integers 0 and 1, anything else (brackets, commas,
    spaces) is skipped. The total number of labels is printed.

    Args:
        df: table-like object whose "label_seq" column yields one label
            sequence per sentence.

    Returns:
        A NumPy array holding the labels of all sentences, in order.
    """
    label_flatten = [
        int(symbol)
        for sequence in df["label_seq"]
        for symbol in sequence
        if symbol in ('0', '1')
    ]
    print("Dataset has", len(label_flatten), "labels")
    return np.asarray(label_flatten)

#### Helper function to extract tags as categorical features

In [0]:
#encode pos (string -> categorical)
from sklearn.preprocessing import LabelEncoder
def encode_pos(df):
    """Encode the POS tags in df["pos_seq"] as integer category codes.

    Each entry of df["pos_seq"] is a stringified tag list such as
    "['DT', 'NN']". The code of a tag is its position in the sorted set of
    distinct tags found in this df.

    NOTE(review): the codes are derived from the tags present in the given df
    only, so the same tag can receive different codes for different datasets
    (e.g. train vs. validation) if their tag inventories differ - confirm.

    Args:
        df: table-like object whose "pos_seq" column yields one stringified
            tag list per sentence.

    Returns:
        A list with one list of int codes per sentence, in row order.
    """
    # clean up: "['DT', 'NN']" -> ['DT', 'NN']
    pos_lists_str = [
        row.replace("'", "").replace("[", "").replace("]", "").split(', ')
        for row in df["pos_seq"]
    ]
    all_tags = [tag for tag_list in pos_lists_str for tag in tag_list]
    pos_cat = np.unique(all_tags)  # sorted distinct tags

    # string -> category: one dict lookup per tag instead of scanning the
    # category array and converting a one-element array to int (deprecated).
    tag_to_code = {tag: code for code, tag in enumerate(pos_cat.tolist())}
    return [[tag_to_code[tag] for tag in tag_list] for tag_list in pos_lists_str]

#### Extract features from dataset


In [0]:
# Feature Engineering   ["hey","ha","hu"]
def retrieve_features(df):
    """Build one feature vector per word for every sentence in df.

    For each word the vector concatenates, in this order: the word vector of
    the word itself, of the previous word and of the next word (a " " dummy
    token pads both sentence ends), three flags (contains a digit, a hyphen,
    an upper-case letter), and the POS codes of the current, previous and
    next word.

    Args:
        df: DataFrame with a "sentence" column (words separated by single
            spaces) and the "pos_seq" column expected by encode_pos.

    Returns:
        A NumPy array with one row per word, sentences concatenated in row
        order. The number of features and words is printed.
    """
    # Placeholder codes used where a word has no previous / next neighbour.
    # NOTE(review): 0 coincides with a real category code and 16 is hard-coded;
    # presumably chosen for the training data's tag inventory - confirm.
    NO_PREVIOUS_TAG = 0
    NO_NEXT_TAG = 16

    X = []

    # retrieve pos as categorical feature
    pos_tags = encode_pos(df)

    # process each sentence in the dataset; tag lists are matched by row
    # position because encode_pos returns a plain list in row order, which a
    # DataFrame index label does not necessarily index correctly
    for row_position, (_, df_tuple) in enumerate(df.iterrows()):

        tag_list = pos_tags[row_position]

        sentence = df_tuple["sentence"].split(" ")
        sentence.insert(0, " ")   # add dummy words to the front and end of sentence
        sentence.append(" ")

        # word -> vector
        vector_list = [nlp.vocab.get_vector(token) for token in sentence]

        first_word, last_word = 1, len(sentence) - 2
        for index in range(first_word, last_word + 1):

            word = sentence[index]

            # window = 3: current, previous and next word vectors
            current_vec = vector_list[index]
            pre_vec = vector_list[index-1]
            post_vec = vector_list[index+1]

            # contains number?
            has_number = [any(char.isdigit() for char in word)]

            # contains hyphen?
            has_hyphen = [any(char == "-" for char in word)]

            # contains capital letter?
            has_capital = [any(char.isupper() for char in word)]

            # tags of the current / previous / next word; tag_list has no
            # padding, so word `index` maps to tag_list[index-1]
            curr_tag = [tag_list[index-1]]
            pre_tag = [tag_list[index-2]] if index != first_word else [NO_PREVIOUS_TAG]
            post_tag = [tag_list[index]] if index != last_word else [NO_NEXT_TAG]

            # combine all features
            features = np.concatenate((current_vec, pre_vec, post_vec, has_number, has_hyphen, has_capital, curr_tag, pre_tag, post_tag))
            X.append(features)

    # report from X itself so an empty dataset does not hit an undefined name
    n_features = len(X[0]) if X else 0
    print("Extracted",n_features,"features for all",len(X)," words")
    return np.asarray(X)

In [0]:
#df = pd.read_csv('train.csv', engine='python')
# Load the training split and build the per-word feature matrix X and the
# per-word label vector Y.
df = pd.read_csv(train, engine='python')
X = retrieve_features(df)
Y = retrieve_label(df)
# Shown as the cell result: (number of words, number of features).
X.shape

Extracted 906 features for all 116622  words
Dataset has 116622 labels


(116622, 906)

### <font color='navy'> Training </font>

In [0]:
from sklearn import preprocessing

# Pair every training token with its POS tag (both lists are flattened across
# all sentences).
feature_set = list(zip(train_set[0], train_set[1]))
# Show only a small sample: printing the whole list floods the cell output.
print(feature_set[:10])



In [0]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# NOTE(review): the saved output of this cell is a TypeError whose cause is not
# visible here - re-run on a fresh kernel and confirm the cell completes.
gnb = GaussianNB() #Gaussian Naive Bayes classifier
maxent = LogisticRegression(random_state=0, solver='lbfgs',max_iter=1000, multi_class='multinomial') #Logistic Regression/MaxEnt classifier
# Both models are trained on the per-word feature matrix X and label vector Y.
maxent.fit(X, Y)
# NOTE(review): gnb is fitted here, but the later cells visible in this
# notebook only pass maxent to the decoder.
gnb.fit(X, Y)



TypeError: ignored

### <font color='navy'> Viterbi </font>

In [0]:
# implement viterbi with classifier
def Viterbi_with_classifier(hidden_unigram, hidden_bigram, visible_unigram, combined_unigram, classifier, to_predict, startIdx, featuresets):
  """Decode a 0/1 label sequence for one sentence, scoring with a classifier.

  The score of giving token i the label j combines the count-based emission
  (count(token, j) + 1) / count(j) with the classifier's probability of label
  j for that token's feature vector. The classifier probability is read for
  the label being scored (j) at every position, including the recurrence.
  Scores are summed in log space so long sentences do not underflow to zero.
  No transition term P(tj | tm) enters the score.

  Args:
    hidden_unigram: mapping from label string ('0' / '1') to its count;
      missing keys must read as 0 (e.g. a collections.Counter).
    hidden_bigram: label-pair counts; kept for interface compatibility, not used.
    visible_unigram: token counts; kept for interface compatibility, not used.
    combined_unigram: mapping from (token, label) to the pair's count.
    classifier: object with predict_proba(rows) returning one probability row
      per input row. NOTE(review): column k is assumed to be the probability
      of label k - confirm against the classifier's class order.
    to_predict: list of tokens forming one sentence.
    startIdx: position in featuresets of the first token's feature vector.
    featuresets: sliceable sequence of feature vectors, one per token, with
      all sentences concatenated.

  Returns:
    A list of ints (0 or 1), one per token; an empty list for an empty sentence.

  Raises:
    IndexError: if featuresets does not hold a feature vector for every token.
  """
  NEG_INF = float('-inf')

  def safe_log(value):
    # log(value), with -inf standing in for a zero probability
    return math.log(value) if value > 0 else NEG_INF

  c = 2  # number of hidden states (labels '0' and '1')
  n = len(to_predict)
  if n == 0:
    return []

  window = featuresets[startIdx:startIdx + n]
  if len(window) != n:
    raise IndexError("featuresets has no feature vector for every token of the sentence")
  # One batched classifier call per sentence instead of c*c calls per token.
  posteriors = classifier.predict_proba(window)

  tag_names = [str(k) for k in range(c)]

  def log_local(i, j):
    # log of [P(wi|tj) from counts] * [P(tj|features of wi) from the classifier]
    tag_count = hidden_unigram[tag_names[j]]
    if tag_count <= 0:
      return NEG_INF
    emission = (combined_unigram[(to_predict[i], tag_names[j])] + 1) / tag_count
    return safe_log(emission) + safe_log(posteriors[i][j])

  SCORE = [[NEG_INF] * n for _ in range(c)]
  BPTR = [[0] * n for _ in range(c)]

  # initialization
  for j in range(c):
    SCORE[j][0] = log_local(0, j)

  # recursion
  for i in range(1, n):
    for j in range(c):
      local = log_local(i, j)
      best_score = NEG_INF
      best_prev = 0  # always a valid state index
      for m in range(c):
        s = SCORE[m][i-1] + local
        if s > best_score:  # strict comparison: ties keep the lower state
          best_score = s
          best_prev = m
      SCORE[j][i] = best_score
      BPTR[j][i] = best_prev

  # identify sequence by following the back-pointers from the best final state
  T = [0] * n
  T[-1] = 0 if SCORE[0][n-1] > SCORE[1][n-1] else 1
  for i in range(n-2, -1, -1):
    T[i] = BPTR[T[i+1]][i+1]
  return T




### <font color='navy'> Validation </font>

In [0]:
df_val = pd.read_csv(validation, engine='python')
valX = retrieve_features(df_val)  # one feature row per validation word, sentences concatenated
res = []  # predicted labels for every validation token, in file order
startIdx = 0  # row of valX that belongs to the first token of the current sentence
with open(validation, encoding='latin-1', newline='') as f:
    # The reader gets its own name: binding it to `readCSV` would overwrite the
    # readCSV() function and break any later call to it (e.g. on a cell re-run).
    val_reader = csv.reader(f, delimiter=',')
    for row in val_reader:
      if not row: continue  # skip blank lines
      if row[0] == 'sentence': continue  # ignore header line
      sentence, tag, label = row[0], row[1], row[2]
      sentence = sentence.lower()  # normalization
      # NOTE(review): tokens are split on any whitespace here, while
      # retrieve_features splits on single spaces; the startIdx bookkeeping
      # assumes both yield the same token count per sentence - confirm.
      sentence = sentence.split()

      # POS tags are parsed but not used by the decoder call below.
      tag = tag[1:-1]
      tag = tag.replace("'", "")
      tag = tag.split(", ")

      to_predict = sentence
      predict_label = Viterbi_with_classifier(hidden_unigram, hidden_bigram, visible_unigram, combined_unigram, maxent, sentence, startIdx, valX)
      startIdx += len(sentence)
      res += predict_label
      #print(predict_label)
      # Gold labels are parsed here but not compared with the prediction in this cell.
      label = label[1:-1]
      true_label = label.split(", ")

outputCSV('validation3.csv', res)

Extracted 906 features for all 38628  words


### <font color='navy'> Output Result to CSV </font>

In [0]:
def outputCSV(fn, res):
  """Write predictions to a two-column CSV file with the header idx,label.

  This repeats the outputCSV definition from the Model 1 section so that this
  cell also works when run on its own.

  Args:
    fn: path of the output file; an existing file is overwritten.
    res: iterable of predicted labels; the k-th label is written with
      1-based index k.
  """
  # newline='' stops the text layer from translating the line endings that the
  # csv writer emits itself (otherwise blank rows appear on some platforms).
  with open(fn, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['idx', 'label'])
    writer.writerows(enumerate(res, start=1))
    
df_test = pd.read_csv(test, engine='python')
testX = retrieve_features(df_test)  # one feature row per test word, sentences concatenated
res = []
startIdx = 0  # row of testX that belongs to the first token of the current sentence
pred = []  # predicted labels for every test token, in file order
with open(test, encoding='latin-1', newline='') as f:
    # The reader gets its own name: binding it to `readCSV` would overwrite the
    # readCSV() function and break any later call to it (e.g. on a cell re-run).
    test_reader = csv.reader(f, delimiter=',')
    for row in test_reader:
      if not row: continue  # skip blank lines
      if row[0] == 'sentence': continue  # ignore header line
      sentence, tag = row[0], row[1]
      sentence = sentence.lower()  # normalization
      # NOTE(review): tokens are split on any whitespace here, while
      # retrieve_features splits on single spaces; the startIdx bookkeeping
      # assumes both yield the same token count per sentence - confirm.
      sentence = sentence.split()

      # POS tags are parsed but not used by the decoder call below.
      tag = tag[1:-1]
      tag = tag.replace("'", "")
      tag = tag.split(", ")

      to_predict = sentence
      predict_label = Viterbi_with_classifier(hidden_unigram, hidden_bigram, visible_unigram, combined_unigram, maxent, sentence, startIdx, testX)
      startIdx += len(sentence)
      pred += predict_label

#outputCSV('validation2.csv', res)
outputCSV('output_model2.csv', pred)
#print(pred)

Extracted 906 features for all 50175  words
