In [34]:
#Just like in P2(a), perform POS Tagging on the Brown corpus. (Like before, train your Logistic Regression model on the
#tagged corpus, and test on the untagged one). 
#Use one vs all logistic regression to perform this exercise. 
#Essentially, given a word, try to classify it with classifiers trained for all pos tags and get most probable one.
#Do NOT use any ML libraries like scipy for coding up the logistic regression. NLTK may be allowed, but only for
#getting the corpus.
!pip install -U gensim
!pip install nltk
!pip install Cython
!pip install fasttext 
import numpy as np
import cPickle as pickle
import nltk
nltk.download('brown')
from nltk.corpus import brown
import os
from copy import deepcopy
from collections import Counter
from collections import defaultdict
import gensim.downloader as api
from gensim.models import FastText
# Load the pre-trained fastText binary model; "wiki.en.bin" is a relative path, so it is looked up
# relative to the current working directory.
model = FastText.load_fasttext_format("wiki.en.bin")
# Directory used by the save_*/load_* helpers for pickled data and models.
# NOTE(review): '__file__' is a quoted string literal here, not the __file__ variable, so realpath()
# resolves it against the current working directory; the result is "<cwd>/saved_models"
# (presumably intentional, since notebooks do not define __file__ -- confirm).
script_path=os.path.dirname(os.path.realpath('__file__'))+"/saved_models"

Requirement already up-to-date: gensim in ./py3_env/lib/python3.5/site-packages (3.6.0)
[nltk_data] Downloading package brown to
[nltk_data]     /home/priyansh.agrawal/nltk_data...
[nltk_data]   Package brown is already up-to-date!
/home/priyansh.agrawal/saved_models


In [35]:

#one way is using class (optional), advantage being you can create multiple instances of class and train for each pos tag

class logistic_regression:
    """Linear classifier trained with full-batch gradient descent.

    Two modes share the same weight matrix ``self.W``:

    * binary logistic regression (``softmax=False``): labels are 0/1 and
      ``W`` has a single row;
    * multinomial softmax regression (``softmax=True``): labels are
      0..K-1 and ``W`` has one row per class.

    All data matrices are column-major: ``X`` has shape [D, N] (one sample
    per column). No bias term is added here; callers append a constant row
    to ``X`` if they want one.
    """

    def __init__(self):
        self.W = None # weight matrix, created lazily on the first train() call

    def train(self, X, y,learning_rate=1e-4,reg = 1e3,num_iters=1000,softmax=False):
        """Run ``num_iters`` steps of batch gradient descent.

        Args:
            X: [D, N] array of training samples (one per column).
            y: length-N integer labels in 0..K-1.
            learning_rate: gradient descent step size.
            reg: L2 regularization strength.
            num_iters: number of gradient steps.
            softmax: use the softmax loss instead of the logistic loss.

        Returns:
            List with the loss value computed at every iteration.
        """
        dim, num_train = X.shape
        num_classes = np.max(y) + 1 # assume y takes values 0...K-1 where K is number of classes
        if self.W is None:
            # initialize the weights with small values
            if num_classes == 2 and not softmax:
                # binary logistic regression only needs weights for one class
                self.W = np.random.randn(1, dim) * 0.001
            else:
                # one weight row per class; softmax indexes rows by label, so
                # it needs K rows even when K == 2
                self.W = np.random.randn(num_classes, dim) * 0.001
        # Report roughly ten times per run; the max() keeps the modulus
        # non-zero when num_iters < 10.
        report_every = max(1, num_iters // 10)
        losses_history = [] #(list) of losses at each training iteration
        for i in range(num_iters):
            loss, grad = self.loss_grad(X, y, reg, softmax) # grad => [K x D]
            losses_history.append(loss)
            # update weights
            self.W -= learning_rate * grad # [K x D]
            if i % report_every == 0:
                print('iteration %d/%d: loss %f' % (i, num_iters, loss))
        return losses_history

    def _sigmoid(self, z):
        """Element-wise logistic function that never overflows in exp()."""
        # exp() is only evaluated on non-positive values.
        e = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

    def predict(self, X,softmax=False):
        """Predict labels for the columns of ``X``.

        Args:
            X: [D, N] array of samples.
            softmax: if True return the arg-max class per column, otherwise
                threshold the single linear score at 0.

        Returns:
            Tuple ``(pred_ys, h_x_mat)``. ``pred_ys`` is a boolean array in
            logistic mode or an integer class array in softmax mode.
            ``h_x_mat`` is the sigmoid of the raw linear scores with
            singleton dimensions squeezed out (in both modes).
        """
        f_x_mat = self.W.dot(X)
        if not softmax:
            pred_ys = f_x_mat.squeeze() >= 0
        else: # use softmax for multi-class classification
            pred_ys = np.argmax(f_x_mat, axis=0) #(N, ) 1-dimension array of y for N samples
        # normalized score
        h_x_mat = self._sigmoid(f_x_mat).squeeze()
        return pred_ys, h_x_mat

    def loss_grad_softmax(self,X, y,reg):
        """Compute the softmax cross-entropy loss and its gradient (vectorized).

        Returns:
            Tuple ``(loss, grad)`` where ``grad`` has the shape of ``self.W``.
        """
        num_train = X.shape[1]
        cols = np.arange(num_train)
        scores = self.W.dot(X) # [K, N]
        # Shift every column so that its own highest score is 0. A per-column
        # shift leaves the softmax unchanged and guarantees that each column
        # sum of exponentials is at least 1 (no 0/0).
        scores -= np.max(scores, axis=0, keepdims=True)
        scores_exp = np.exp(scores)
        scores_exp_sum = np.sum(scores_exp, axis=0) # [N, ]
        # log-softmax of the correct class, computed without log(0)
        correct_log_probs = scores[y, cols] - np.log(scores_exp_sum)
        loss = -np.sum(correct_log_probs) / float(num_train)
        loss += 0.5 * reg * np.sum(self.W * self.W)
        scores_exp_normalized = scores_exp / scores_exp_sum
        # deal with the correct class
        scores_exp_normalized[y, cols] -= 1 # [K, N]
        grad = scores_exp_normalized.dot(X.T) / float(num_train)
        grad += reg * self.W
        return loss, grad

    def loss_grad_logistic(self,X, y, reg):
        """Compute the binary cross-entropy loss and its gradient (vectorized).

        Returns:
            Tuple ``(loss, grad)`` where ``grad`` has the shape of ``self.W``.
        """
        num_train = X.shape[1]
        y = np.asarray(y)
        h_x_mat = self._sigmoid(self.W.dot(X)) # [1, N]
        # Clip only inside the logarithms so saturated predictions give a
        # large finite loss instead of inf/nan; the gradient uses the
        # unclipped probabilities.
        eps = 1e-12
        h_safe = np.clip(h_x_mat, eps, 1.0 - eps)
        log_likelihood = np.sum(y * np.log(h_safe) + (1 - y) * np.log(1 - h_safe))
        loss = -log_likelihood / float(num_train) + 0.5 * reg * np.sum(self.W * self.W)
        grad = (h_x_mat - y).dot(X.T) / float(num_train) + reg * self.W # [1, D]
        return loss, grad

    def loss_grad(self, X, y, reg,softmax):
        """Dispatch to the logistic or softmax loss.

        Returns:
            loss: (float)
            grad: (array) with respect to self.W
        """
        if not softmax:
            return self.loss_grad_logistic(X, y, reg)
        else:
            return self.loss_grad_softmax(X, y, reg)

In [36]:
def predict_one_vs_all(logistic_classifiers, X, num_classes):
    """Pick, for every column of ``X``, the class whose binary classifier scores highest.

    Args:
        logistic_classifiers: sequence indexed by class id; each element must
            expose ``predict(X)`` returning ``(labels, scores)``.
        X: [D, N] array of samples (one per column).
        num_classes: number of classifiers to evaluate (the first
            ``num_classes`` entries of ``logistic_classifiers`` are used).

    Returns:
        Length-N integer array with the winning class index per sample.
    """
    scores = np.zeros((num_classes, X.shape[1]))
    # range() instead of xrange() so the function also runs on Python 3.
    for class_idx in range(num_classes):
        # second element of predict() holds the per-sample scores
        scores[class_idx, :] = logistic_classifiers[class_idx].predict(X)[1]
    return np.argmax(scores, axis=0)

def train_one_vs_all(X_train,y_train,learning_rate,reg,num_iters):
    """Train one binary logistic classifier per class (one-vs-all).

    Args:
        X_train: [D, N] array of training samples (one per column).
        y_train: length-N integer labels in 0..K-1.
        learning_rate: gradient descent step size passed to each classifier.
        reg: L2 regularization strength passed to each classifier.
        num_iters: number of gradient steps per classifier.

    Returns:
        List of K trained ``logistic_regression`` instances; element ``i``
        separates class ``i`` (label 1) from all other classes (label 0).
    """
    labels = np.asarray(y_train)
    num_classes = np.max(labels) + 1
    logistic_classifiers = []
    for i in range(num_classes):
        print('\nThe %d/%dth logistic classifier training...' % (i+1, num_classes))
        # Build the 0/1 targets in a single comparison. Doing it as two
        # in-place assignments (!=i -> 0, then ==i -> 1) is wrong for i == 0,
        # because every entry equals 0 after the first step.
        y_train_logistic = (labels == i).astype(int)
        logistic = logistic_regression()
        logistic.train(X_train, y_train_logistic,learning_rate,reg,num_iters)
        logistic_classifiers.append(logistic)
    return logistic_classifiers

Task 2 : Predict tag sequence and get accuracy

In [37]:
def pred_tag_sequence(sentence,ft_model,model,class_map,model_name="logi",mean_vec=None):
    """Predict and print one POS tag per token of ``sentence``.

    Each token is embedded independently with ``word_to_vec``; no context
    features are used.

    Args:
        sentence: list of tokens.
        ft_model: embedding model passed to ``word_to_vec``.
        model: a list of one-vs-all classifiers when ``model_name == "logi"``,
            otherwise a single object whose ``predict(X, softmax=True)``
            returns ``(labels, scores)``.
        class_map: dict mapping tag -> class index.
        model_name: ``"logi"`` selects one-vs-all prediction; any other value
            selects the softmax model.
        mean_vec: optional length-D vector subtracted from every embedding.
            Pass the training-set feature mean to centre inputs the same way
            ``preprocessing`` does. When omitted, the mean of the sentence's
            own embeddings is used (the original behaviour), which differs
            from the training-time centering and maps a one-word sentence to
            the zero vector.

    Returns:
        None. The predicted tags are printed via ``print_tags``.
    """
    ##assuming sentence is list of tokens
    num_classes = len(class_map)
    if len(sentence) == 0:
        # nothing to embed; report an empty tag list instead of failing on sentence[0]
        print_tags(class_map, [])
        return
    # Stack all embeddings at once: always 2-D ([N, D]) even for one token,
    # and avoids re-copying the matrix for every word.
    X = np.vstack([word_to_vec(ft_model, token) for token in sentence])
    if mean_vec is None:
        mean_vec = np.mean(X, axis = 0)
    X = X - mean_vec
    # append the bias feature and switch to the [D+1, N] layout the models expect
    X = np.hstack([X, np.ones((X.shape[0], 1))]).T
    if model_name!="logi":
        y_pred = model.predict(X,softmax=True)[0]
    else:
        y_pred= predict_one_vs_all(model, X, num_classes)
    print_tags(class_map,y_pred)

In [38]:
def print_tags(class_map,y_pred):
    """Print the tag names corresponding to the predicted class indices.

    Args:
        class_map: dict mapping tag -> class index.
        y_pred: iterable of predicted class indices.

    Indices that do not occur in ``class_map`` are skipped silently.
    """
    # Invert the mapping once instead of scanning class_map for every
    # prediction; setdefault keeps the first tag if an index is repeated.
    index_to_tag = {}
    for tag, idx in class_map.items():
        index_to_tag.setdefault(idx, tag)
    tags = [index_to_tag[ind] for ind in y_pred if ind in index_to_tag]
    # Single pre-formatted argument so the output is identical under the
    # Python 2 print statement and the Python 3 print function.
    print("Predicted Tags:  %s" % (tags,))

In [39]:
def word_to_vec(model,word):
    """Look up the embedding of ``word`` in ``model.wv`` and return it."""
    embeddings = model.wv
    return embeddings[word]

def extract_data(model,class_map):
    """Build the feature matrix and label vector from the tagged Brown corpus.

    Every distinct (word, two-character tag prefix) pair contributes exactly
    one sample, in order of first appearance in ``brown.tagged_sents()``.

    Args:
        model: embedding model passed to ``word_to_vec``.
        class_map: dict mapping the two-character tag prefix -> class index.

    Returns:
        Tuple ``(X, y)``: ``X`` is a 2-D array with one embedding per column,
        ``y`` the matching integer class indices.

    Raises:
        ValueError: if no word could be embedded.
    """
    # A set gives O(1) membership tests; the very first pair is recorded as
    # well, so it is not added a second time when it reappears.
    seen_pairs = set()
    vectors = []
    labels = []
    for sent in brown.tagged_sents():
        for (word,tag) in sent:
            pair = (word,tag[:2])
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)
            try:
                x_i = word_to_vec(model,word)
            except KeyError:
                print("Ignoring OOV words...")
                print(word)
                continue
            vectors.append(x_i)
            labels.append(int(class_map[tag[:2]]))
    if not vectors:
        raise ValueError("extract_data: no in-vocabulary words found")
    # Stack once at the end instead of growing the arrays inside the loop.
    X = np.vstack(vectors).T
    y = np.array(labels,dtype=int)
    return X,y

def preprocessing(X_train,X_val,X_test):
    """Centre the three splits with the training mean and append a bias row.

    Args:
        X_train, X_val, X_test: [D, N_*] arrays with one sample per column.

    Returns:
        Tuple ``(X_train, X_val, X_test)`` of new [D+1, N_*] arrays: every
        feature has the training-set mean subtracted, and the last row is a
        constant 1 (bias). The input arrays are left untouched.
    """
    # Per-feature mean over the training samples only; the same vector is
    # applied to the validation and test splits.
    mean_image = np.mean(X_train.T, axis = 0)

    def _center_and_add_bias(X):
        # ``X.T - mean`` allocates a new array; an in-place ``-=`` on the
        # transposed view would write through to the caller's array.
        centered = X.T - mean_image
        # Add bias dimension and transform back into columns
        return np.hstack([centered, np.ones((centered.shape[0], 1))]).T

    return (_center_and_add_bias(X_train),
            _center_and_add_bias(X_val),
            _center_and_add_bias(X_test))

In [40]:
# Collect the distinct two-character tag prefixes that occur in the tagged Brown corpus.
classes = [str(tag[:2]) for sent in brown.tagged_sents() for (word,tag) in sent]
classes = set(classes)
l=0
r=len(classes)
# Map each tag prefix to an integer class index in [0, len(classes)).
# NOTE(review): indices follow the iteration order of the set; the saved pickles
# presumably rely on this same order, so confirm before changing it (e.g. to sorted()).
class_map = dict(zip(classes,xrange(l,r)))
# The string literal below is disabled code (kept as a bare string so it does not run):
# it extracts the data, splits it into train/test/validation parts, preprocesses and pickles them.
# NOTE(review): the split offsets are derived from the number of sentences (total_sent),
# while extract_data produces one column per (word, tag) pair -- confirm this is intended.
# NOTE(review): save_data is defined in a later cell than this call.
'''X,y = extract_data(model,class_map)
train_test_split=60
total_sent=len(brown.tagged_sents())
train_sent = int(train_test_split*total_sent/100)
X_train,y_train = X[0:X.shape[0],0:train_sent],y[0:train_sent]
X_test,y_test = X[0:X.shape[0],train_sent:train_sent + int((total_sent-train_sent)/2)],y[train_sent:train_sent + int((total_sent-train_sent)/2)]
X_val,y_val = X[0:X.shape[0],train_sent + int((total_sent-train_sent)/2):total_sent],y[train_sent + int((total_sent-train_sent)/2):total_sent]
X_train,X_val,X_test = preprocessing(X_train,X_val,X_test)
print 'Train data shape: ', X_train.shape
print 'Train labels shape: ', y_train.shape
print 'Validation data shape: ', X_val.shape
print 'Validation labels shape: ', y_val.shape
print 'Test data shape: ', X_test.shape
print 'Test labels shape: ', y_test.shape
data=[X_train,y_train,X_val,y_val,X_test,y_test]
save_data(script_path,data)'''

"X,y = extract_data(model,class_map)\ntrain_test_split=60\ntotal_sent=len(brown.tagged_sents())\ntrain_sent = int(train_test_split*total_sent/100)\nX_train,y_train = X[0:X.shape[0],0:train_sent],y[0:train_sent]\nX_test,y_test = X[0:X.shape[0],train_sent:train_sent + int((total_sent-train_sent)/2)],y[train_sent:train_sent + int((total_sent-train_sent)/2)]\nX_val,y_val = X[0:X.shape[0],train_sent + int((total_sent-train_sent)/2):total_sent],y[train_sent + int((total_sent-train_sent)/2):total_sent]\nX_train,X_val,X_test = preprocessing(X_train,X_val,X_test)\nprint 'Train data shape: ', X_train.shape\nprint 'Train labels shape: ', y_train.shape\nprint 'Validation data shape: ', X_val.shape\nprint 'Validation labels shape: ', y_val.shape\nprint 'Test data shape: ', X_test.shape\nprint 'Test labels shape: ', y_test.shape\ndata=[X_train,y_train,X_val,y_val,X_test,y_test]\nsave_data(script_path,data)"

In [41]:
def save_data(script_path,data):
    """Pickle the six dataset parts into ``script_path``.

    ``data`` is indexed positionally as
    [X_train, y_train, X_val, y_val, X_test, y_test]; element ``k`` is
    written to "<script_path>/<name>.pickle" for the k-th name below.
    """
    part_names = ("X_train", "y_train", "X_val", "y_val", "X_test", "y_test")
    for position, part_name in enumerate(part_names):
        target = script_path + "/" + part_name + ".pickle"
        # the context manager closes the file; no explicit close() is needed
        with open(target, 'wb') as handle:
            pickle.dump(data[position], handle)
    
def load_data(script_path):
    """Load the six pickled dataset parts written by ``save_data``.

    Args:
        script_path: directory containing the "<name>.pickle" files.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        IOError/OSError: if one of the files cannot be opened.
    """
    # SECURITY: unpickling executes arbitrary code embedded in the file; only
    # load pickles that were produced by this notebook.
    def _load_part(part_name):
        # ``with`` closes the handle even when pickle.load raises
        with open(script_path+"/"+part_name+".pickle",'rb') as fl:
            return pickle.load(fl)

    X_train = _load_part("X_train")
    y_train = _load_part("y_train")
    X_val = _load_part("X_val")
    y_val = _load_part("y_val")
    X_test = _load_part("X_test")
    y_test = _load_part("y_test")
    return X_train,y_train,X_val,y_val,X_test,y_test

def save_model(script_path,model_name,model):
    """Pickle ``model`` to "<script_path>/<model_name>.pickle"."""
    target = "/".join([script_path, model_name + ".pickle"])
    # the context manager closes the file on exit
    with open(target, 'wb') as handle:
        pickle.dump(model, handle)
def load_model(script_path,model_name):
    """Load the object pickled at "<script_path>/<model_name>.pickle".

    Raises:
        IOError/OSError: if the file cannot be opened.
    """
    # SECURITY: unpickling executes arbitrary code embedded in the file; only
    # load pickles that were produced by this notebook.
    # ``with`` closes the handle even when pickle.load raises.
    with open(script_path+"/"+model_name+".pickle",'rb') as fl:
        return pickle.load(fl)

In [42]:
# Restore the previously pickled train/validation/test splits from script_path
# (instead of re-running the extraction) and report their shapes.
X_train,y_train,X_val,y_val,X_test,y_test = load_data(script_path)
print 'Train data shape: ', X_train.shape
print 'Train labels shape: ', y_train.shape
print 'Validation data shape: ', X_val.shape
print 'Validation labels shape: ', y_val.shape
print 'Test data shape: ', X_test.shape
print 'Test labels shape: ', y_test.shape

Train data shape:  (301, 34404)
Train labels shape:  (34404,)
Validation data shape:  (301, 11468)
Validation labels shape:  (11468,)
Test data shape:  (301, 11468)
Test labels shape:  (11468,)


In [43]:
def test_with_softmax(X_test,y_test,best_softmax):
    y_test_predict_result = best_softmax.predict(X_test,softmax=True)
    y_test_predict = y_test_predict_result[0]
    test_accuracy = np.mean(y_test == y_test_predict)
    print 'The test accuracy is: %f' % test_accuracy

In [44]:
def tuning_hyperparams(X_train,y_train,X_val,y_val,learning_rates=[1e-6, 1e-8],regularization_strengths=[1e3, 1e5],interval=5,epochs=1000):
    """Grid-search learning rate and regularization strength for the softmax model.

    An ``interval`` x ``interval`` grid is built with ``np.linspace`` between
    the two end points of each range; one softmax model is trained per grid
    point and scored by accuracy on the validation set.

    Args:
        X_train, y_train: training samples ([D, N]) and labels.
        X_val, y_val: validation samples ([D, M]) and labels.
        learning_rates: two end points of the learning-rate range.
        regularization_strengths: two end points of the regularization range.
        interval: number of grid values per hyper-parameter.
        epochs: gradient steps per trained model.

    Returns:
        Tuple ``(best_eta, best_lambda, best_softmax_model)`` for the grid
        point with the highest validation accuracy (the first one wins ties).
        The model is None when the grid is empty.
    """
    best_eta=0
    best_lambda=0
    best_val = -1
    best_softmax_model = None
    eta_grid = np.linspace(learning_rates[0], learning_rates[1], num=interval)
    lambda_grid = np.linspace(regularization_strengths[0], regularization_strengths[1], num=interval)
    # Choose the best hyperparameters by tuning on the validation set
    for step, learning_rate in enumerate(eta_grid, 1):
        # %g rather than %f: rates around 1e-6..1e-8 are printed as
        # 0.000001 / 0.000000 by %f, which hides the actual value
        print('The current iteration is %d/%d %g' % (step, interval,learning_rate))
        for reg in lambda_grid:
            print('Regularization strength at this point ... %f' % (reg))
            sftmax = logistic_regression()
            sftmax.train(X_train, y_train,learning_rate=learning_rate,reg = reg, num_iters=epochs,softmax=True)
            y_val_pred = sftmax.predict(X_val,softmax=True)[0]
            val_accuracy = np.mean(y_val == y_val_pred)
            if val_accuracy > best_val:
                best_val = val_accuracy
                best_eta = learning_rate
                best_lambda = reg
                best_softmax_model = sftmax
    return best_eta,best_lambda,best_softmax_model


In [45]:
# Disabled code (kept as a bare string literal so it does not run): the hyper-parameter
# search over learning rate and regularization strength via tuning_hyperparams.
'''learning_rates=[1e-6, 1e-8]
regularization_strengths=[1e3, 1e5]
num_iters=1000
eta,lambda_,soft_model = tuning_hyperparams(X_train,y_train,X_val,y_val,learning_rates=learning_rates,regularization_strengths=regularization_strengths,epochs=num_iters)'''

'learning_rates=[1e-6, 1e-8]\nregularization_strengths=[1e3, 1e5]\nnum_iters=1000\neta,lambda_,soft_model = tuning_hyperparams(X_train,y_train,X_val,y_val,learning_rates=learning_rates,regularization_strengths=regularization_strengths,epochs=num_iters)'

In [46]:
# Hard-coded hyper-parameters; presumably the values found by an earlier run of
# tuning_hyperparams (that search is disabled in this notebook) -- confirm.
eta=2.5749999999999986e-07
lambda_ = 50500.0
#print "Best learning_rate: ",eta
#print "Best regularization_rate: ",lambda_
#print "Saving in :",script_path        
#save_model(script_path,"softmax",soft_model)    
# Load the pickled softmax model from script_path and report its test accuracy.
soft_model = load_model(script_path,"softmax")
test_with_softmax(X_test,y_test,soft_model)

The test accuracy is: 0.395274


In [47]:
# Number of distinct tag prefixes; `classes` is the set built in an earlier cell.
num_classes = len(classes)
#logistic_classifiers = train_one_vs_all(X_train,y_train,learning_rate=eta,reg=lambda_,num_iters=1000)
#print "Saving in :",script_path
#save_model(script_path,"logis",logistic_classifiers)
# Load the pickled one-vs-all classifiers (training is commented out above)
# and report their accuracy on the test split.
logistic_classifiers = load_model(script_path,"logis")
pred_test_one_vs_all = predict_one_vs_all(logistic_classifiers, X_test, num_classes)
print 'Test datast accuracy: %f' % (np.mean(y_test == pred_test_one_vs_all))

Test datast accuracy: 0.395535


In [48]:
# Demo: tag one whitespace-tokenised sentence with both models.
# `model` is the fastText embedding model loaded in the first cell.
input_sent = "My name is Khan and I am not a terrorist"
input_sent = input_sent.split()
print "Predicting using Softmax..."
# any model_name other than "logi" selects the softmax branch of pred_tag_sequence
pred_tag_sequence(input_sent,model,soft_model,class_map,model_name="softmax")
print "Predicting using OVA LR..."
pred_tag_sequence(input_sent,model,logistic_classifiers,class_map,model_name="logi")
    
    

Predicting using Softmax...
Predicted Tags:  ['NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN']
Predicting using OVA LR...
Predicted Tags:  ['NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN']
