Deep Learning Programming Assignment 2
--------------------------------------
Name:Vishnu Dutt Sharma
Roll No.: 12EC35018

Submission Instructions:
1. Fill your name and roll no in the space provided above.
2. Name your folder in format <Roll No>_<First Name>.
    For example 12CS10001_Rohan
3. Submit a zipped format of the file (.zip only).
4. Submit all your codes. But do not submit any of your datafiles
5. From output files submit only the following 3 files. simOutput.csv, simSummary.csv, analogySolution.csv
6. Place the three files in a folder "output", inside the zip.

In [2]:
import gzip
import os
import pickle
import numpy as np
import random

from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold

## paths to files. Do not change this
simInputFile = "Q1/word-similarity-dataset"
analogyInputFile = "Q1/word-analogy-dataset"
vectorgzipFile = "Q1/glove.6B.300d.txt.gz"
vectorTxtFile = "Q1/glove.6B.300d.txt"   # If you extract and use the gz file, use this.
analogyTrainPath = "Q1/wordRep/"
simOutputFile = "Q1/simOutput.csv"
simSummaryFile = "Q1/simSummary.csv"
anaSOln = "Q1/analogySolution.csv"
Q4List = "Q4/wordList.csv"




In [4]:
# Similarity Dataset
simDataset = [item.split(" | ") for item in open(simInputFile).read().splitlines()]
# Analogy dataset
analogyDataset = [[stuff.strip() for stuff in item.strip('\n').split('\n')] for item in open(analogyInputFile).read().split('\n\n')]

def vectorExtract(simD = simDataset, anaD = analogyDataset, vect = vectorgzipFile):
    simList = [stuff for item in simD for stuff in item]
    analogyList = [thing for item in anaD for stuff in item[0:6] for thing in stuff.split()]
    simList.extend(analogyList)
    wordList = set(simList)
    print len(wordList)
    wordDict = dict()
    
    vectorFile = gzip.open(vect, 'r')
    for line in vectorFile:
        if line.split()[0].strip() in wordList:
            wordDict[line.split()[0].strip()] = line.split()[1:]
    
    
    vectorFile.close()
    print 'retrieved', len(wordDict.keys())
    return wordDict

# Extracting Vectors from Analogy and Similarity Dataset
validateVectors = vectorExtract()
# print validateVectors

1221
retrieved 1214


In [7]:
# Dictionary of training pairs for the analogy task
trainDict = dict()
for subDirs in os.listdir(analogyTrainPath):
    for files in os.listdir(analogyTrainPath+subDirs+'/'):
        f = open(analogyTrainPath+subDirs+'/'+files).read().splitlines()
        trainDict[files] = f
print len(trainDict.keys())

19


In [4]:
word_vec = dict()

with gzip.open(vectorgzipFile, 'r') as fl:
    for lines in fl:
        vec = lines.split(' ')
        key = (vec[0].strip()).lower()
        val = [float(x) for x in arr[1:]]
        
        word_vec[key] = val
    
fl.close()

In [10]:
def similarityTask(inputDS = simDataset, outputFile = simOutputFile, summaryFile=simSummaryFile, vectors=validateVectors):
    corr_cos = 0
    corr_euc = 0
    corr_man = 0
    mrr_cos = []
    mrr_euc = []
    mrr_man = []
    
    index = 0
    
    sim_fl = open(simOutputFile, 'w')
    for line in simDataset:
        query = np.asarray(validateVectors[line[0]], dtype=float)
                
        flag = 0
        
        cos_vec = []
        euc_vec = []
        man_vec = []
        
        for word in line[1:]:
            try:
                opt_vec = np.asarray(validateVectors[word], dtype=float)
            except:
                flag = 1
                break
                
            cos = np.inner(query, opt_vec)/(np.linalg.norm(query) * np.linalg.norm(opt_vec))
            sim_fl.write("%d,%s,%s,C,%f\n" % (index+1, line[0], word, cos))
            
            euc = np.linalg.norm(query - opt_vec)
            sim_fl.write("%d,%s,%s,E,%f\n" % (index+1, line[0], word, euc))
            
            man = sum(abs(query - opt_vec))
            sim_fl.write("%d,%s,%s,M,%f\n" % (index+1, line[0], word, man))
            
            cos_vec += [cos]
            euc_vec += [euc]
            man_vec += [man]
            
        if flag:
            flag = 0
            continue
        
        index = index + 1
        
        corr = np.argmax(cos_vec)
        if( corr == 0):
            corr_cos += 1
        
        corr = np.argmin(euc_vec)
        if( corr == 0):
            corr_euc += 1
        
        corr = np.argmin(man_vec)
        if( corr == 0):
            corr_man += 1
        
        mrr_cos.append(1.0/(1+np.argsort(cos_vec)[3]))
        mrr_euc.append(1.0/(1+np.argsort(euc_vec)[0]))
        mrr_man.append(1.0/(1+np.argsort(man_vec)[0]))

    sim_fl.close()
    
    mrr_C = sum(mrr_cos)/len(mrr_cos)
    mrr_E = sum(mrr_euc)/len(mrr_euc)
    mrr_M = sum(mrr_man)/len(mrr_man)
    
    sum_fl = open(simSummaryFile, 'w')
    sum_fl.write("C,%d,%d,%f\n" % (corr_cos, len(mrr_cos), mrr_C))
    sum_fl.write("E,%d,%d,%f\n" % (corr_euc, len(mrr_cos), mrr_E))
    sum_fl.write("M,%d,%d,%f\n" % (corr_man, len(mrr_cos), mrr_M))
    sum_fl.close()


# similarityTask()

23 17 17
0.770833333333 0.671296296296 0.662037037037


In [None]:
def analogyTask(inputDS=analogyDataset,outputFile = anaSOln ): # add more arguments if required
    train_X = []
    train_Y = []

    hid_size = 300

    keys = trainDict.keys()

    i = 0
    
    while(i < 5000):
        try:
            index = np.random.randint(19, size = 2)
            
            word_1 = random.choice(trainDict[keys[index[0]]]).split('\t')
            word_2 = random.choice(trainDict[keys[index[1]]]).split('\t')
            
            train_Y += [0]
            train_X += [word2vec[word_1[0].lower()]+ word2vec[word_1[1].lower()]+ word2vec[word_2[0].lower()]+ word2vec[word_2[1].lower()]]
            
            index = np.random.randint(13)
            
            word_1 = random.choice(trainDict[keys[index]]).split('\t')
            word_2 = random.choice(trainDict[keys[index]]).split('\t')
            
            train_Y += [1]
            train_X += [word2vec[word_1[0].lower()] + word2vec[word_1[1].lower()] + word2vec[word_2[0].lower()] + word2vec[word_2[1].lower()]]
            
            i += 1
        
        except:
            continue

            
    accuracy = 0.0

    kf = KFold(n_splits=5)
    epoch = 0
    for train_index, test_index in kf.split(train_X):
        X_train, X_test = np.asarray(train_X)[train_index], np.asarray(train_X)[test_index]
        Y_train, Y_test = np.asarray(train_Y)[train_index], np.asarray(train_Y)[test_index]

        nn = MLPClassifier(solver='lbfgs', alpha=2e-5, hidden_layer_sizes=(hid_size, 1), random_state=1)
        nn.fit(X_train, Y_train)
        
        acc = clf.score(X_test, Y_test)
        
        print 'Epoch: ', epoch+1, ', Acccuracy: ', accuracy, '%'
        
        accuracy += acc
        epoch += 1

    acc = acc/5.0
    print acc


    
    ana_fl = open(anaSOln, 'w')

    inco = 0
    corr = 0
    flag = 0


    for line in analogyDataset:
        ques = (line[0]).split(' ')
        
        X_vec = []
        Y_vec = []

        for ind in range(1,6):
            option = line[ind].split(' ')
            try:
                X_vec += [word2vec[query[0].lower()] + word2vec[query[1].lower()] + word2vec[option[0].lower()] + word2vec[option[1].lower()]]
            except:
                flag = 1

        if flag:
            flag = 0
            continue

        prob = nn.predict_proba(test_X)[1]
        y_hat = np.argmax(prob)
        y = ord(line[-1])-97

        anaOutFile.write(item[0] + ',' + item[-1] + ',' + chr(output+97) + '\n')
        
        if (y_hat == y):
                corr += 1
        else:
                inco += 1

    anaOutFile.close()        

    return float(corr)/(inco+corr)

# analogyTask()

In [56]:
def derivedWordTask(inputFile = Q4List):
#     print 'hello world'
    
    
    """
    Output vectors of 3 files:
    1)AnsFastText.txt - fastText vectors of derived words in wordList.csv
    2)AnsLzaridou.txt - Lazaridou vectors of the derived words in wordList.csv
    3)AnsModel.txt - Vectors for derived words as provided by the model
    
    For all the three files, each line should contain a derived word and its vector, exactly like 
    the format followed in "glove.6B.300d.txt"
    
    word<space>dim1<space>dim2........<space>dimN
    charitably 256.238 0.875 ...... 1.234
    
    """
    
    """
    The function should return 2 values
    1) Averaged cosine similarity between the corresponding words from output files 1 and 3, as well as 2 and 3.
    
        - if there are 3 derived words in wordList.csv, say word1, word2, word3
        then find the cosine similiryt between word1 in AnsFastText.txt and word1 in AnsModel.txt.
        - Repeat the same for word2 and word3.
        - Average the 3 cosine similarity values
        - DO the same for word1 to word3 between the files AnsLzaridou.txt and AnsModel.txt 
        and average the cosine simialities for valuse so obtained
        
    """
    

    fasttext_dict = dict()
    with open('Q4/fastText_vectors.txt', 'r') as f:
        for line in f:

            arr = line.strip().split(' ')
            key = (arr[0].strip()).lower()
            val = [float(x) for x in arr[1:]]

            fasttext_dict[key] = val

    lazaridou_dict = dict()
    with open('Q4/vector_lazaridou.txt', 'r') as f:
        for line in f:

            arr = line.strip().split('[')
            key = (arr[0].strip()).lower()
            arr = (arr[1].strip()[:-1]).split(',')
            val = [float(x) for x in arr[1:]]

            lazaridou_dict[key] = val

    index = []
    affix = []
    derived = []
    root = []

    csvfile = open(Q4List, 'rb')
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    header = spamreader.next()
    for row in spamreader:
        index.append(int(row[0]))
        affix.append(row[1])
        derived.append(row[2])
        root.append(row[3])
    csvfile.close()

    name = list(set(affix))
    aff_dict = dict(zip(name, range(len(name))))

    h_size  = 300

    train_X_raw_ft = []
    train_Y_raw_ft = []
    train_X_ft = []
    train_Y_ft = []
    for i in range(len(root)):
        try:
            zer = [0]*len(name)
            zer[aff_dict[affix[i]]] = 1
            train_X_raw_ft += [[1]+ fasttext_dict[root[i]] + zer]
            train_Y_raw_ft += [fasttext_dict[derived[i]]]
        except:
            continue

    index_shuf = range(len(train_Y_raw_ft))
    random.shuffle(index_shuf)
    for i in index_shuf:
        train_X_ft.append(train_X_raw_ft[i])
        train_Y_ft.append(train_Y_raw_ft[i])

    train_X_raw_lz = []
    train_Y_raw_lz = []
    train_X_lz = []
    train_Y_lz = []                      
    for i in range(len(root)):
        try:
            zer = [0]*len(name)
            zer[aff_dict[affix[i]]] = 1
            train_X_raw_lz += [[1]+ lazaridou_dict[root[i]] + zer]
            train_Y_raw_lz += [lazaridou_dict[derived[i]]]
        except:
            continue

    index_shuf = range(len(train_Y_raw_lz))
    random.shuffle(index_shuf)
    for i in index_shuf:
        train_X_lz.append(train_X_raw_lz[i])
        train_Y_lz.append(train_Y_raw_lz[i])

    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(train_X_ft):
    #         print len(train_index)
    #         print len(test_index)
            X_train, X_test = np.asarray(train_X_ft)[train_index], np.asarray(train_X_ft)[test_index]
            Y_train, Y_test = np.asarray(train_Y_ft)[train_index], np.asarray(train_Y_ft)[test_index]
    #         print X_train[1:10]
    #         print X_test[1:10]


            mlp_ft = MLPRegressor(solver='lbfgs', hidden_layer_sizes=300, max_iter=150, shuffle=True, random_state=1, activation='relu')
            mlp_ft.fit(X_train, Y_train)
            score = mlp_ft.score(X_test, Y_test)
#             print 'Score ft: ',score


    for train_index, test_index in kf.split(train_X_lz):
    #         print len(train_index)
    #         print len(test_index)
            X_train, X_test = np.asarray(train_X_lz)[train_index], np.asarray(train_X_lz)[test_index]
            Y_train, Y_test = np.asarray(train_Y_lz)[train_index], np.asarray(train_Y_lz)[test_index]
    #         print X_train[1:10]
    #         print X_test[1:10]


            mlp_lz = MLPRegressor(solver='lbfgs', hidden_layer_sizes=450, max_iter=150, shuffle=True, random_state=1, activation='relu')
            mlp_lz.fit(X_train, Y_train)
            score = mlp_lz.score(X_test, Y_test)
#             print 'Score lz: ',score

    #         lz_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(h_size, 1), random_state=1)
    #         lz_clf.fit(X_train, Y_train)



    ft_file = open('Q4/AnsFastText.txt', 'w')
    lz_file = open('Q4/AnsLzaridou.txt', 'w')
    mod_ft_file = open('Q4/AnsModel.txt', 'w')
#     mod_lz_file = open('Q4/AnsModelLZ.txt', 'w')

    ft_c = 0.0
    lz_c = 0.0
    tot = 0
    for i in range(len(root)):
        ft_file.write(derived[i])
        out_1_ft = fasttext_dict[derived[i]]
        for k in out_1_ft:
            ft_file.write(' '+str(k))
        ft_file.write('\n')

        lz_file.write(derived[i])
        out_1_lz = lazaridou_dict[derived[i]]
        for k in out_1_lz:
            lz_file.write(' '+str(k))
        lz_file.write('\n')

        zer = [0]*len(name)
        zer[aff_dict[affix[i]]] = 1
        train_X = [[1]+ fasttext_dict[root[i]] + zer]

        mod_ft_file.write(derived[i])

        out_2_ft = mlp_ft.predict(train_X)

        for k in out_2_ft:
            mod_ft_file.write(' '+str(k))
        mod_ft_file.write('\n')

        zer = [0]*len(name)
        zer[aff_dict[affix[i]]] = 1
        train_X = [[1]+ lazaridou_dict[root[i]] + zer]

#         mod_lz_file.write(derived[i])

        out_2_lz = mlp_lz.predict(train_X)
#         for k in out_2_lz:
#             mod_lz_file.write(' '+str(k))
#         mod_lz_file.write('\n')

        cos_ft = np.inner(np.asarray(out_1_ft), out_2_ft)/(np.linalg.norm(np.asarray(out_1_ft)) * np.linalg.norm(out_2_ft))
        cos_lz = np.inner(np.asarray(out_1_lz), out_2_lz)/(np.linalg.norm(np.asarray(out_1_lz)) * np.linalg.norm(out_2_lz))

        ft_c += cos_ft
        lz_c += cos_lz
        tot += 1


    ft_file.close()
    lz_file.close()
    mod_ft_file.close()
#     mod_lz_file.close()

    cosVal1 = ft_c/tot
    cosVal2 = lz_c/tot

    return cosVal1, cosVal2
 

In [57]:
def main():
    similarityTask()
    anaSim = analogyTask()
    derCos1,derCos2 = derivedWordTask()
    print anaSim
    print derCos1, derCos2
    
if __name__ == '__main__':
    main()


[ 0.85742137] [ 0.32482805]


In [58]:

# word2vec = pickle.load(open('Q1/word2vec.p', 'rb'))

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000


In [30]:
# train_X_raw = []
# train_Y_raw = []
# train_X = []
# train_Y = []

# keys = trainDict.keys()


# for i in range(20000):
#     try:
#         arr = np.random.randint(13, size = 2)
#         word_1 = random.choice(trainDict[keys[arr[0]]]).split('\t')
#         word_2 = random.choice(trainDict[keys[arr[1]]]).split('\t')
#         train_X_raw += [[1]+ word2vec[word_1[0].lower()]+ word2vec[word_1[1].lower()]+ word2vec[word_2[0].lower()]+ word2vec[word_2[1].lower()]]
#         train_Y_raw += [0]
#     except:
#         continue

# for i in range(20000):
#     try:
#         arr = np.random.randint(13)
#         word_1 = random.choice(trainDict[keys[arr]]).split('\t')
#         word_2 = random.choice(trainDict[keys[arr]]).split('\t')
#         train_X_raw += [[1] + word2vec[word_1[0].lower()] + word2vec[word_1[1].lower()] + word2vec[word_2[0].lower()] + word2vec[word_2[1].lower()]]
#         train_Y_raw += [1]
#     except:
#         continue
        
# index_shuf = range(len(train_Y_raw))
# random.shuffle(index_shuf)
# for i in index_shuf:
#     train_X.append(train_X_raw[i])
#     train_Y.append(train_Y_raw[i])

In [31]:
# print len(train_X)
# print len(train_Y)
# print train_Y[1:100]

In [32]:
# # Layer's sizes
# x_size = 4 * 300 + 1   # Number of input nodes: 4 features and 1 bias

# h_size = 300                # Number of hidden nodes
# y_size = 1   # Number of outcomes (3 iris flowers)

# train_size = len(train_Y)

# clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(h_size, 1), random_state=1)

# clf.fit(train_X, train_Y)

In [33]:
# neg = 0
# pos = 0
# flag = 0
# for item in analogyDataset:
#     query = item[0].split(' ')
#     test_X = []
#     test_Y = []
    
    
#     for num in range(1,6):
#         option = item[num].split(' ')
#         try:
#             test_X += [[1] + word2vec[query[0].lower()] + word2vec[query[1].lower()] + word2vec[option[0].lower()] + word2vec[option[1].lower()]]
#         except:
#             test_X += [[0.0]*(4*300+1)]
    
#     output = np.argmax(clf.predict_proba(test_X)[1])
#     test_Y = ord(item[-1])-97
    
# #     print output
# #     print test_Y
    
#     if (output == test_Y):
#             neg += 1
#     else:
#             pos += 1

            
# print neg
# print pos
# print float(pos)/(pos+neg)

In [34]:
# a = np.asarray([0.2, 0.5, 0.1, 0.3])
# b = np.argsort(a)
# print b
# print np.sort(a)
# print 1+np.where(b==0)[0][0]
# print 1+np.where(b==1)[0][0]
# print 1+np.where(b==2)[0][0]
# print 1+np.where(b==3)[0][0]