# Extract patterns from Distributional Memory 

In [4]:
from base import extract_wp
import numpy as np
#extract word pairs from training and testing dataset
word_pairs_dir_test = 'SemEval-2012-Complete-Data-Package/Testing/Phase1Answers/'
wp_test = extract_wp(word_pairs_dir_test)
word_pairs_dir_train = 'SemEval-2012-Complete-Data-Package/Training/Phase1Answers/'
wp_train = extract_wp(word_pairs_dir_train)
wp = wp_test + wp_train

3297 2860 437


In [2]:
from datetime import datetime
print 'Start: '+ str(datetime.now())
#extract links from DM
dm = open('typedm.txt', 'r')
dm_filter = open('typedm_filter.txt', 'w')
i = 0
for line in dm:
    line_split = line.split('\t')
    w1 = line_split[0][:len(line_split[0])-2]
    w2 = line_split[2][:len(line_split[2])-2]
    pair1 = [w1,w2]
    #not needed: inverse link constraints
    pair2 = [w2,w1]
    if pair1 in wp or pair2 in wp:
        dm_filter.write(w1 + '\t' + line_split[1] + '\t'+ w2 + '\t' + line_split[3])
    i += 1
dm.close()
dm_filter.close()
print 'End: '+ str(datetime.now())

Start: 2017-02-03 10:14:08.652871
End: 2017-02-03 17:50:02.032837


In [6]:
#filter out empty word pairs
wp_filter = []
for p in wp:
    if len(p) == 2:
        w1 = p[0]
        w2 = p[1]
        p = '"'+ w1+ ':'+ w2 + '"'
        wp_filter.append(p)
wp = wp_filter

In [7]:
#remove repetitions in the word pairs
wp = set(wp)
print len(wp)

3078


In [17]:
#store only entries in DM with word pairs in the dataset
dm_filter = open('typedm_filter.txt', 'r')
patterns = set()
pairs_dm = set()

#store patterns and pairs in DM 
for line in dm_filter:
    sp = line.split('\t')
    pattern = sp[1]
    patterns.add(pattern)
    w1 = sp[0]
    w2 = sp[2]
    pair = '"'+ w1+ ':'+ w2 + '"'
    pairs_dm.add(pair)
pairs_dm = list(pairs_dm.intersection(wp))
wp = list(wp)
patterns = list(patterns)
dm_filter.close()
print 'Pairs not in DM:' + str(len(wp) - len(pairs_dm))
#dictionary with word pairs and patterns indeces in the matrix: key: string, value:index
pair2index = dict(zip(pairs_dm, xrange(len(wp))))

i = 0
pat2index = dict(zip(patterns, xrange(len(patterns))))
dm_filter = open('typedm_filter.txt', 'r')

#matrix with feature vectors (rows: pairs in DM, cols: patterns in DM)
feature_vectors = np.zeros(shape=(len(pairs_dm),len(patterns)))
for line in dm_filter:
    sp = line.split('\t')
    w1 = sp[0]
    w2 = sp[2]
    pair = '"'+ w1+ ':'+ w2 + '"'
    if pair in pairs_dm:
        row = pair2index[pair]
        col = pat2index[sp[1]]
        feature_vectors[row,col] = float(sp[3])
print feature_vectors.shape, len(wp)

Pairs not in DM:1104
(1974, 1440) 3078


## Saving objects in pickle format

In [18]:
import pickle
#store feature vectors in pickle file and dictionaries for rows and cols

pickle.dump(feature_vectors, open('dm_feature_matrix.pkl','wb'))
pickle.dump(pair2index, open('dm_pair2index.pkl', 'wb'))
pickle.dump(pat2index, open('dm_pat2index.pkl', 'wb'))

# Logit model

In [6]:
#Classifiers
import pickle
from classifier import extract_labels, logreg_classify_pairs

feature_vectors = pickle.load(open('dm_feature_matrix.pkl', 'rb'))
pair2index = pickle.load(open('dm_pair2index.pkl', 'rb'))
pat2index = pickle.load(open('dm_pat2index.pkl', 'rb'))

semrel_folder_train = 'SemEval-2012-Complete-Data-Package/Training/Phase1Answers'
semrel_folder_test = 'SemEval-2012-Complete-Data-Package/Testing/Phase1Answers'

#store labels for each sem.rel./classifier

y_train= extract_labels(semrel_folder_train, pair2index)[0]
y_test = extract_labels(semrel_folder_test, pair2index)[0]

In [7]:
logreg_classify_pairs(semrel_folder_train, feature_vectors, pair2index, pat2index)

Semantic relation: 7a
0.613636363636
Semantic relation: 10a
0.860465116279
Semantic relation: 1a
0.90243902439
Semantic relation: 5i
0.681818181818
Semantic relation: 2c
0.627906976744
Semantic relation: 3c
0.317073170732
Semantic relation: 5d
0.295454545455
Semantic relation: 3a
0.604651162791
Semantic relation: 2h
0.439024390244
Semantic relation: 4c
0.627906976744


In [8]:
logreg_classify_pairs(semrel_folder_test, feature_vectors, pair2index, pat2index)

Semantic relation: 9i
0.846153846154
Semantic relation: 7f
0.894736842105
Semantic relation: 7e
0.864864864865
Semantic relation: 7d
0.931818181818
Semantic relation: 7c
0.641025641026
Semantic relation: 7b
0.941176470588
Semantic relation: 9h
0.790697674419
Semantic relation: 9a
0.666666666667
Semantic relation: 10c
0.90243902439
Semantic relation: 9c
0.62962962963
Semantic relation: 9b
0.738095238095
Semantic relation: 9e
0.860465116279
Semantic relation: 8h
0.720930232558
Semantic relation: 10d
0.853658536585
Semantic relation: 9f
0.775
Semantic relation: 1d
0.116279069767
Semantic relation: 5b
0.225
Semantic relation: 4h
0.744186046512
Semantic relation: 8e
0.85
Semantic relation: 7g
0.794117647059
Semantic relation: 10b
0.725
Semantic relation: 1c
0.97619047619
Semantic relation: 3h
0.860465116279
Semantic relation: 1e
0.212121212121
Semantic relation: 5h
0.533333333333
Semantic relation: 5c
0.80487804878
Semantic relation: 5e
0.95
Semantic relation: 3b
0.790697674419
Semantic rel

# Store data in .csv format

In [9]:
#get data in csv format
import csv
from os import stat, mkdir, path


train_dir = 'dm_training_csv'
try:
    stat(train_dir)
except:
    mkdir(train_dir)
for rel in y_train:
    with open(path.join(train_dir, 'dm_featurevectors_'+rel+'_train.csv'), 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for p in pair2index:
            vector = []
            for weight in feature_vectors[pair2index[p],]:
                vector.append(str(weight))
            label = y_train[rel][pair2index[p]]
            writer.writerow([p] +  vector + [str(label)])
            
test_dir = 'dm_testing_csv'
try:
    stat(test_dir)
except:
    mkdir(test_dir)   
for rel in y_test:
    with open(path.join(test_dir, 'dm_featurevectors_'+rel+'_test.csv'), 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter= ',')
        for p in pair2index:
            vector = []
            for weight in feature_vectors[pair2index[p],]:
                vector.append(str(weight))
            label = y_test[rel][pair2index[p]]
            writer.writerow([p] +  vector + [str(label)])