-
Notifications
You must be signed in to change notification settings - Fork 0
/
q4_sentiment.py
120 lines (102 loc) · 3.78 KB
/
q4_sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import numpy as np
import matplotlib.pyplot as plt
from big_data.data_utils import *
from q3_sgd import load_saved_params, sgd
from q4_softmaxreg import softmaxRegression, getSentenceFeature, accuracy, softmax_wrapper
# Try different regularizations and pick the best!
# NOTE: fill in one more "your code here" below before running!
REGULARIZATION = None # Assign a list of floats in the block below
### YOUR CODE HERE
REGULARIZATION = [0.0, 0.0000001, 0.000001, 0.00001, 0.00007, 0.00009, 0.000095,0.000098,0.0001,0.00012, 0.00015, 0.0002, 0.0003,0.0007, 0.001, 0.0013,0.01, 0.03,0.07,0.1]
### END YOUR CODE
# Load the dataset
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)
# Load the word vectors we trained earlier
_, wordVectors0, _ = load_saved_params()
wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:])
dimVectors = wordVectors.shape[1]
# Load the train set
trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros((nTrain, dimVectors))
trainLabels = np.zeros((nTrain,), dtype=np.int32)
for i in xrange(nTrain):
words, trainLabels[i] = trainset[i]
trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)
# Prepare dev set features
devset = dataset.getDevSentences()
nDev = len(devset)
devFeatures = np.zeros((nDev, dimVectors))
devLabels = np.zeros((nDev,), dtype=np.int32)
for i in xrange(nDev):
words, devLabels[i] = devset[i]
devFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)
# Try our regularization parameters
results = []
for regularization in REGULARIZATION:
random.seed(3141)
np.random.seed(59265)
weights = np.random.randn(dimVectors, 5)
print "Training for reg=%f" % regularization
# We will do batch optimization
weights = sgd(lambda weights: softmax_wrapper(trainFeatures, trainLabels,
weights, regularization), weights, 3.0, 10000, PRINT_EVERY=100)
# Test on train set
_, _, pred = softmaxRegression(trainFeatures, trainLabels, weights)
trainAccuracy = accuracy(trainLabels, pred)
print "Train accuracy (%%): %f" % trainAccuracy
# Test on dev set
_, _, pred = softmaxRegression(devFeatures, devLabels, weights)
devAccuracy = accuracy(devLabels, pred)
print "Dev accuracy (%%): %f" % devAccuracy
# Save the results and weights
results.append({
"reg" : regularization,
"weights" : weights,
"train" : trainAccuracy,
"dev" : devAccuracy})
# Print the accuracies
print ""
print "=== Recap ==="
print "Reg\t\tTrain\t\tDev"
for result in results:
print "%E\t%f\t%f" % (
result["reg"],
result["train"],
result["dev"])
print ""
# Pick the best regularization parameters
BEST_REGULARIZATION = None
BEST_WEIGHTS = None
### YOUR CODE HERE
print("start")
best = 0
for result in results:
if result["dev"] > best:
BEST_REGULARIZATION = result["reg"]
BEST_WEIGHTS = result["weights"];
best = result["dev"];
print("complete")
### END YOUR CODE
# Test your findings on the test set
testset = dataset.getTestSentences()
nTest = len(testset)
testFeatures = np.zeros((nTest, dimVectors))
testLabels = np.zeros((nTest,), dtype=np.int32)
for i in xrange(nTest):
words, testLabels[i] = testset[i]
testFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)
_, _, pred = softmaxRegression(testFeatures, testLabels, BEST_WEIGHTS)
print "Best regularization value: %E" % BEST_REGULARIZATION
print "Test accuracy (%%): %f" % accuracy(testLabels, pred)
# Make a plot of regularization vs accuracy
plt.plot(REGULARIZATION, [x["train"] for x in results])
plt.plot(REGULARIZATION, [x["dev"] for x in results])
plt.xscale('log')
plt.xlabel("regularization")
plt.ylabel("accuracy")
plt.legend(['train', 'dev'], loc='upper left')
plt.savefig("q4_reg_v_acc.png")
plt.show()