In [1]:
import codecs
import random
from nltk.stem.porter import PorterStemmer as PS
from collections import Counter
import numpy as np

In [2]:
fname_pos = 'rt-polaritydata/rt-polaritydata/rt-polarity.pos'
fname_neg = 'rt-polaritydata/rt-polaritydata/rt-polarity.neg'
fname_smt = 'sentiment.txt'
fencoding='cp1252'

In [3]:
# 71

In [4]:
result=[]

In [5]:
with codecs.open(fname_pos,'r',fencoding) as file_pos:
    result.extend(['+1 {}'.format(line.strip()) for line in file_pos])

with codecs.open(fname_neg,'r', fencoding) as file_neg:
    result.extend(['-1 {}'.format(line.strip()) for line in file_neg])
    

In [6]:
random.shuffle(result)

In [7]:
with codecs.open(fname_smt,'w',fencoding) as file_out:
    print(*result,sep='\n',file=file_out)

In [8]:
cnt_pos=0
cnt_neg=0

In [9]:
with codecs.open(fname_smt,'r',fencoding) as file_out:
    for line in file_out:
        if line.startswith('+1'):
            cnt_pos+=1
        elif line.startswith('-1'):
            cnt_neg+=1
print("pos:{}, neg:{}".format(cnt_pos,cnt_neg))

pos:5331, neg:5331


# 71

In [10]:
stop_words = (
    'a,able,about,across,after,all,almost,also,am,among,an,and,any,are,'
    'as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,'
    'either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,'
    'him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,'
    'likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,'
    'on,only,or,other,our,own,rather,said,say,says,she,should,since,so,'
    'some,than,that,the,their,them,then,there,these,they,this,tis,to,too,'
    'twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,'
    'will,with,would,yet,you,your').lower().split(',')

In [11]:
def is_stopword(str):
    '''
    文字がストップワードかどうかをboolで返す
    '''
    return str.lower() in stop_words

In [12]:
# 正しく検出されることのテスト
assert is_stopword('a')             # リストの先頭
assert is_stopword('your')          # リストの末尾
assert is_stopword('often')         # リストの中間
assert is_stopword('on')            # リストの中間
assert is_stopword('A')             # 大小文字の同一視
assert is_stopword('Your')          # 大小文字の同一視
assert is_stopword('ofteN')         # 大小文字の同一視
assert is_stopword('ON')            # 大小文字の同一視

# 誤検出されないことのテスト
assert not is_stopword('0')         # リストにない
assert not is_stopword('z')         # リストにない
assert not is_stopword('bout')      # 後方一致されない
assert not is_stopword('acros')     # 前方一致されない
assert not is_stopword('fte')       # 中間一致されない
assert not is_stopword(' ')         # 空白
assert not is_stopword('\n')        # 制御コード
assert not is_stopword('')          # 空文字

# 72

In [13]:
fname_sentiment = 'sentiment.txt'
fname_features = 'features.txt'
fname_theta='theta.npy'

In [14]:
#素性抽出
ps=PS()
word_counter = Counter()

In [15]:
with codecs.open(fname_sentiment,'r',fencoding) as file_in:
    for line in file_in:
        for word in line[3:].split(' '):
            word = word.strip()
            
            if is_stopword(word):
                continue
            else:
                word = ps.stem(word)
                
                if word != '!' and word != '?' and len(word) <=1:
                    continue
                word_counter.update([word])
features = [word for word,count in word_counter.items() if count >= 6]
    
with codecs.open(fname_features,'w',fencoding) as file_out:
    print(*features,sep='\n',file=file_out)
                

In [16]:
! head -n 100 features.txt

dumb
exploit
violenc
iron
becom
everyth
clumsi
origin
against
exercis
chill
style
film
insid
out
eye
sens
mysteri
frailti
offer
much
those
sit
around
midnight
tell
creepi
stori
give
each
willi
there'
way
won't
talk
onc
theater
parker
display
play
class
wild
himself
littl
question
seriou
work
import
director
someth
new
reel
love
david
one
told
entir
point
view
movi
such
excel
job
itself
develop
critic
feel
more
crush
worst
man
made
women
slow
silli
unintent
hilari
oddli
rivet
documentari
piano
teacher
titl
charact
control
?
certainli
decad
life
classic
franchis
let'
hope
problem
dramat
premis
mr
content
state
construct


In [17]:
learn_alpha=6.0
learn_count=1000

In [18]:
def hypothesis(data_x,theta):
    '''
    仮説関数
    data_xからdata_yを予測'''
    return 1.0/(1.0+np.exp(-data_x.dot(theta)))

In [19]:
def cost(data_x,theta,data_y):
    '''
    目的関数
    data_yに対して予測した結果と正解の差を算出'''
    m = data_y.size
    h = hypothesis(data_x,theta)
    j = 1/ m*np.sum(-data_y*np.log(h) - (np.ones(m)-data_y) * np.log(np.ones(m)-h))
    
    return j

In [20]:
def gradient(data_x,theta,data_y):
    '''
    最急降下法における勾配の算出
    '''
    
    m = data_y.size
    h= hypothesis(data_x,theta)
    grad = 1/m*(h-data_y).dot(data_x)
    return grad

In [21]:
def extract_features(data,dict_features):
    '''
    文章から素性を抽出
    '''
    data_one_x = np.zeros(len(dict_features)+1,dtype=np.float64)
    data_one_x[0]=1
    
    for word in data.split(' '):
        word = word.strip()
        if is_stopword(word):
            continue
        word = ps.stem(word)
        
        try:
            data_one_x[dict_features[word]]=1
        except:
            pass
    return data_one_x

In [22]:
def load_dict_features():
    '''
    素性をインデックスに変換する辞書
    '''
    with codecs.open(fname_features,'r',fencoding) as file_in:
        return {line.strip(): i for i,line in enumerate(file_in,start=1)}

In [23]:
def create_training_set(sentiments,dict_features):
    '''
    学習対象の行列と、極性ラベルの行列をreturn
    '''
    data_x = np.zeros([len(sentiments),len(dict_features)+1],dtype=np.float64)
    data_y = np.zeros(len(sentiments),dtype=np.float64)
    
    for i, line in enumerate(sentiments):
        data_x[i] = extract_features(line[3:],dict_features)
        if line[0:2]=='+1':
            data_y[i]=1
    return data_x,data_y

In [24]:
def learn(data_x,data_y,alpha,count):
    '''
    logistic reg の学習
    '''
    theta = np.zeros(data_x.shape[1])
    c = cost(data_x,theta,data_y)
    print('\t学習かいし\t cost:{}'.format(c))
    
    for i in range(1,count+1):
        grad = gradient(data_x,theta,data_y)
        theta -= alpha*grad
        
        if i %100==0:
            c = cost(data_x,theta,data_y)
            e = np.max(np.absolute(alpha * grad))
            
            print('\t学習中(#{})\tcost:{}\tE:{}'.format(i,c,e))
    c = cost(data_x,theta,data_y)
    e = np.max(np.absolute(alpha*grad))
    print("\t学習完了(#{})\t cost:{}\tE:{}".format(i,c,e))
    return theta

In [25]:
dict_features = load_dict_features()

In [26]:
with codecs.open(fname_sentiment,'r',fencoding) as file_in:
    data_x,data_y = create_training_set(list(file_in),dict_features)
    
print("学習率:{}\t学習繰り返し数:{}".format(learn_alpha,learn_count))
theta = learn(data_x,data_y,alpha=learn_alpha,count=learn_count)

np.save(fname_theta,theta)

学習率:6.0	学習繰り返し数:1000
	学習かいし	 cost:0.6931471805599453
	学習中(#100)	cost:0.4867235807923767	E:0.006382759714404753
	学習中(#200)	cost:0.4378801012698738	E:0.0035625972729612055
	学習中(#300)	cost:0.4102695665075672	E:0.0025672312512357827
	学習中(#400)	cost:0.39138225593804876	E:0.0021004322327511297
	学習中(#500)	cost:0.3772327785443585	E:0.0017741643510842176
	学習中(#600)	cost:0.36604076797985047	E:0.0015494001749209014
	学習中(#700)	cost:0.35685875708950915	E:0.001393726759405461
	学習中(#800)	cost:0.34912420652102616	E:0.0012703810950406992
	学習中(#900)	cost:0.34247693043489486	E:0.001169977113623327
	学習中(#1000)	cost:0.3366731553338597	E:0.0010864905858590868
	学習完了(#1000)	 cost:0.3366731553338597	E:0.0010864905858590868


# 74

In [27]:
dict_features = load_dict_features()
theta = np.load(fname_theta)

In [None]:
review = input()

In [None]:
data_one_x = extract_features(review,dict_features)

In [30]:
h = hypothesis(data_one_x,theta)
print("h->",h)
if h>0.5:
    print("label+1")
else:
    print("label:-1")

h-> 0.4234288486372619
label:-1


# 75

In [31]:
with codecs.open(fname_features,'r',fencoding) as file_in:
    feature = list(file_in)
theta = np.load(fname_theta)
index_sorted = np.argsort(theta)

In [32]:
print('top 10')
for index in index_sorted[:-11:-1]:
    print('\t{}\t{}'.format(theta[index],features[index-1].strip() if index >0 else '(none)'))

top 10
	2.3233484565362064	engross
	2.30648948096215	refresh
	2.0010357552489624	unexpect
	1.8598244770606749	remark
	1.7798733743017685	examin
	1.7099157134404346	resist
	1.6947156223762303	captur
	1.6712997670895995	delight
	1.6536397975294048	confid
	1.6223701025900543	refreshingli


In [33]:
print('worts 10')
for index in index_sorted[:10]:
    print('\t{}\t{}'.format(theta[index],features[index-1].strip() if index >0 else '(none)'))

worts 10
	-2.6758048728517405	bore
	-2.4008820178245975	dull
	-2.2153564633347123	wast
	-2.103540416702802	fail
	-2.0395296543425876	badli
	-2.0001748002734248	worst
	-1.9471133426132452	flat
	-1.9437641860859185	plod
	-1.9361962415934924	mediocr
	-1.9274423570723673	routin


# 76

In [34]:
fname_result="result.txt"

In [35]:
dict_features=load_dict_features()
theta = np.load(fname_theta)

In [36]:
with codecs.open(fname_sentiment,'r',fencoding) as file_in:
    with open(fname_result,'w' ) as file_out:
        for line in file_in:
            data_one_x = extract_features(line[3:],dict_features)
            
            h = hypothesis(data_one_x,theta)
            if h>0.5:
                file_out.write("{}\t{}\t{}\n".format(line[0:2],'+1',h))
            else:
                file_out.write("{}\t{}\t{}\n".format(line[0:2],'-1',h))

# 77

In [37]:
! head result.txt

-1	-1	0.02184241302738834
+1	+1	0.8399349925940273
+1	+1	0.9524430656464458
+1	+1	0.5211158153710528
+1	+1	0.7148875324972783
-1	+1	0.7742974466112117
-1	-1	0.18509774706230664
-1	-1	0.04202627002553687
-1	-1	0.2342336275938971
+1	+1	0.9085934249665856


In [38]:
def score(fname):
    TP = 0      # True-Positive     予想が+1、正解も+1
    FP = 0      # False-Positive    予想が+1、正解は-1
    FN = 0      # False-Negative    予想が-1、正解は+1
    TN = 0      # True-Negative     予想が-1、正解も-1
    with open(fname) as data_file:
        for line in data_file:
            cols = line.split('\t')
            
            if len(cols)<3:
                continue
            if cols[0]=='+1':
                if cols[1]=='+1':
                    TP+=1
                else:
                    FN+=1
            else:
                if cols[1]=="+1":
                    FP+=1
                else:
                    TN +=1
                    
    accuracy = (TP+TN) / (TP+FP+FN+TN)#正解率
    precision = TP/(TP+FP)#適合りつ
    recall = TP/(TP+FN)#再現率
    f1 = (2*recall*precision) /(recall+precision)#F1
    return accuracy,precision,recall,f1

In [39]:
print('正解率　\t{}\n適合率　\t{}\n再現率　\t{}\nF1スコア　\t{}'.format(*score(fname_result)))

正解率　	0.8643781654473832
適合率　	0.8657503295048014
再現率　	0.8625023447758394
F1スコア　	0.8641232850967863


# 78

In [40]:
dict_features = load_dict_features()

In [41]:
with codecs.open(fname_sentiment,'r',fencoding) as file_in:
    sentiments_all = list(file_in)

In [42]:
division=5

In [43]:
sentiments = []
unit = int(len(sentiments_all) / division)
for i in range(5):
    sentiments.append(sentiments_all[i+unit:(i+1)*unit])

In [44]:
with open(fname_result,'w' ) as file_out:
    for i in range(division):
        print("{}/{}".format(i+1,division))
        
        data_learn=[]
        for j in range(division):
            if i==j:
                data_validation = sentiments[j]
            else:
                data_learn += sentiments[j]
        
        data_x,data_y = create_training_set(data_learn,dict_features)
        theta = learn(data_x,data_y,alpha=learn_alpha,count=learn_count)
        
        for line in data_validation:
            data_one_x = extract_features(line[3:],dict_features)
            
            h = hypothesis(data_one_x,theta)
            if h > 0.5:
                file_out.write('{}\t{}\t{}\n'.format(line[0:2], '+1', h))
            else:
                file_out.write('{}\t{}\t{}\n'.format(line[0:2], '-1', 1 - h))
print('正解率　\t{}\n適合率　\t{}\n再現率　\t{}\nF1スコア　\t{}'.format(*score(fname_result)))

1/5
	学習かいし	 cost:0.6931471805599453
	学習中(#100)	cost:0.47300235702049015	E:0.00635449496408459


KeyboardInterrupt: 