### 4-2辭典法

#### 建立正負向情感字典

In [1]:
import pandas as pd

sentiment_df = pd.read_csv('SentimentDict.csv')  

positives_set = set(sentiment_df['positive'])

negatives_set = set(sentiment_df['negative'])

not_set = set(sentiment_df['not'])

#### 建立程度詞字典

In [2]:
degree_dict = {}

for word in sentiment_df['degree-1']:
    degree_dict[word] = 1.8
    
for word in sentiment_df['degree-2']:
    degree_dict[word] = 1.6

for word in sentiment_df['degree-3']:
    degree_dict[word] = 1.4

for word in sentiment_df['degree-4']:
    degree_dict[word] = 1.2

for word in sentiment_df['degree-5']:
    degree_dict[word] = 1.1

for word in sentiment_df['degree-6']:
    degree_dict[word] = 1.5


#### 安裝結巴 

In [3]:
!pip install jieba

import jieba

jieba.load_userdict('dict.txt.big')



Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\user\AppData\Local\Temp\jieba.cache
Loading model cost 1.978 seconds.
Prefix dict has been built successfully.


#### 有否定詞嗎

In [4]:
def hasOpposite(wordlist):
    opp = False

    for word in wordlist:
        if word in not_set:
            opp = True
    
    return opp

#### 取得degree

In [5]:
def getDegree(wordlist):
    degree = 1.0
    for word in wordlist:
        if word in degree_dict:
            degree = degree_dict[word]
    
    return degree

#### 打分數

In [6]:
def analyze (text):
    token = list(jieba.cut(text))
    print(token)
    sum  = 0 
    
    for word in token:
        if word.lower() in positives_set:
            sum += 1
        elif word.lower() in negatives_set:
            sum -= 1
    
    if hasOpposite(token):
        sum = -sum
    sum = sum*getDegree(token)
    
    return sum

#### 分析情感

In [7]:
def sentiment_analysis(text):
    
    sentiment= ""
    
    score =  analyze(text)
    
    print('情感分數:',score)
    
    if score > 0:
        sentiment = "正向"
    elif score < 0.0:
        sentiment = "負向"
        
    print('感情為:',sentiment)
    

In [8]:
text = '我很不高興'
sentiment_analysis(text)

['我', '很', '不', '高興']
情感分數: -1.6
感情為: 負向


In [9]:
text = '你是否同意以「平均每年至少降低 1%」之方式逐年降低火力發電廠發電量?'
sentiment_analysis(text)

['你', '是否', '同意', '以', '「', '平均', '每年', '至少', '降低', ' ', '1%', '」', '之', '方式', '逐年', '降低', '火力發電廠', '發電量', '?']
情感分數: 1.0
感情為: 正向


#### 4-3分類法

In [10]:
!pip install jieba
import jieba
jieba.load_userdict('dict.txt.big')



In [12]:
import pandas as pd
df = pd.read_excel('cvat1.xlsx')


def segment(text):
    seg = ' '.join(jieba.cut(text))
    return seg

df['text'] = df['text'].apply(segment)

In [13]:
df.head(5)

Unnamed: 0,No.,text,sentiment
0,1322,昨天 以三比 一 （ 五 戰三勝 賽制 ） 打敗 韓國 AzubuFrost 隊 ， 拿下 ...,1
1,1503,《 英雄 聯盟 》 Season2 世界 冠軍賽 十月 十四日 的 總冠軍 賽 ， 臺灣 戰...,1
2,607,環境 非常 非常 好 ， 庭院式 小棟 別墅 ， 雖然 說 酒店 年限 很久 了 ， 據說 ...,1
3,105,這是 一本 非常 值得 推薦 的 一 本書 ， 我 讀後 收益 頗 多 ， 並且 已 推薦 ...,1
4,166,我 已經 能 想象 到 小侄女 拿到 書 的 享受 樣子 了 ， 呵呵 ， 贈人 好書 ， ...,1


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

X_train_raw, X_test_raw, y_train, y_test = train_test_split(df['text'],df['sentiment'], test_size=0.3, random_state=2)

vectorizer = TfidfVectorizer()

classifier = LogisticRegression()

X_train = vectorizer.fit_transform(X_train_raw)

classifier.fit(X_train, y_train)

X_test = vectorizer.transform(X_test_raw)

y_hat = classifier.predict(X_test)

In [15]:
print(y_hat)

[1 0 1 1 0 0 0 0 1 1 1 0 1 1 0 1 1 1 0 1 0 1 1 0 1 1 0 1 1 0 0 0 0 0 0 1 0
 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1
 0 1 1 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 1 0 0 0 1 0 0 0 1 1 1 1 1 0 0 0 0 1
 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 1 1 0
 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 1 1 0 1 0 0 0 0 1 1
 1 0 0 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 0 0 0 1 1 1 1 0 1
 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1
 1 0 0 1 1 0 0 0 1 0 0 1 1 0]


In [16]:
classifier.score(X_test,y_test)

0.8131868131868132

In [17]:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()

print("tn=",tn,"fp=", fp,"fn=", fn,"tp=", tp)

confusion_matrix(y_test, y_hat)

tn= 124 fp= 15 fn= 36 tp= 98


array([[124,  15],
       [ 36,  98]], dtype=int64)

In [18]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83       139
           1       0.87      0.73      0.79       134

    accuracy                           0.81       273
   macro avg       0.82      0.81      0.81       273
weighted avg       0.82      0.81      0.81       273



In [19]:
import warnings
warnings.filterwarnings("ignore")

#### imdb情感分析

In [1]:
from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.datasets import imdb

np.random.seed(1337)  # for reproducibility

max_features = 5000  
maxlen = 400         
batch_size = 32      
embedding_dims = 50  
nb_filter = 250      
filter_length = 3    
hidden_dims = 250    
nb_epoch = 60        

Using TensorFlow backend.


In [2]:
print('Loading data...')
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
print(X_train)
print(y_train)

Loading data...




[list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32])
 list([1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 4369, 2, 134, 26, 

In [3]:
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Pad sequences (samples x time)
X_train shape: (25000, 400)
X_test shape: (25000, 400)


In [4]:
model = Sequential()

model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen,
                    dropout=0.2))


model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1))






  
  del sys.path[0]


In [5]:
model.add(GlobalMaxPooling1D())


model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))


model.add(Dense(1))
model.add(Activation('sigmoid'))


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [6]:
model.summary()  

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 50)           250000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 398, 250)          37750     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_1 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
__________

In [None]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=60)



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


  import sys





Epoch 1/60





Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
 1888/25000 [=>............................] - ETA: 11s - loss: 2.7098e-04 - acc: 1.0000

In [None]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))