# 情感分析

## 1.导入

In [3]:
import pandas as pd

import nltk 
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn

import string

## 2.分词

#### 导入文本

In [4]:
# Sample review sentence used by the step-by-step cells of this walkthrough.
text = (
    'Nice quality, fairly quiet, nice looking and not too big.  '
    'I bought two.'
)

#### 载入停用词

In [10]:
# Stop-word list: NLTK's English stop words followed by every character of
# string.punctuation, kept as a single flat list.
sw = [*stopwords.words("english"), *string.punctuation]
print(sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [11]:
# Spanish stop words, shown for comparison with the English list
spanish_stopwords = stopwords.words("spanish")
print(spanish_stopwords)

['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se', 'las', 'por', 'un', 'para', 'con', 'no', 'una', 'su', 'al', 'lo', 'como', 'más', 'pero', 'sus', 'le', 'ya', 'o', 'este', 'sí', 'porque', 'esta', 'entre', 'cuando', 'muy', 'sin', 'sobre', 'también', 'me', 'hasta', 'hay', 'donde', 'quien', 'desde', 'todo', 'nos', 'durante', 'todos', 'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto', 'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro', 'otras', 'otra', 'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo', 'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus', 'ellas', 'nosotras', 'vosostros', 'vosostras', 'os', 'mío', 'mía', 'míos', 'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas', 'suyo', 'suya', 'suyos', 'suyas', 'nuestro', 'nuestra', 'nuestros', 'nuestras', 'vuestro', 'vuestra', 'vuestros', 'vuestras', 'esos', 'esas', 'estoy', 'estás', 'está', 'estamos', 'estáis', 'están', 

#### 分词

In [13]:
# Lower-case the text, tokenize it, and keep only the tokens that are not in
# the stop-word / punctuation list `sw`.
tokenize = []
for token in word_tokenize(str(text).lower()):
    if token not in sw:
        tokenize.append(token)
print(tokenize)

['nice', 'quality', 'fairly', 'quiet', 'nice', 'looking', 'big', 'bought', 'two']


In [16]:
# Tokenize the raw text with word_tokenize alone (no lower-casing, no stop-word filtering)
print(word_tokenize(text))

['Nice', 'quality', ',', 'fairly', 'quiet', ',', 'nice', 'looking', 'and', 'not', 'too', 'big', '.', 'I', 'bought', 'two', '.']


## 3.计数、词性标签

#### 词性标签

In [18]:
# POS-tag the filtered tokens; the result is a list of (token, tag) pairs
postag = nltk.pos_tag(tokenize)
print(postag)

[('nice', 'JJ'), ('quality', 'NN'), ('fairly', 'RB'), ('quiet', 'JJ'), ('nice', 'JJ'), ('looking', 'VBG'), ('big', 'JJ'), ('bought', 'VBD'), ('two', 'CD')]


#### 词频

In [23]:
# Count how often each (token, tag) pair occurs; listing the distribution
# shows its distinct pairs
freq = nltk.FreqDist(postag)
print(list(freq))

[('nice', 'JJ'), ('quality', 'NN'), ('fairly', 'RB'), ('quiet', 'JJ'), ('looking', 'VBG'), ('big', 'JJ'), ('bought', 'VBD'), ('two', 'CD')]


#### 按照词频排序

In [25]:
# ((token, tag), count) entries ordered from most to least frequent
word_list = freq.most_common()
print(word_list)

[(('nice', 'JJ'), 2), (('quality', 'NN'), 1), (('fairly', 'RB'), 1), (('quiet', 'JJ'), 1), (('looking', 'VBG'), 1), (('big', 'JJ'), 1), (('bought', 'VBD'), 1), (('two', 'CD'), 1)]


#### 查看pos_tag的意思

In [53]:
# nltk.help.upenn_tagset()

#### 存入DataFrame

In [44]:
# Each entry of word_list is ((token, tag), count); split it into three
# parallel columns and assemble the word / pos / freq table.
word_ = [entry[0][0] for entry in word_list]
pos_ = [entry[0][1] for entry in word_list]
freq_ = [entry[1] for entry in word_list]

df = pd.DataFrame(
    {'word': word_, 'pos': pos_, 'freq': freq_},
    columns=['word', 'pos', 'freq'],
)
df

Unnamed: 0,word,pos,freq
0,nice,JJ,2
1,quality,NN,1
2,fairly,RB,1
3,quiet,JJ,1
4,looking,VBG,1
5,big,JJ,1
6,bought,VBD,1
7,two,CD,1


## 4.计算单词得分

#### 函数解读

In [55]:
# Look up the SentiWordNet synsets for the word "great" (no POS filter given)
sentis = swn.senti_synsets("great")
print(list(sentis))

[SentiSynset('great.n.01'), SentiSynset('great.s.01'), SentiSynset('great.s.02'), SentiSynset('great.s.03'), SentiSynset('bang-up.s.01'), SentiSynset('capital.s.03'), SentiSynset('big.s.13')]


#### 单词得分

In [72]:
# The string form of a single SentiSynset shows its positive and negative scores
str(swn.senti_synset('great.s.02'))

'<great.s.02: PosScore=0.75 NegScore=0.0>'

In [80]:
# Positive score
print(swn.senti_synset('great.s.02').pos_score())
# Negative score
print(swn.senti_synset("awful.s.02").neg_score())

0.75
0.625


#### 词性标签转换(Penn Treebank 标签 → WordNet 词性代码)

In [81]:
# Penn Treebank tags grouped by the WordNet POS code they are converted to.
# Tags converted to 'n' (noun); note that UH is placed in this group as well.
n = ['NN', 'NNP', 'NNPS', 'NNS', 'UH']
# Tags converted to 'v' (verb).
v = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
# Tags converted to 'a' (adjective).
a = ['JJ', 'JJR', 'JJS']
# Tags converted to 'r' (adverb); RP and WRB are placed in this group as well.
r = ['RB', 'RBR', 'RBS', 'RP', 'WRB']

In [83]:
# Convert the Penn Treebank tags in the 'pos' column to WordNet POS codes
# ('n', 'v', 'a', 'r'); tags that appear in none of the four lists become ''.
wordnet_codes = ('n', 'v', 'a', 'r')
tag_to_code = {}
for code, penn_tags in zip(wordnet_codes, (n, v, a, r)):
    for penn_tag in penn_tags:
        # setdefault keeps the n -> v -> a -> r precedence of an if/elif chain
        # should a tag ever be listed in more than one group.
        tag_to_code.setdefault(penn_tag, code)

# Values that are already WordNet codes pass through unchanged, so running
# this cell a second time no longer blanks out the whole column.
df['pos'] = [
    tag_to_code.get(tag, tag if tag in wordnet_codes else '')
    for tag in df['pos']
]

In [84]:
# Show the frame now that the 'pos' column holds the converted codes
df

Unnamed: 0,word,pos,freq
0,nice,a,2
1,quality,n,1
2,fairly,r,1
3,quiet,a,1
4,looking,v,1
5,big,a,1
6,bought,v,1
7,two,,1


#### 计算每个单词的加权得分

In [85]:
# One score per row of df: the weighted mean of (pos_score - neg_score) over
# the row's SentiWordNet synsets, where the synset at 1-based position `rank`
# in the returned list gets weight 1/rank. Rows without any synset score 0.
score = []
for word, pos in zip(df.iloc[:, 0], df.iloc[:, 1]):
    # senti_synsets(word, pos)
    synsets = list(swn.senti_synsets(word, pos))
    if not synsets:
        score.append(0)
        continue
    weighted_total = 0
    weight_total = 0
    for rank, synset in enumerate(synsets, start=1):
        weighted_total += (synset.pos_score() - synset.neg_score()) / rank
        weight_total += 1 / rank
    score.append(weighted_total / weight_total)

# Attach the scores as a new 'score' column next to word / pos / freq.
new_df = pd.concat([df, pd.DataFrame({'score': score})], axis=1)

In [94]:
# Show every word together with its computed score column
new_df

Unnamed: 0,word,pos,freq,score
0,nice,a,2,0.708942
1,quality,n,1,0.35219
2,fairly,r,1,-0.034091
3,quiet,a,1,-0.218537
4,looking,v,1,0.012092
5,big,a,1,0.103294
6,bought,v,1,0.083942
7,two,,1,0.0


# 规整代码

In [157]:
import numpy as np
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn

import string

# Penn Treebank tags grouped by the WordNet POS code that calculate_sentiment
# converts them to.
# Converted to 'n' (noun); note that UH is placed in this group as well.
n = ['NN', 'NNP', 'NNPS', 'NNS', 'UH']
# Converted to 'v' (verb).
v = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
# Converted to 'a' (adjective).
a = ['JJ', 'JJR', 'JJS']
# Converted to 'r' (adverb); RP and WRB are placed in this group as well.
r = ['RB', 'RBR', 'RBS', 'RP', 'WRB']


def _penn_to_wordnet_pos(tag):
    """Return the WordNet POS code for a Penn Treebank tag, or '' if unmapped."""
    # Checked in the order n, v, a, r, using the module-level tag lists.
    for code, penn_tags in (('n', n), ('v', v), ('a', a), ('r', r)):
        if tag in penn_tags:
            return code
    return ''


def _rank_weighted_score(word, pos):
    """Return the 1/rank-weighted mean of (pos_score - neg_score) over the synsets of word/pos.

    The synset at 1-based position ``rank`` in the list returned by
    ``swn.senti_synsets`` gets weight ``1 / rank``. Returns 0 when the lookup
    yields no synsets.
    """
    synsets = list(swn.senti_synsets(word, pos))
    if not synsets:
        return 0
    weighted_total = 0
    weight_total = 0
    for rank, synset in enumerate(synsets, start=1):
        weighted_total += (synset.pos_score() - synset.neg_score()) / rank
        weight_total += 1 / rank
    return weighted_total / weight_total


def calculate_sentiment(text):
    """Label ``text`` as "positive" or "negative" from SentiWordNet scores.

    Steps:
      1. Convert ``text`` with ``str()``, lower-case it, tokenize it, and drop
         English stop words and punctuation characters.
      2. POS-tag the remaining tokens and collapse identical (token, tag)
         pairs, so each distinct pair is scored once.
      3. Convert each Penn Treebank tag to a WordNet POS code using the
         module-level lists ``n``, ``v``, ``a`` and ``r`` ('' if unmapped).
      4. Score each pair with a rank-weighted mean of
         ``pos_score - neg_score`` over its SentiWordNet synsets.
      5. Sum the per-pair scores.

    Parameters
    ----------
    text : object
        The text to analyse; anything accepted by ``str()``.

    Returns
    -------
    tuple
        ``(attitude, score_sum)`` where ``attitude`` is ``"positive"`` when
        ``score_sum > 0`` and ``"negative"`` otherwise, and ``score_sum`` is
        the numpy float sum of the per-pair scores.
    """
    # A set makes each membership test constant-time instead of a scan of the
    # whole stop-word list for every token.
    # NOTE(review): the English stop-word list contains "not", "no" and "nor",
    # so negation is removed before scoring - confirm this is intended.
    stop_words = set(stopwords.words("english")) | set(string.punctuation)
    tokens = [tok for tok in word_tokenize(str(text).lower()) if tok not in stop_words]

    # most_common() yields ((token, tag), count) entries, most frequent first.
    tagged_counts = nltk.FreqDist(nltk.pos_tag(tokens)).most_common()

    # NOTE(review): the count is not used as a weight, so a word repeated with
    # the same tag contributes once to the total - confirm this is intended.
    word_scores = [
        _rank_weighted_score(word, _penn_to_wordnet_pos(tag))
        for (word, tag), _count in tagged_counts
    ]

    score_sum = np.sum(np.array(word_scores, dtype=float))
    # NOTE(review): a total of exactly 0 (including empty input) is labelled
    # "negative" because only strictly positive totals count as positive.
    attitude = "positive" if score_sum > 0 else "negative"
    return attitude, score_sum

## 影评分析

In [186]:
# Three free-text movie reviews used as sample input for calculate_sentiment
imdb = [
    """It might have been an error on Disney's part to release only the first 2 episodes at once instead of 4 episodes. While I am on of the few people who absolutely loved the intrigue and mystery of the first three episodes, this exceptional show has been getting a lot of hate due to a lack of plot.
... And then the 4th episode came out. Not only was it an exceptional episode, but it justified the use of the first three and even gave some answers to the many questions of Wanda vision.
Overall a brilliant show that shouldn't be judged on the first three episodes (especially if your only complaint is a lack of plot)""",
    """Q1: Would you lie to the police and falsely accuse your best friend's father of child abuse so that your best friend could run away for a couple of days to see their SO?
    Q2: Would you throw your purse (assuming you have one) in a river and break your cell phone in order to pretend to be pushed off a bridge by your best friend to play a joke on your parents?
    Q3: Would you scream at your parents and yell you were a terrible person and you killed your best friend because of your Dad when you knew this was a prank all along?
    If your answers to all of these are "yes", this might be the movie for you.""",
    """Only real reason I'm writing this is to counter everyone else's reviews absurdly angry reviews. This was a decent story, well acted and no I didn't know where it was going. I went along with the ride. People are mad at the plot that the parents did stupid things... well people do stupid things. Isn't that the point? Also, if a movie inspired a reaction from you... isn't that also the point?"""
]

In [189]:
# Keep only the attitude label (first element of each returned pair) per review.
review = [attitude for attitude, _score_sum in map(calculate_sentiment, imdb)]
print(review)

['positive', 'positive', 'positive']
