In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Parse the training file as ';'-separated text with no header row and name the two columns explicitly.
raw_text = pd.read_csv('emotions_train.txt', sep=';', header=None, names=['sentence', 'emotion'])
# Split the frame into the input sentences and their emotion labels.
train_data, train_emotion = raw_text['sentence'], raw_text['emotion']

In [3]:
# Display the loaded DataFrame (the rendered output elides the middle rows).
raw_text

Unnamed: 0,sentence,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger


In [4]:
# Display the 'sentence' column that serves as the training input.
train_data

0                                  i didnt feel humiliated
1        i can go from feeling so hopeless to so damned...
2         im grabbing a minute to post i feel greedy wrong
3        i am ever feeling nostalgic about the fireplac...
4                                     i am feeling grouchy
                               ...                        
15995    i just had a very brief time in the beanbag an...
15996    i am now turning and i feel pathetic that i am...
15997                       i feel strong and good overall
15998    i feel like this was such a rude comment and i...
15999    i know a lot but i feel so stupid because i ca...
Name: sentence, Length: 16000, dtype: object

In [5]:
# Display the 'emotion' column that serves as the training labels.
train_emotion

0        sadness
1        sadness
2          anger
3           love
4          anger
          ...   
15995    sadness
15996    sadness
15997        joy
15998      anger
15999    sadness
Name: emotion, Length: 16000, dtype: object

---

### scikit-learn 나이브 베이즈 감정 분석

In [6]:
# Fit the vectorizer on the training sentences and encode them in one step;
# `cv` is kept so the same fitted vocabulary can encode new sentences later.
cv = CountVectorizer()
transformed_text = cv.fit_transform(train_data)

In [7]:
# Train a multinomial naive Bayes classifier on the vectorized sentences and their emotion labels.
clf = MultinomialNB()
# Left as a bare trailing expression so the cell displays the fitted estimator.
clf.fit(transformed_text, train_emotion)

MultinomialNB()

In [8]:
# Hand-written sentences used to spot-check the trained classifier.
test_data = [
    'i am curious',
    'i feel gloomy and tired',
    'i feel more creative',
    'i feel a little mellow today',
]

In [9]:
# Encode the test sentences with the already-fitted vectorizer (transform only, no refit),
# then predict an emotion label for each sentence and print the labels.
doc_vector = cv.transform(test_data)
test_result = clf.predict(doc_vector)
print(test_result)

['surprise' 'sadness' 'joy' 'joy']


---

###  cal_partial_freq
텍스트 데이터(texts)와 특정 감정(emotion)을 매개변수로 받아, 해당 감정을 나타내는 문장만 filtered_texts에 저장한 뒤 그 문장들에 등장하는 각 단어의 빈도수를 담은 딕셔너리(partial_freq)를 반환

In [10]:
def cal_partial_freq(texts, emotion):
    """Count how often each word occurs in the sentences labelled ``emotion``.

    Args:
        texts: DataFrame with 'sentence' and 'emotion' columns.
        emotion: Emotion label whose sentences are counted.

    Returns:
        dict mapping each whitespace-separated word to its number of
        occurrences across the selected sentences (empty if no row matches).
    """
    # Keep only the sentences whose label equals the requested emotion.
    sentences = texts.loc[texts['emotion'] == emotion, 'sentence']

    partial_freq = {}
    for sentence in sentences:
        for token in sentence.split():
            partial_freq[token] = partial_freq.get(token, 0) + 1

    return partial_freq

### cal_total_freq 
partial_freq 딕셔너리를 입력받아, 특정 감정별 문서 내 전체 단어의 빈도수를 계산하여 반환

In [11]:
def cal_total_freq(partial_freq):
    """Return the total number of word occurrences recorded in ``partial_freq``.

    Args:
        partial_freq: dict mapping word -> occurrence count.

    Returns:
        The sum of all counts (0 for an empty dict).
    """
    return sum(partial_freq.values())

In [12]:
# Emotions dataset for NLP: parsed as ';'-separated sentence/emotion columns with no header row.
data = pd.read_csv("emotions_train.txt", sep=';', header=None, names=['sentence', 'emotion'])

# Word counts for each of the three emotions examined below.
joy_counter = cal_partial_freq(data, "joy")
sad_counter = cal_partial_freq(data, "sadness")
sup_counter = cal_partial_freq(data, "surprise")

# Likelihood of "happy" in joy sentences: its count divided by all word occurrences in joy sentences.
joy_likelihood = joy_counter['happy'] / cal_total_freq(joy_counter)
print(joy_likelihood)

# Likelihood of "happy" in sadness sentences.
sad_likelihood = sad_counter['happy'] / cal_total_freq(sad_counter)
print(sad_likelihood)

# Likelihood of "can" in surprise sentences.
sup_likelihood = sup_counter['can'] / cal_total_freq(sup_counter)
print(sup_likelihood)

0.001415604166467398
0.0005952589376378725
0.002363652280486737


### cal_prior_prob
입력되는 data 내 특정 감정의 로그 발생 확률(사전 확률의 로그)을 반환해 주는 함수

In [13]:
def cal_prior_prob(data, emotion):
    """Return the natural log of the fraction of rows in ``data`` labelled ``emotion``.

    Args:
        data: DataFrame with an 'emotion' column.
        emotion: Emotion label whose relative frequency is measured.

    Returns:
        ``np.log(matching_rows / total_rows)``.
    """
    is_match = data['emotion'] == emotion
    match_count = len(data[is_match])
    ratio = match_count / len(data)
    return np.log(ratio)

### predict_emotion
매개변수 data로 지정한 학습 데이터(예: 'emotions_train.txt' 파일 경로)를 사용하여 sent의 각 감정별 로그 확률을 계산해 주는 함수

* 감정별 로그 확률 계산을 위해 단어의 로그 가능도를 사용
* 스무딩 값을 10으로 설정

In [14]:
def predict_emotion(sent, data, smoothing=10):
    """Predict the most likely emotion for ``sent`` with a hand-rolled naive Bayes score.

    For every candidate emotion the score is the sum, over the
    whitespace-separated words of ``sent``, of
    ``log((count(word, emotion) + smoothing) / (total_words(emotion) + smoothing))``
    plus the log prior of the emotion.

    Args:
        sent: Sentence to classify; it is split on whitespace.
        data: Training data. Either a source accepted by ``pd.read_csv``
            (e.g. a file path) holding ';'-separated sentence/emotion rows
            with no header, or an already-loaded DataFrame with 'sentence'
            and 'emotion' columns.
        smoothing: Constant added to the numerator and the denominator of
            every word likelihood. Defaults to 10; should be positive so
            that unseen words do not produce ``log(0)``.

    Returns:
        The ``(emotion, log_probability)`` tuple with the highest score.
    """
    emotions = ['anger', 'love', 'sadness', 'fear', 'joy', 'surprise']
    # Accept a pre-loaded frame so callers need not re-read the file on every prediction.
    if isinstance(data, pd.DataFrame):
        train_txt = data
    else:
        train_txt = pd.read_csv(data, delimiter=';', header=None, names=['sentence', 'emotion'])

    words = sent.split()
    predictions = []

    for emotion in emotions:
        # The counts depend only on the emotion, so build them once per
        # emotion instead of once per word.
        emotion_counter = cal_partial_freq(train_txt, emotion)
        denominator = cal_total_freq(emotion_counter) + smoothing

        prob = 0
        for word in words:
            # .get(word, 0): a word never seen with this emotion contributes
            # only the smoothing term instead of raising KeyError.
            prob += np.log((emotion_counter.get(word, 0) + smoothing) / denominator)
        prob += cal_prior_prob(train_txt, emotion)
        predictions.append((emotion, prob))

    # Ascending stable sort: the last element carries the highest log
    # probability (on ties, the emotion listed later in `emotions`).
    predictions.sort(key=lambda pair: pair[1])
    return predictions[-1]

* (감정, 로그 확률)의 형태로 predictions 리스트에 저장
* 로그 확률값이 가장 높은 (감정, 로그 확률) 튜플을 반환

In [15]:
# Check the emotion predicted for a sample sentence, using the training file as the data source.
test_sent = "i really want to go and enjoy this party"
predicted = predict_emotion(test_sent, "emotions_train.txt")
print(predicted)

('surprise', -49.413280143234715)
