In [31]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from tqdm import tqdm

import numpy as np
import pandas as pd
import csv

# **1. NBC 수행**

## **1-1. 코퍼스 가져오기**

In [32]:
labeled_corpus = pd.read_csv('corpus.csv', index_col=[0], header = 0, encoding = 'utf-8')
corpus = labeled_corpus.copy()

In [33]:
corpus = corpus[corpus['token_ngram'].notna()]
corpus = corpus.dropna()
corpus

Unnamed: 0,token_ngram,label
0,"강진/NNG,송/NNG,기자/NNG,강진군/NNG,농가/NNG,소득/NNG,보전/N...",1
1,"단위/NNG,끝/NNG",1
2,"단위/NNG,끝/NNG",1
3,"단위/NNG,끝/NNG",1
4,"단위/NNG,끝/NNG",1
...,...,...
196445,"'국제/NNG,금융시장/NNG,안정/NNG,되/VV,지속/NNG,하/XSV,가운데/...",0
196446,"'성장/NNG,고용/NNG,회복국면/NNG,정점/NNG,향하/VV,불구/NNG,물가...",0
196447,"'금리정상화/NNG,행보/NNG,불구/NNG,미/NNG,달러/NNG,약세/NNG,기...",1
196448,"'임금/NNG,상승/NNG,확대/NNG,되/XSV,세제/NNG,개혁/NNG,기대/N...",0


In [34]:
# 토크나이저 함수
def my_tokenizer(x):
    return x.split(",")

## **1-2. 데이터 파이프 라인**

In [35]:
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), min_df = 15, tokenizer = my_tokenizer)),
                      ('clf', MultinomialNB(alpha=0.001))])
vect = text_clf.named_steps['vect']
clf = text_clf.named_steps['clf']

In [36]:
# 배깅 30회
accuracy = np.zeros(30)
posterior_list = []

for i in tqdm(range(30)):
    X_train, X_test, y_train, y_test = train_test_split(corpus['token_ngram'], corpus['label'], random_state = i, train_size = 0.9, shuffle=True)
    text_clf.fit(X_train, y_train)
    posterior_list.append(np.vstack([np.array(vect.get_feature_names()), np.exp(clf.feature_log_prob_)]))
    pred = text_clf.predict(X_test)
    accuracy[i] = np.sum(pred == y_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [10:01<00:00, 20.04s/it]


In [37]:
clf.classes_ 

array([-1,  0,  1], dtype=int64)

In [52]:
pd.DataFrame(posterior_list)

Unnamed: 0,0
0,"[[ 'fed/nng, '가계/nng, '경상/nng, '경제/nng, '국..."
1,"[[ 'fed/nng, '가계/nng, '경기/nng, '경제/nng, '국..."
2,"[[ 'fed/nng, '가계/nng, '경상/nng, '경제/nng, '국..."
3,"[[ 'fed/nng, '가계/nng, '경제/nng, '국제/nng, '글..."
4,"[[ 'fed/nng, '가계/nng, '경제/nng, '국제/nng, '글..."
5,"[[ 'fed/nng, '가계/nng, '경기/nng, '경제/nng, '국..."
6,"[[ 'fed/nng, '가계/nng, '경제/nng, '국제/nng, '글..."
7,"[[ 'fed/nng, '가계/nng, '경제/nng, '국제/nng, '글..."
8,"[[ 'fed/nng, '가계/nng, '경제/nng, '국제/nng, '글..."
9,"[[ 'fed/nng, '가계/nng, '경상/nng, '경제/nng, '국..."


In [38]:
pd.DataFrame(posterior_list).to_csv('posterior_list_real.csv')

In [39]:
cv = CountVectorizer(ngram_range=(1,1), min_df = 15, tokenizer = my_tokenizer)

In [40]:
X = cv.fit_transform(corpus['token_ngram'])

In [41]:
ngram_list = cv.get_feature_names()
len(ngram_list)

40351

In [42]:
polarity_scores = np.zeros((30, len(ngram_list)))

for i, itr in tqdm(enumerate(posterior_list)):
    for idx, n_gram in enumerate(itr[0]):
        tmp_n = ngram_list.index(n_gram)
        p_score = float(itr[3][idx])/float(itr[1][idx])
        polarity_scores[i][tmp_n] = p_score
        
polarity_scores.shape

30it [09:42, 19.42s/it]


(30, 40351)

In [43]:
for i, v in enumerate(polarity_scores):
    for j, w in enumerate(v):
         if polarity_scores[i][j] == 0:
             polarity_scores[i][j] = None

In [44]:
df_p_scores = pd.DataFrame(polarity_scores)
df_p_scores = df_p_scores.fillna(df_p_scores.mean())

In [45]:
df_p_scores.T.isnull().sum()

0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
dtype: int64

In [46]:
avg_polarity_scores = list(df_p_scores.mean())

In [47]:
ps = pd.DataFrame(avg_polarity_scores, index=ngram_list)
ps

Unnamed: 0,0
'fed/nng,0.841599
'가계/nng,0.275854
'경기/nng,0.213615
'경상/nng,0.356412
'경제/nng,0.353738
...,...
龍/nng,0.435291
劉/nng,0.753047
柳/nng,1.042056
利/nng,0.535285


In [48]:
ps = ps[ps[0].apply(lambda x: x > 1.3 or x < 0.76)]
ps

Unnamed: 0,0
'가계/nng,0.275854
'경기/nng,0.213615
'경상/nng,0.356412
'경제/nng,0.353738
'글로벌/nng,1.759334
...,...
年/nng,1.545856
連/nng,0.551367
龍/nng,0.435291
劉/nng,0.753047


In [49]:
ps.to_csv('polarity_score_new.csv', encoding = 'utf-8')

In [50]:
ps[ps[0]<0.76]

Unnamed: 0,0
'가계/nng,0.275854
'경기/nng,0.213615
'경상/nng,0.356412
'경제/nng,0.353738
'금리/nng,0.752934
...,...
兩難/nng,0.190143
連/nng,0.551367
龍/nng,0.435291
劉/nng,0.753047


In [51]:
ps[ps[0]>1.3]

Unnamed: 0,0
'글로벌/nng,1.759334
'금융기관/nng,1.714611
'금융시장/nng,2.132278
'대해/vv,2.213274
'미/nng,4.256260
...,...
魯/nng,3.159873
不/nng,1.344120
女/nng,1.624824
女心/nng,2.487814
