## 데이터 다운로드

In [28]:
# 단어를 숫자로 표현하는 방법

import pandas as pd
import requests

In [29]:
# Download the NSMC training split and load it as a DataFrame.
# The `label` column marks each review as positive or negative.

res = requests.get(
    'https://github.com/e9t/nsmc/raw/master/ratings_train.txt',
    timeout=30,  # do not hang forever if the server never answers
)
# Fail loudly on an HTTP error instead of saving an error page as the dataset.
res.raise_for_status()
with open('ratings_train.txt', 'wb') as f:
    f.write(res.content)

# The file is tab-separated.
nsmc = pd.read_csv('ratings_train.txt', sep='\t')

In [30]:
nsmc

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0
149996,8549745,평점이 너무 낮아서...,1
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


## 전처리

In [31]:
# 특수문자 제외, 순수한글로만 학습

import re # 정규 표현식

In [32]:
# Helper that extracts only the Hangul parts of a string.
def find_hangul(text):
    """Return every maximal run of Hangul characters in ``text``, in order.

    The pattern covers consonant jamo (ㄱ-ㅎ) and complete syllables (가-힣).
    Everything else -- whitespace, punctuation, digits, Latin letters, and
    standalone vowel jamo such as ㅜ -- lies outside both ranges and therefore
    splits or drops text.
    """
    return [match.group() for match in re.finditer(r'[ㄱ-ㅎ가-힣]+', text)]

In [33]:
nsmc.loc[0, 'document']

'아 더빙.. 진짜 짜증나네요 목소리'

In [34]:
# 함수에 태우기, 리스트로 한글만 찾아줌 
find_hangul(nsmc.loc[0, 'document'])

['아', '더빙', '진짜', '짜증나네요', '목소리']

In [35]:
# Keep only the rows whose review text is present, then extract the Hangul tokens of each review.
data = nsmc[nsmc['document'].notnull()]['document'].map(find_hangul)  # Series.map calls find_hangul once per review, so every element becomes a list of tokens

In [36]:
data[0]

['아', '더빙', '진짜', '짜증나네요', '목소리']

In [37]:
data[1]

['흠', '포스터보고', '초딩영화줄', '오버연기조차', '가볍지', '않구나']

In [38]:
data[2]

['너무재밓었다그래서보는것을추천한다']

In [40]:
def only_hangul(text):
    """Return ``text`` reduced to its Hangul runs, joined by single spaces."""
    hangul_runs = find_hangul(text)
    return ' '.join(hangul_runs)

In [41]:
data2=nsmc[nsmc['document'].notnull()]['document'].map(only_hangul)

In [44]:
data2[0]

'아 더빙 진짜 짜증나네요 목소리'

In [43]:
# Save the cleaned reviews as UTF-8 text, one review per line.
with open('nsmc.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(data2))

## FastText 모형학습
- 학습하지 않은 데이터도 처리됨

In [45]:
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText

In [46]:
# sg = 0 (default) : CBOW training; sg = 1 : skip-gram training
# alpha = 0.025 : initial learning rate
# min_alpha : value the learning rate is lowered to as training progresses
# window = 5 : how many words of context on each side are used for training
# min_count = 5 : a word must occur at least this many times to be learned
# vector_size = 100 (default)

model = FastText(vector_size=16) # represent each word as a 16-dimensional vector

In [48]:
model.build_vocab(corpus_iterable=data)
# Alternative: model.build_vocab(corpus_file='nsmc.txt')
# NOTE(review): the original note tied this alternative to "gensim 4.0 or lower" -- confirm which gensim versions it was meant for.

In [49]:
model.train(corpus_iterable=data, # gensim releases before 4.0 named this argument `sentences`
            epochs=5,
            total_examples=model.corpus_count,
            total_words=model.corpus_total_words
)


(3999445, 5829395)

## 저장

In [50]:
model.save('nsmc.fasttext')

## 임베딩

In [51]:
model = FastText.load('nsmc.fasttext')

In [52]:
model.wv.key_to_index

{'영화': 0,
 '너무': 1,
 '정말': 2,
 '진짜': 3,
 '이': 4,
 '그냥': 5,
 '왜': 6,
 '이런': 7,
 '더': 8,
 '점': 9,
 '수': 10,
 '영화를': 11,
 '다': 12,
 '잘': 13,
 '좀': 14,
 '보고': 15,
 'ㅋㅋ': 16,
 '그': 17,
 '영화가': 18,
 '영화는': 19,
 '본': 20,
 '봤는데': 21,
 '최고의': 22,
 '아': 23,
 '이건': 24,
 '내가': 25,
 '드라마': 26,
 '없는': 27,
 '없다': 28,
 '평점': 29,
 '완전': 30,
 '이렇게': 31,
 '참': 32,
 '이거': 33,
 '그리고': 34,
 '이게': 35,
 '좋은': 36,
 '있는': 37,
 '연기': 38,
 '내': 39,
 '평점이': 40,
 '보는': 41,
 '최고': 42,
 '다시': 43,
 '역시': 44,
 '스토리': 45,
 '쓰레기': 46,
 'ㅋ': 47,
 '난': 48,
 '많이': 49,
 '것': 50,
 '한': 51,
 'ㅋㅋㅋ': 52,
 '재밌게': 53,
 '없고': 54,
 '또': 55,
 '하는': 56,
 '아깝다': 57,
 '꼭': 58,
 '보면': 59,
 '마지막': 60,
 '가장': 61,
 '뭐': 62,
 '영화다': 63,
 '무슨': 64,
 '하지만': 65,
 '같은': 66,
 'ㅎㅎ': 67,
 '와': 68,
 '별로': 69,
 '작품': 70,
 '솔직히': 71,
 '끝까지': 72,
 '볼': 73,
 '넘': 74,
 '안': 75,
 '대한': 76,
 '만든': 77,
 '봐도': 78,
 '그래도': 79,
 '시간': 80,
 '같다': 81,
 '전혀': 82,
 '좋다': 83,
 '말이': 84,
 '지금': 85,
 '별': 86,
 '아주': 87,
 '근데': 88,
 '중': 89,
 '뭔가': 90,
 '영화의': 91,
 '하

In [54]:
'히어로' in model.wv.key_to_index # gensim 3.x equivalent: '히어로' in model.wv.vocab (the `vocab` attribute is gone in gensim 4)


True

In [55]:
model.wv['히어로'] # 히어로 데이터값

array([-0.3293748 ,  0.6472046 ,  0.4723161 ,  0.39297798,  0.8490663 ,
        0.23978268, -0.6051463 , -0.5427327 ,  0.22190781,  0.0204047 ,
        0.24968304, -0.9598252 ,  0.32943487, -0.3359595 , -0.05462562,
        0.30594778], dtype=float32)

In [56]:
len(model.wv['히어로'])

16

In [57]:
'슈퍼히어로' in model.wv.key_to_index

False

In [58]:
model.wv['슈퍼히어로'] # the word is not in the vocabulary (the check above returned False), yet FastText still returns a vector because it embeds the word through its character n-grams

array([-0.15016747,  0.263527  ,  0.21766675,  0.17075384,  0.33914503,
        0.11567994, -0.21819663, -0.22490896,  0.13793083, -0.00280931,
        0.14085056, -0.35491118,  0.05773573, -0.13819763,  0.0133127 ,
        0.17284584], dtype=float32)

## 유사도

In [59]:
model.wv.similarity('슈퍼히어로', '히어로') # compared by cosine similarity

0.98625344

In [60]:
model.wv.similarity('히어로', '평론가')

0.6886091

In [61]:
model.wv.most_similar('평론가')

[('기자', 0.9896008968353271),
 ('평론', 0.9891468286514282),
 ('점대나', 0.9890568852424622),
 ('점대면', 0.9886201024055481),
 ('평론가들', 0.9882262945175171),
 ('높은거지', 0.9875937700271606),
 ('높은거야', 0.9871346950531006),
 ('점이나', 0.986896276473999),
 ('점대야', 0.9866729974746704),
 ('점대지', 0.9858719706535339)]

## FastText를 이용한 감성분석

In [62]:
# 랜덤으로 처리되서 사람마다 다름

from gensim.models.fasttext import FastText

ft=FastText.load('nsmc.fasttext')

In [63]:
import pandas as pd

nsmc = pd.read_csv('ratings_train.txt', sep='\t')

In [64]:
df = nsmc[nsmc['document'].notnull()]

In [71]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for testing; the fixed seed pins the shuffle.
doc_train, doc_test, y_train, y_test = train_test_split(
    df['document'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [72]:
import re
def find_hangul(text):
    """Return every maximal run of Hangul characters in ``text``, in order.

    The pattern covers consonant jamo (ㄱ-ㅎ) and complete syllables (가-힣).
    Everything else -- whitespace, punctuation, digits, Latin letters, and
    standalone vowel jamo such as ㅜ -- lies outside both ranges and therefore
    splits or drops text.
    """
    return [match.group() for match in re.finditer(r'[ㄱ-ㅎ가-힣]+', text)]

In [73]:
import numpy as np
x_train = np.zeros((1000, 16)) # one 16-value row per document; a document may contain many words, but only 1000 documents are used for training

In [74]:
doc_train.shape

(119996,)

In [75]:
doc_train

31989            아 꿀잼ㅋ 친구랑 봤는데 너무 웃겼음 그리구 김우빈 잘생겼다..
63462                    개건의 졸작 스릴러? 스릴러라고 하게에도 민망하군
17518               장하나 한윤찬 제발 이어주세요 말도안되게 왜 설도현과ㅜㅜㅜ
123410                                   애로영화계의 개OOO
104181                               내용이나 그래픽자체가 허접함
                             ...                    
119882      꿈을 꾸는사람 꿈을 이룬사람 돌멩이도 꿈은 있잖아! 꿈과희망을 주는 영화
103696                                레니 할린.. 이게 뭐니?
131936                                    시라노; 연애조작단
146872                                    집중이 쉽지 않다.
121961    엔딩 장면이 좋고 소소한 일상과 고민과 연애가 공감을 주어 재밌게 보았어여~
Name: document, Length: 119996, dtype: object

In [78]:
# 리스트로 나옴 
# Turn each of the first 1000 training reviews into one feature row:
# the element-wise mean of the FastText vectors of its Hangul tokens.
for i, doc in enumerate(doc_train.iloc[:1000]):
    vs = [ft.wv[token] for token in find_hangul(doc) if token in ft.wv]
    if not vs:
        # Nothing to average: row i of x_train is left untouched and the
        # raw review is printed so it can be inspected.
        print(doc)
        continue
    # Averaging over axis 0 yields a single vector regardless of how many
    # tokens the review has.
    x_train[i] = np.mean(vs, axis=0)
    
# 단어가 몇개든 무조건 16개가 나옴    
    
#         print(np.mean(vs, axis=0))



[-3.2981365   3.0196953   2.6244533  -0.3488196   2.1278129   1.8665718
 -2.6568594  -2.7412467   1.3882834  -3.511541    1.2159642   0.98036665
 -5.9492397  -1.3442037   4.4297814   4.1257944 ]
[-0.3203672   0.24632618  0.0615074   0.15749888  0.5448598   0.13149723
 -0.25773293 -0.34238738  0.30069637 -0.00687785  0.15812866 -0.22116783
 -0.22734948 -0.04331387  0.18603292  0.29693204]
[-1.4419272   0.8685541   0.86114293  0.5468082   1.5151123   0.8536354
 -1.05376    -0.39173445  0.9371071  -0.03009911  0.6328651  -1.6345738
 -0.50358945 -0.11379372  0.26218924  0.7438256 ]
[-5.338679   -1.786885   -0.42853144  2.64749     1.9185654   2.2847397
 -0.10350875 -1.2488115   1.9153739   0.48836857 -0.4079568  -2.2096488
 -6.6149893  -0.93416107  1.5475878   2.286835  ]
[-0.36037397  0.26849246 -2.3162248   2.7494369   1.6005285   1.7212219
 -0.68339825 -4.1136646   1.2596407   4.0408034   1.9567571  -0.74329895
 -3.7082868  -0.08981058 -0.01953527  0.91317725]
[-0.35003307  0.4344094   

 -0.53315794  0.86478424 -1.2807059   0.8815816 ]
[-0.84609956  1.2360853  -0.40737662  0.9257306   1.1495348   0.18687306
 -0.5004404  -1.7165326   0.34282514  0.542633    0.8559033  -1.9693073
  0.3650412  -0.7861878  -0.5411583   1.0046242 ]
[-0.7340768   1.3014904   1.4242672   0.06023373  0.45143232  1.405874
 -0.66967624 -0.0179894   0.10259011 -0.43370277  1.0018674  -2.8802056
  1.0831724   0.6198007  -0.8999411  -0.65453243]
[-0.14545104  0.43235034  0.11561061  0.26687104  0.3622039   0.07203285
 -0.11694826 -0.52587366  0.0846026   0.18616912  0.3194998  -0.48318544
  0.08142564 -0.18412186 -0.15133213  0.3392036 ]
[-1.1587396   0.06557127 -0.30263606  1.576485    2.7967281   0.42888078
 -1.5856344  -1.1973213   0.15705442 -0.11332049 -0.33761188 -2.1242964
  0.12662122 -0.8532169  -0.45405635 -1.4706864 ]
[-0.37725168  0.3875668  -0.12849076  0.39860985  0.82957286  0.09388467
 -0.36672902 -0.6200269   0.37975317  0.06287616  0.18607996 -0.46127352
 -0.09409027 -0.26640958 

 -0.10906856 -0.09135447 -0.26118866 -0.09184572]
[-0.44902298 -0.05563342  0.84182245  1.3621248   2.879777    0.02388565
 -0.39587715 -1.1830546   1.1364619   1.082776    0.22490202 -1.2749728
 -0.49668318  0.30415627 -0.6637024  -0.35275045]
[-0.68186843  0.7360943  -0.9187493   1.6768069   2.8107436   0.45989555
 -1.5648293  -1.3686917   1.2680837   1.7786701  -0.06328268 -1.2857891
 -0.6516858  -0.9399097  -1.3000253  -0.45118946]
[-0.50415003  0.50409985  0.07332518  0.625376    1.083025    0.1938602
 -0.44225463 -0.75340915  0.4515568   0.2582619   0.4212385  -1.0510476
  0.10523111 -0.27547953 -0.10334941  0.16663337]
[-1.9301771  -0.42235526 -1.0646877   1.9764131   3.3722816  -1.2168921
 -5.4171333  -1.4867923  -0.37003237  2.0570452  -0.6214333  -4.5495005
  1.4930294  -0.314249   -1.1328648  -2.269029  ]
[-1.8677789  -0.73262304 -0.6598654   1.7203884   2.9059653  -0.32303518
 -2.0267756  -2.2789197   0.596216    0.94886094  0.25384036 -2.0677671
 -1.5785486  -0.48676062 -1

  0.2517518  -0.47533306 -0.22367501  0.2201345 ]
[-0.28370902  0.76216424  0.20441501  0.32414395  0.76615906  0.22112039
 -0.37906256 -0.8681382   0.33731708  0.18644965  0.49139497 -0.9516457
  0.08152144 -0.3025035  -0.06104279  0.52192444]
[-0.19919947  0.49967322  0.3987949   0.26709822  0.34489346  0.15463725
 -0.17595373 -0.393411    0.10184412  0.08850301  0.36921424 -0.57321846
  0.148064   -0.23706904 -0.09033536  0.43796244]
[-1.9109432  -0.9692945   2.1343637   3.2817144   1.5880516   0.80048794
 -1.23674     3.0130856  -2.2499504   1.5241287   2.3272777  -5.7637286
  4.387863   -2.4198353  -3.269946   -0.28205207]
[-0.8317674   4.674933    3.6913779   0.3279443   2.051477    0.21251604
  0.48544642 -0.39756837 -2.2598732  -2.6884933   0.45772907 -0.09155751
  0.81898445 -1.565303    0.37575778  3.6041305 ]
[ 0.64821255  4.5488286   5.3738413  -2.9954503   1.8954847  -0.5848423
 -0.9790635  -3.3533602   0.41870505  0.86564386 -7.001972   -6.6598854
 -0.9149661  -0.7063001 

 -0.15031067 -0.1370054  -0.01159928  0.5157281 ]
[-0.07571331  0.10605272  0.0626876   0.05759674  0.08523337  0.05216407
 -0.04423837 -0.12186389  0.02848823 -0.02544419  0.09793545 -0.09989167
 -0.0153425  -0.04268244 -0.02038741  0.08970453]
[-2.0154328   0.27402627  0.03607707  0.81241     0.80647904  0.73037314
  0.5154373  -1.6607277   1.1343368   0.0337675   1.0365884  -0.9599108
 -2.4681203  -0.03249687  0.39416373  1.1059288 ]
[-0.3587114   0.70053    -0.05946081  0.17758836 -0.04034948  0.21239558
  0.24524845 -0.63577384  0.03782627 -0.07235924  0.3147972  -0.3472165
  0.13847882 -0.18371779 -0.32036206  0.5875808 ]
[-1.0263741   1.9459571  -2.0641558   1.3376914   1.2142903   0.1329466
  0.19987158 -2.6650147   1.3990754   1.2465681  -0.0849231  -1.4368747
  0.2926006  -0.06356167 -0.30747268  2.0994086 ]
[-1.1151186   0.7648695  -0.00215631  0.6301865   1.008257    0.50513834
 -0.18992974 -1.0379044   0.7321892   0.48144796  0.45182756 -1.1401298
 -0.45892107  0.08696517 

 -0.20826939 -0.04024541  0.14396112  0.32341108]
[-0.43143755  1.6147565   1.6037579  -0.09113815  1.7983342   0.37024117
 -1.163067   -1.4791418   0.58020943  0.10093655 -0.9585334  -1.3189577
 -0.35530213 -0.6605422   0.29755706  0.81651604]
[-3.0305974   0.68933034  2.8032427   0.49534628  2.2747765   1.4748496
 -2.1417725   1.3530599   1.6067848  -1.4806565   0.07869322 -2.5065854
 -0.90000916  1.2558602   0.78662086 -0.03002081]
[-1.0555294   0.45091805  1.1220297   0.32860783  0.7999526   0.6514489
 -0.46936557 -0.26449746  0.4027714  -0.47052243  0.83311033 -1.0448096
 -0.4003985  -0.2907595  -0.1912621   0.44521686]
[-0.9289274   0.8494854   0.09527906  0.8985168   0.87115127  0.22752602
 -0.09662868 -0.8715113   0.50151247 -0.43438894  0.24250677 -1.0878922
  0.79725254 -0.5167411  -0.13104995  0.28568965]
[-0.5943556   0.4104886   0.4340172   0.26538685  0.6567327   0.31322813
 -0.514147   -0.14855342  0.38222098 -0.07028941  0.11439914 -0.7018983
 -0.13406661 -0.05114073  0

  2.4935925  -1.3996838  -1.6996782  -0.19731203]
[-0.41629168  0.47605607  0.3448856   0.25195745  0.4239251   0.3253314
 -0.43254226 -0.4691383   0.20860915 -0.06804335  0.48886338 -1.1144733
  0.27650505 -0.2891152  -0.18823664  0.2247568 ]
[-4.473984   -1.2426932   2.1470275  -0.05965114  1.13131     0.21512818
 -3.4821646   1.3800218   2.9245112  -1.6842885   0.94791454 -4.6241727
 -1.119054    0.17829317  1.0335908  -1.3301705 ]
[-0.3455184   0.33598143  0.3508473   0.17639008  0.397022    0.23027745
 -0.46232185 -0.20924696 -0.0220453  -0.16689208  0.61591274 -1.2554245
  0.5802363  -0.282852   -0.54131436 -0.21378192]
[-1.1337427   0.5595115   1.6362598   0.2651816   0.4650183   1.2457073
 -0.22668265 -0.56022334  0.14228912 -1.0863922   2.2300644  -2.8783033
  1.0352297  -0.5969904  -1.9000223  -0.65549755]
[-0.5333509   0.2911323   0.00251764  0.25199404  0.18405919  0.11515527
  0.01245767 -0.3147955  -0.07794899 -0.08840845  0.6532913  -0.7581231
  0.14066206 -0.2701703  -0

 -0.14425226 -0.12626614  0.00846231  0.19046819]
[-0.7591779   0.6765374   1.0546452   0.7073437   1.0266619   0.6305677
 -0.3265343  -0.54031074 -0.8449717  -0.8690326   1.196834   -1.4559423
  0.15097946 -0.8725558  -1.3641671   0.33306828]
[-0.15250047  0.35025015  0.21826418  0.08572859  0.26252237  0.14149393
 -0.12647235 -0.36884114  0.13729985 -0.04229662  0.21762216 -0.5730429
  0.10515466 -0.12012693 -0.14090748  0.2047774 ]
[-0.91343355  0.4789811   0.6468898   0.38650867  0.86204     0.3893492
 -0.7893119  -0.0736426   0.6254085  -0.05234176  0.41499373 -1.3959018
  0.03023717 -0.01998136 -0.02253931  0.25561616]
[-0.70180506  0.7209696   0.32352114  0.7354652   1.1386395   0.37212572
 -0.3544842  -1.2233839   0.413163    0.2947238   1.1142967  -1.444689
 -0.1193918  -0.13655281 -0.65219057  0.31390288]
[-7.221841   -1.9919595   0.6118196   0.5094126   0.718704    3.32822
 -2.7209811   2.5667052  -0.9878603  -1.1785092   3.2603474  -5.816411
 -4.441129    0.10823017  1.5967