# NLP Basic Assignment
## 과제 : spam.csv를 활용하여 유의미한 해석을 도출해주세요!

In [1]:
import pandas as pd

## Load Data
- 보시면 아시다시피 spam.csv는 라벨이 있는 데이터입니다.
- 7주차 주제가 텍스트 기초인만큼 텍스트만 활용하셔도 되고 라벨까지 활용하셔서 모델을 돌려보셔도 좋습니다.

In [2]:
spam = pd.read_csv('spam.csv')

In [3]:
spam.iloc[5]['v2']

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"

In [4]:
spam.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
spam.shape

(5572, 2)

In [6]:
spam.isnull().sum()

v1    0
v2    0
dtype: int64

## Tokenizing


In [7]:
import nltk

In [8]:
# 예시 코드 코드
from nltk.tokenize import word_tokenize

nltk.download('punkt')
word_tokenize(spam.iloc[5]['v2'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hites\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['FreeMsg',
 'Hey',
 'there',
 'darling',
 'it',
 "'s",
 'been',
 '3',
 'week',
 "'s",
 'now',
 'and',
 'no',
 'word',
 'back',
 '!',
 'I',
 "'d",
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'it',
 'still',
 '?',
 'Tb',
 'ok',
 '!',
 'XxX',
 'std',
 'chgs',
 'to',
 'send',
 ',',
 'å£1.50',
 'to',
 'rcv']

In [9]:
spam['v2_token'] = spam['v2'].apply(lambda x : word_tokenize(x))

In [10]:
spam.head()

Unnamed: 0,v1,v2,v2_token
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, until, jurong, point, ,, crazy, .., Avail..."
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, ..., Joking, wif, u, oni, ...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, so, early, hor, ..., U, c, alrea..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[Nah, I, do, n't, think, he, goes, to, usf, ,,..."


## Embedding

- 수업에서 다룬 임베딩 방법에는 One-hot encoding, CBOW, Skip-gram 등이 있었습니다. 다양한 시도와 '비교' 결과를 함께 적어주세요! 파라미터를 조정해가는 과정도 해석에 도움이 될 수 있겠죠 :)

In [11]:
# 정상 메일
ham_mail = spam.query('v1=="ham"')
# 스팸 메일
spam_mail = spam.query('v1=="spam"')

In [12]:
ham_mail

Unnamed: 0,v1,v2,v2_token
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, until, jurong, point, ,, crazy, .., Avail..."
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, ..., Joking, wif, u, oni, ...]"
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, so, early, hor, ..., U, c, alrea..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[Nah, I, do, n't, think, he, goes, to, usf, ,,..."
6,ham,Even my brother is not like to speak with me. ...,"[Even, my, brother, is, not, like, to, speak, ..."
...,...,...,...
5565,ham,Huh y lei...,"[Huh, y, lei, ...]"
5568,ham,Will Ì_ b going to esplanade fr home?,"[Will, Ì_, b, going, to, esplanade, fr, home, ?]"
5569,ham,"Pity, * was in mood for that. So...any other s...","[Pity, ,, *, was, in, mood, for, that, ., So, ..."
5570,ham,The guy did some bitching but I acted like i'd...,"[The, guy, did, some, bitching, but, I, acted,..."


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
count_vec = CountVectorizer(max_features = 1000, stop_words = 'english')

In [15]:
ham_count = count_vec.fit_transform(ham_mail['v2_token'].sum())

In [16]:
ham_df = pd.DataFrame(ham_count.sum(axis=0))

In [17]:
ham_df.columns = count_vec.get_feature_names()

In [18]:
ham_df = ham_df.T

In [19]:
ham_df = ham_df.reset_index()

In [20]:
ham_df.columns = ['word','count']

In [21]:
ham_df

Unnamed: 0,word,count
0,10,13
1,12,6
2,1st,12
3,2nd,10
4,2nite,6
...,...,...
995,yup,43
996,ì_,120
997,ìï,53
998,û_,15


In [22]:
count_vec = CountVectorizer(max_features = 1000, stop_words = 'english')

In [23]:
spam_count = count_vec.fit_transform(spam_mail['v2_token'].sum())

In [24]:
spam_df = pd.DataFrame(spam_count.sum(axis=0))

In [25]:
spam_df.columns = count_vec.get_feature_names()

In [26]:
spam_df = spam_df.T

In [27]:
spam_df = spam_df.reset_index()

In [28]:
spam_df.columns = ['word','count']

In [29]:
spam_df

Unnamed: 0,word,count
0,00,10
1,000,29
2,02,8
3,0207,3
4,03,13
...,...,...
995,yesterday,3
996,yo,3
997,yr,11
998,yrs,3


In [30]:
ham_df = ham_df.sort_values(by='count', ascending =False).reset_index(drop=True)

In [31]:
spam_df = spam_df.sort_values(by='count', ascending =False).reset_index(drop=True)

In [32]:
display(ham_df.head())
display(spam_df.head())

Unnamed: 0,word,count
0,gt,318
1,lt,316
2,just,293
3,ok,287
4,ll,265


Unnamed: 0,word,count
0,free,224
1,txt,163
2,ur,144
3,mobile,127
4,text,125


## 본인이 도출해낸 해석을 적어주세요!

- 유사도, Wordcloud, 이진 분류 모델, Plot 뭐든 상관없으니 분명하고 인상적인 해석을 적어주시면 됩니다.

In [35]:
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import numpy as np

### Ham

In [36]:
result_data = np.array(ham_mail['v2_token'].sum()).reshape(1,-1)
id2ham = corpora.Dictionary(result_data)

In [37]:
corpus_ham = [id2ham.doc2bow(text) for text in result_data]

In [38]:
corpus_ham

[[(0, 839),
  (1, 281),
  (2, 19),
  (3, 7),
  (4, 738),
  (5, 37),
  (6, 269),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 2),
  (14, 3),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 2),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 25),
  (23, 1),
  (24, 1),
  (25, 2),
  (26, 1),
  (27, 228),
  (28, 387),
  (29, 86),
  (30, 1),
  (31, 421),
  (32, 1),
  (33, 2),
  (34, 3),
  (35, 81),
  (36, 1),
  (37, 64),
  (38, 429),
  (39, 196),
  (40, 11),
  (41, 1),
  (42, 1),
  (43, 1500),
  (44, 1),
  (45, 172),
  (46, 9),
  (47, 1),
  (48, 4),
  (49, 1),
  (50, 1),
  (51, 4),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 3860),
  (61, 669),
  (62, 1124),
  (63, 51),
  (64, 9),
  (65, 1),
  (66, 2),
  (67, 2),
  (68, 3),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 2),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 2),
  (86, 9),
  (87, 1),
  (

In [39]:
TOPICS_W_NUM =20 # 출력할 토픽별 단어의 개수
save_lda_model=0
RANDOM_STATE = 2020
UPDATE_EVERY = 1
CHUNKSIZE = 100
PASSES = 10
ALPHA = 'auto'
PER_WORD_TOPICS = True
print('NUM_TOPICS', 'perplexity', 'coherence')
for i in range(1,10):
    NUM_TOPICS=i
 
  #해당 셀은 토픽모델링(LDA)에 대해 모델을 정의하는 셀입니다.
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_ham, id2word=id2ham, 
                                              num_topics=NUM_TOPICS, random_state=RANDOM_STATE, 
                                              update_every=UPDATE_EVERY, chunksize=CHUNKSIZE,
                                              passes=PASSES, alpha=ALPHA, per_word_topics=PER_WORD_TOPICS)

    # 토픽 출력
    doc_lda = lda_model[corpus_ham]


    # Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=result_data, dictionary=id2ham, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    print('T',NUM_TOPICS, lda_model.log_perplexity(corpus_ham), coherence_lda)

NUM_TOPICS perplexity coherence
T 1 -6.9542875888187785 0.20255198233349936
T 2 -6.9296130257683215 0.20310787465747304
T 3 -6.9266167969208565 0.20528039159182998
T 4 -6.935527326219968 0.2023723395699589
T 5 -6.9434416555784155 0.20318078111631074
T 6 -6.953687946649682 0.20285060423691684
T 7 -6.970718734434912 0.20248267564376116
T 8 -6.970993384607857 0.1993792685747656
T 9 -6.9798394328499045 0.20095198443982704


Perplexity와 Coherence 점수를 보고 좋은 Topic의 개수를 선정

In [41]:
# 토픽별 키워드 조회
NUM_TOPICS=5

for topic_id in range(NUM_TOPICS):
    topic_word_probs = lda_model.show_topic(topic_id, TOPICS_W_NUM)
    print("Topic ID: {}".format(topic_id))

    for topic_word, prob in topic_word_probs:
        print("\t{}\t{}".format(topic_word, prob))
    print("\n")

Topic ID: 0
	.	0.0009410875500179827
	I	0.000535059254616499
	you	0.0005026390426792204
	?	0.0004873204743489623
	,	0.0004273674276191741
	to	0.00039650703547522426
	and	0.0003890003135893494
	in	0.000360999460099265
	...	0.0003444430185481906
	i	0.0003408808261156082
	a	0.00031898138695396483
	&	0.0003157963219564408
	the	0.00029396312311291695
	for	0.00028716627275571227
	u	0.0002849106094799936
	:	0.00028018123703077435
	!	0.00027788602164946496
	is	0.00027532095555216074
	..	0.0002715383016038686
	it	0.0002648406371008605


Topic ID: 1
	.	0.002517964458093047
	I	0.0015514683909714222
	you	0.0012911520898342133
	to	0.0008777781622484326
	?	0.0008322678622789681
	,	0.0008081795531325042
	the	0.0007871208945289254
	and	0.000663395447190851
	;	0.0006555678555741906
	i	0.0006438016425818205
	me	0.0006433372036553919
	...	0.0006372539792209864
	a	0.0006132732960395515
	in	0.0005837975768372416
	..	0.0005771418800577521
	!	0.0005593243986368179
	u	0.0005584429018199444
	it	0.0005285165971

stopword를 제대로 제거하고 재 분석 필요

### Spam

In [42]:
result_data_spam = np.array(spam_mail['v2_token'].sum()).reshape(1,-1)
id2spam = corpora.Dictionary(result_data_spam)

In [43]:
corpus_spam = [id2spam.doc2bow(text) for text in result_data_spam]

In [45]:
TOPICS_W_NUM =20 # 출력할 토픽별 단어의 개수
save_lda_model=0
RANDOM_STATE = 2020
UPDATE_EVERY = 1
CHUNKSIZE = 100
PASSES = 10
ALPHA = 'auto'
PER_WORD_TOPICS = True
print('NUM_TOPICS', 'perplexity', 'coherence')
for i in range(1,10):
    NUM_TOPICS=i
 
  #해당 셀은 토픽모델링(LDA)에 대해 모델을 정의하는 셀입니다.
    lda_model_spam = gensim.models.ldamodel.LdaModel(corpus=corpus_spam, id2word=id2spam, 
                                              num_topics=NUM_TOPICS, random_state=RANDOM_STATE, 
                                              update_every=UPDATE_EVERY, chunksize=CHUNKSIZE,
                                              passes=PASSES, alpha=ALPHA, per_word_topics=PER_WORD_TOPICS)

    # 토픽 출력
    doc_lda_spam = lda_model_spam[corpus_spam]


    # Coherence Score
    coherence_model_lda_spam = CoherenceModel(model=lda_model_spam, texts=result_data_spam, dictionary=id2spam, coherence='c_v')
    coherence_lda_spam = coherence_model_lda_spam.get_coherence()

    print('T',NUM_TOPICS, lda_model_spam.log_perplexity(corpus_spam), coherence_lda_spam)

NUM_TOPICS perplexity coherence
T 1 -6.9543381198464695 0.21176875101937548
T 2 -6.9466305267676365 0.19865207353681807
T 3 -6.9651741364948005 0.20517837389419724
T 4 -6.9831721849398045 0.20615321001138967
T 5 -7.001453058846082 0.20531733719266115
T 6 -7.019337903969803 0.20648911491437016
T 7 -7.034979988525161 0.20581227666432259
T 8 -7.050709301512934 0.20667035403722234
T 9 -7.064538276964722 0.20770034914441346


1개일 때 일관성이 높긴 하지만 1개로 분류할 수는 없으므로 제외하고 생각  
Spam 메일은 토픽이 많아질수록 일관성이 높아지고 perplexity가 좋아지는 것을 알 수 있다. 이러한 특징을 사용하면 spam 메일을 잘 분류해 낼 수 있을듯

In [46]:
# 토픽별 키워드 조회
NUM_TOPICS=9

for topic_id in range(NUM_TOPICS):
    topic_word_probs_spam = lda_model_spam.show_topic(topic_id, TOPICS_W_NUM)
    print("Topic ID: {}".format(topic_id))

    for topic_word, prob in topic_word_probs_spam:
        print("\t{}\t{}".format(topic_word, prob))
    print("\n")

Topic ID: 0
	.	0.001197473960928619
	to	0.0008268457022495568
	a	0.0007765119080431759
	!	0.000699689902830869
	,	0.0006819021073170006
	call	0.0005265895742923021
	?	0.0005014721537008882
	&	0.0004968015127815306
	the	0.0004902182263322175
	your	0.0004770945815835148
	you	0.00046664191177114844
	4	0.00045870381291024387
	for	0.0004484019009396434
	:	0.0004470675194170326
	or	0.00044654347584582865
	2	0.00044408891699276865
	on	0.0004342120955698192
	Call	0.0004279307322576642
	is	0.0004235528758727014
	FREE	0.00041884180973283947


Topic ID: 1
	.	0.0029923743568360806
	!	0.0018205606611445546
	to	0.0017138643888756633
	a	0.0013338455464690924
	,	0.0012016543187201023
	you	0.000817951513454318
	for	0.0007939732167869806
	2	0.0007702006259933114
	?	0.000746460456866771
	or	0.0007203758577816188
	the	0.0007047388353385031
	call	0.0006807244499213994
	&	0.0006715873023495078
	have	0.0006624280358664691
	:	0.000625104526989162
	is	0.000616928213275969
	Call	0.0005991712096147239
	now	0.000

* Perplexity(혼란도) : 확률 모델이 결과를 얼마나 정확하게 예측하는지.낮을수록 정확하게 예측
* Coherence Score : 토픽이 얼마나 의미론적으로 일관성 있는지, 높을수록 의미론적 일관성 높음

물론 여기도 stopword를 제거하고 재분석이 필요해보인다.