In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

### 경고창 무시

In [2]:
import warnings

warnings.filterwarnings(action='ignore')

### corpus 지정

In [3]:
# Toy corpus: three short English sentences, one document per list entry.
# The last sentence intentionally keeps its trailing space.
corpus = ['you know I want your love', 'I like you', 'what should I do ']

### 분석 모델 정의

In [4]:
# One vectorizer per weighting scheme, both with default settings:
# raw term counts (cv) and TF-IDF weights (tfidf).
cv, tfidf = CountVectorizer(), TfidfVectorizer()

### DTM (Document-Term Matrix) - CountVectorizer()

In [5]:
# Fit the count vectorizer on the corpus and show the dense
# document-term matrix (one row per document, one column per term).
count_dtm = cv.fit_transform(corpus)
print(count_dtm.toarray())

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]


### 사전의 인덱스와 이름을 확인 가능 - CountVectorizer()

In [6]:
# Show the fitted vocabulary: a dict mapping each term to its column index in the DTM.
print(cv.vocabulary_)

{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


### DTM (Document-Term Matrix) - TfidfVectorizer()

In [7]:
# Fit the TF-IDF vectorizer on the corpus and show the dense
# document-term matrix of TF-IDF weights.
tfidf_dtm = tfidf.fit_transform(corpus)
print(tfidf_dtm.toarray())

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]


### 사전의 인덱스와 이름을 확인 가능 - TfidfVectorizer()

In [8]:
# Show the TF-IDF vectorizer's fitted vocabulary (term -> column index).
print(tfidf.vocabulary_)

{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


### transform 결과를 역으로 추출

In [9]:
# Map each document's row of counts back to the vocabulary terms it contains.
# `cv` was already fitted on `corpus` in an earlier cell, so transform() is
# enough; refitting here would needlessly reset the vectorizer's state.
# inverse_transform() accepts the sparse matrix directly, so no dense copy is made.
cv.inverse_transform(cv.transform(corpus))

[array(['know', 'love', 'want', 'you', 'your'], dtype='<U6'),
 array(['like', 'you'], dtype='<U6'),
 array(['do', 'should', 'what'], dtype='<U6')]

In [10]:
# List the vocabulary terms in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns an ndarray, so convert it
# back to a plain list of str to keep the displayed result unchanged.
cv.get_feature_names_out().tolist()

['do', 'know', 'like', 'love', 'should', 'want', 'what', 'you', 'your']

In [11]:
# Print each vocabulary term next to its column index in the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and prints identically here.
for i, value in enumerate(cv.get_feature_names_out()):
    print(i, ': ', value)

0 :  do
1 :  know
2 :  like
3 :  love
4 :  should
5 :  want
6 :  what
7 :  you
8 :  your


### 새로운 문장을 넣어서 어떤 단어들이 들어갔는지 판단해보자!

In [12]:
# A new, unseen document to vectorize with the already-fitted vocabulary.
# Vectorizers take an iterable of documents; use a list (not a set literal)
# so document order is well defined and duplicates are not silently dropped.
sentence = ['i like like smile want']

In [13]:
# Count the known vocabulary terms in the new sentence; words that are not
# in the fitted vocabulary (e.g. 'smile') contribute nothing.
sentence_counts = cv.transform(sentence)
sentence_counts.toarray()

array([[0, 0, 2, 0, 0, 1, 0, 0, 0]], dtype=int64)

### 단어 사전 생성

In [14]:
# Keyword lists used as tiny "dictionaries" for spam and ham (non-spam) mail.
# NOTE(review): 'hu' looks like a truncated or mistyped keyword — confirm.
spam_dict = [
    'advertise', 'promotion', 'sales', 'hu', 'special',
    'sale', 'member', 'news', 'buy', 'big',
]
# NOTE(review): 'genetal' is presumably a typo for 'general' — confirm before changing,
# since the recorded outputs below include it.
ham_dict = [
    'order', 'confirm', 'agree', 'check', 'customer',
    'payment', 'send', 'genetal', 'company', 'tour',
]

In [15]:
# Separate count / TF-IDF vectorizers for each keyword list:
# cv1 / tfidf1 are fitted on the spam keywords, cv2 / tfidf2 on the ham keywords.
cv1, tfidf1 = CountVectorizer(), TfidfVectorizer()
cv2, tfidf2 = CountVectorizer(), TfidfVectorizer()

In [16]:
# Build the spam vocabulary: fit both vectorizers on the spam keyword list.
# The last expression stays bare so the cell displays the fitted TfidfVectorizer.
cv1.fit(spam_dict)
tfidf1.fit(spam_dict)

TfidfVectorizer()

In [17]:
# Build the ham vocabulary: fit both vectorizers on the ham keyword list.
# The last expression stays bare so the cell displays the fitted TfidfVectorizer.
cv2.fit(ham_dict)
tfidf2.fit(ham_dict)

TfidfVectorizer()

In [18]:
# Inspect the spam vocabulary (term -> column index).
cv1.vocabulary_

{'advertise': 0,
 'promotion': 6,
 'sales': 8,
 'hu': 3,
 'special': 9,
 'sale': 7,
 'member': 4,
 'news': 5,
 'buy': 2,
 'big': 1}

In [19]:
# Inspect the ham vocabulary (term -> column index).
cv2.vocabulary_

{'order': 6,
 'confirm': 3,
 'agree': 0,
 'check': 1,
 'customer': 4,
 'payment': 7,
 'send': 8,
 'genetal': 5,
 'company': 2,
 'tour': 9}

In [20]:
# A single example email to classify, wrapped in a list because the
# vectorizers expect an iterable of documents.
# NOTE(review): 'promation' is misspelled, so it does not match the spam
# keyword 'promotion' — confirm whether that is deliberate.
email = [
    'promation!! hu good sales sale check payment',
]

In [21]:
# Count occurrences of each spam keyword in the email.
spam_count_matrix = cv1.transform(email)
result1 = spam_count_matrix.toarray()
result1  # 2-D array: one row per document

array([[0, 0, 0, 1, 0, 0, 0, 1, 1, 0]], dtype=int64)

In [22]:
# Count occurrences of each ham keyword in the email.
ham_count_matrix = cv2.transform(email)
result2 = ham_count_matrix.toarray()
result2

array([[0, 1, 0, 0, 0, 0, 0, 1, 0, 0]], dtype=int64)

In [23]:
# Vocabulary terms of each keyword vectorizer, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. It returns an ndarray, so .tolist()
# keeps these variables as plain lists of str, as before.
spam_feature_list = cv1.get_feature_names_out().tolist()
ham_feature_list = cv2.get_feature_names_out().tolist()

In [24]:
# Show both vocabularies, spam first, one list per line.
print(spam_feature_list, ham_feature_list, sep='\n')

['advertise', 'big', 'buy', 'hu', 'member', 'news', 'promotion', 'sale', 'sales', 'special']
['agree', 'check', 'company', 'confirm', 'customer', 'genetal', 'order', 'payment', 'send', 'tour']


In [25]:
# First (and only) row of the spam count matrix: the counts for the single email.
result1[0]

array([0, 0, 0, 1, 0, 0, 0, 1, 1, 0], dtype=int64)

In [26]:
# Total number of spam-keyword occurrences found in the email.
spam_words_count = result1[0].sum()
spam_words_count

3

In [27]:
# Total number of ham-keyword occurrences found in the email.
ham_words_count = result2[0].sum()
ham_words_count

2

In [28]:
# Label the email as spam only when it has strictly more spam-keyword hits
# than ham-keyword hits; a tie is reported as ham.
count_verdict = 'spam mail!!' if spam_words_count > ham_words_count else 'ham mail!!'
print(count_verdict)

spam mail!!


In [29]:
# TF-IDF weights of the spam keywords in the email.
spam_tfidf_matrix = tfidf1.transform(email)
result11 = spam_tfidf_matrix.toarray()
result11

array([[0.        , 0.        , 0.        , 0.57735027, 0.        ,
        0.        , 0.        , 0.57735027, 0.57735027, 0.        ]])

In [30]:
# TF-IDF weights of the ham keywords in the email.
ham_tfidf_matrix = tfidf2.transform(email)
result22 = ham_tfidf_matrix.toarray()
result22

array([[0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ]])

In [31]:
# Sum of the email's TF-IDF weights over the spam keywords
# (a weight total, not a word count, despite the variable name).
spam_tfidf_row = result11[0]
spam_words_count2 = sum(spam_tfidf_row)
spam_words_count2

1.7320508075688772

In [32]:
# Sum of the email's TF-IDF weights over the ham keywords
# (a weight total, not a word count, despite the variable name).
ham_tfidf_row = result22[0]
ham_words_count2 = sum(ham_tfidf_row)
ham_words_count2

1.414213562373095

In [33]:
# Same decision rule as the count-based check, applied to the TF-IDF totals:
# spam only when the spam total is strictly larger; a tie is reported as ham.
tfidf_verdict = 'spam mail!!' if spam_words_count2 > ham_words_count2 else 'ham mail!!'
print(tfidf_verdict)

spam mail!!
