In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
# NOTE: `dirname` and `filename` are ordinary globals, so after the loop they stay bound to
# the last directory / file visited and remain visible to every later cell.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reference
* [Fares Sayah - Natural Language Processing (NLP) 🧾 for Beginners](https://www.kaggle.com/faressayah/natural-language-processing-nlp-for-beginners)
* [Ravi Chaubey - Natural Language Processing with Python](https://www.kaggle.com/ravichaubey1506/natural-language-processing-with-python)
* [Madz2000 - Simple EDA with Data Cleaning & GloVe(98%Accuracy)](https://www.kaggle.com/madz2000/simple-eda-with-data-cleaning-glove-98-accuracy)
* [adityapatil - Spam detector using NLP and Random Forest](https://www.kaggle.com/adityapatil673/spam-detector-using-nlp-and-random-forest)
* [Shekhar - Spam Detection using NLP and Random Forest](https://www.kaggle.com/shekhart47/spam-detection-using-nlp-and-random-forest)

# Dataset overview    
* 探索式資料分析 (EDA)
    * EDA (Exploratory Data Analysis) uses **visualization** and **basic statistics** to get an overview of the data we have, so that we can then carry out more complicated and thorough analysis on it.
    * EDA should let us be able to achieve the following three main things:
        1. To Know the Data - what information does the data provide, the structure of the data, etc.
        2. Check the Data - whether there are any outliers or unusual values.
        3. Correlation between Data - find out important variables.
    * We can also check whether the data meets our assumptions about it and uncover latent errors before actually building the model, so that we can make adjustments for the further analysis.

In [None]:
# Reading a text-based dataset into pandas
def readData_rawSMS(filepath):
    """Load the SMS CSV at `filepath` and return its first two columns as a DataFrame.

    The first row of the file is consumed as the header, only the columns at
    positions 0 and 1 are read, and the file is decoded as latin-1. The two
    columns are then renamed to 'label' and 'content'.
    """
    frame = pd.read_csv(
        filepath,
        encoding='latin-1',
        header=0,
        usecols=[0, 1],
    )
    frame.columns = ['label', 'content']
    return frame

# Locate the dataset explicitly instead of reusing `dirname` / `filename`: those globals are
# merely whatever the os.walk listing loop visited last, so the load silently picks the wrong
# file when /kaggle/input holds more than one file or when cells are run out of order.
DATA_FILENAME = 'spam.csv'
INPUT_ROOT = '/kaggle/input'

data_path = next(
    (os.path.join(root, DATA_FILENAME)
     for root, _, names in os.walk(INPUT_ROOT)
     if DATA_FILENAME in names),
    None,
)
if data_path is None:
    raise FileNotFoundError('{} not found under {}'.format(DATA_FILENAME, INPUT_ROOT))

data_rawSMS = readData_rawSMS(data_path)
data_rawSMS.head()

# 垃圾訊息(spam) / 有效訊息(ham)

* 定義 `readData_rawSMS` function
    * header
        * `0`: 第一列(橫)為欄位名稱
            > 即 v1, v2
        * `1`: 第二列(橫)為欄位名稱
            > 即 ham, Go until jurong point, crazy.. Available only ...
        * `None`: 本資料(spam.csv)沒有欄位名稱
    * usecols
        * `[0,1]`: 僅使用第一行(直)和第二行(直)的資料，其他行(直)略過不讀取且不使用。
    * data_rawSMS.columns = ['label', 'content']
        > 重新命名欄位名稱：由 `v1, v2` 改為 `label, content`

In [None]:
# Generate descriptive statistics for the raw frame
# (at this point it holds the 'label' and 'content' columns set by readData_rawSMS).
data_rawSMS.describe()

In [None]:
# Group the rows by their 'label' value and compute the descriptive statistics
# separately for each group.
data_rawSMS.groupby('label').describe()

In [None]:
# Add a `length` column holding the number of characters in each message.
content_lengths = data_rawSMS['content'].map(len)
data_rawSMS['length'] = content_lengths
data_rawSMS.head()

* `data_rawSMS['length'] = data_rawSMS['content'].apply(len)`
    * 使用 len 函數 計算 `data_rawSMS` DataFrame 的 `content`，並將結果給予另增的欄位 `length`。
* `DataFrame.apply(func, axis=0)`
    * Apply a function along an axis of the DataFrame.
    * Example: `df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])`
        ```
           A  B
        0  4  9
        1  4  9
        2  4  9
        ```
        * df.apply(np.sum, axis=0)
            ```
            A    12
            B    27
            dtype: int64
            ```
        * df.apply(np.sum, axis=1)
            ```
            0    13
            1    13
            2    13
            dtype: int64
            ```

In [None]:
# Per-label descriptive statistics again; the frame now also carries the `length` column.
data_rawSMS.groupby('label').describe()

# Data Visualization

In [None]:
# Boolean-mask selection: keep only the rows whose label equals 'ham'.
data_rawSMS[data_rawSMS.label=='ham']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("ggplot")

plt.figure(figsize=(6, 4))

data_rawSMS[data_rawSMS.label=='ham'].length.plot(
    bins=35, kind='hist', color='blue', 
    label='Ham messages', alpha=0.6)
data_rawSMS[data_rawSMS.label=='spam'].length.plot(
    kind='hist', color='red', 
    label='Spam messages', alpha=0.6)

plt.legend()
plt.xlabel("Message Length")
#Through just basic EDA we've been able to discover a trend that `spam messages` tend to have `more characters`.

```
DataFrame.plot(x=None, y=None, kind='line', ax=None, subplots=False, sharex=None, sharey=False, layout=None, figsize=None, 
    use_index=True, title=None, grid=None, legend=True, style=None, logx=False, logy=False, loglog=False, 
    xticks=None, yticks=None, xlim=None, ylim=None, rot=None, fontsize=None, colormap=None, table=False, 
    yerr=None, xerr=None, secondary_y=False, sort_columns=False, **kwds)
```
* bins: int or sequence or str, default: rcParams["hist.bins"] (default: 10)
    * 把分佈(長條)切成 N 等分，N 預設為 10。
        > 即長條總共有 10 個。
* legend: Place legend on axis subplots
    * 放置圖例

In [None]:
# Descriptive statistics of the message-length column alone.
data_rawSMS.length.describe()

In [None]:
# Descriptive statistics restricted to the rows labelled 'ham'.
data_rawSMS[data_rawSMS.label=='ham'].describe()

In [None]:
# Rows whose message is exactly 910 characters long.
# NOTE(review): 910 is presumably the maximum length reported by describe() above — confirm,
# and consider data_rawSMS['length'].max() instead of the hard-coded value.
data_rawSMS[data_rawSMS['length'] == 910]

In [None]:
# Full text of the first message that is exactly 910 characters long
# (.iloc[0] picks the first matching row by position).
data_rawSMS[data_rawSMS['length'] == 910]['content'].iloc[0]
# data_rawSMS[data_rawSMS.length == 910].content.iloc[0]     #same result

* https://ithelp.ithome.com.tw/articles/10194006
* `pandas.DataFrame.iloc`
    * Purely integer-location based indexing for selection by position. (用 index 位置來取我們要的資料)
* `pandas.DataFrame.loc`
    * Access a group of rows and columns by label(s) or a boolean array. (用 標籤 來取出資料)

# Text Pre-processing
* The simplest is the `詞袋模型 (Bag-of-words Model, BoW)` approach, where each unique word in a text will be represented by one number.
    ![](https://miro.medium.com/max/875/1*ujkZ3JrQ6ubSuEpepHE4Aw.png)
    > [NLP 入門 (1) — Text Classification (Sentiment Analysis) — 極簡易情感分類器 Bag of words + Naive Bayes](https://sfhsu29.medium.com/nlp-%E5%85%A5%E9%96%80-1-text-classification-sentiment-analysis-%E6%A5%B5%E7%B0%A1%E6%98%93%E6%83%85%E6%84%9F%E5%88%86%E9%A1%9E%E5%99%A8-bag-of-words-naive-bayes-e40d61de9a7f)
    
    * 如何利用 bag-of-words 方法將文字轉換成數字？
        > [NLP的基本執行步驟(II) — Bag of Words 詞袋語言模型](https://medium.com/@derekliao_62575/nlp%E7%9A%84%E5%9F%BA%E6%9C%AC%E5%9F%B7%E8%A1%8C%E6%AD%A5%E9%A9%9F-ii-bag-of-words-%E8%A9%9E%E8%A2%8B%E8%AA%9E%E8%A8%80%E6%A8%A1%E5%9E%8B-3b670a0c7009)
        * Example: 
            * a. 看到他我就不爽。(先斷詞) => 看到/他/我/就/不爽
            * b. 看到他我就火大。(先斷詞) => 看到/他/我/就/火大
        * 這個語料庫的詞袋長這樣：(就 ,他 , 看到, 我 , 火大, 不爽)
            * Notice! 
                * 詞袋裡的詞「沒有絕對的順序關係」，此為隨機排法。
        * 接著，如果一個詞在句子中有出現，我們就幫他做一個記號，如下：
            * a. 表示成：[1, 1, 1, 1, 0, 1] (a 句沒有「火大」，標示為 0)
            * b. 表示成：[1, 1, 1, 1, 1, 0] (b 句沒有「不爽」，標示為 0)
        * 像這樣用 1 跟 0 來表示句子中詞語有沒有出現的方式，它有個酷炫名字：『獨熱編碼 (One-hot encoding)』
            * 利用獨熱編碼的方式，一個句子可以被轉換成一個向量的形式表達(向量 "vector" 就是一列數字而已)，就可以達成簡單的文字轉換成數字。
    * BoW 的衍生模型：
        * TF-IDF
            * 獲取一個能代表一個詞在文件中重要程度的數值。
        * CBoW (Continuous Bag of Words, 連續詞袋模型)
            * 這個模型是一個淺層的類神經網路。
            * 相較於傳統詞袋模型，CBoW 的輸入一樣是獨熱的形式，但不同的點是，CBoW 模型會將一開始每個詞都透過中間的隱藏層作轉換，讓每個詞的詞向量中不再只包含0與1，而是有意義的數值。
                * 中間的隱藏層轉換是怎麼進行的？
                    * CBoW 會同時參考**一個詞前後的語境**來決定那個詞所代表的詞向量是什麼。
                        > Ex:「不爽」和「火大」前面所接的詞都是「看到」、「他」、「我」、「就」
                        > 因此模型就會判斷：「不爽」和「火大」可能表現出相似的語意和句法結構，這個兩個詞就會被賦予非常接近的詞向量。
        * Word2vec
            * 由「連續詞袋模型 CBoW」和「跳字模型 skip-gram」構成 word2vec 模型。
            * 由 Google 的 Mikolov 等人在 2013 年提出。

In [None]:
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be installed — confirm.
from nltk.corpus import stopwords

# Show the English stopword list (text_process removes these words).
print(stopwords.words('english'))

import string

# Show the punctuation characters (text_process strips these characters).
print(string.punctuation)

In [None]:
# Informal SMS tokens filtered out in addition to NLTK's English stopword list.
_EXTRA_STOPWORDS = ('u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure')
_STOPWORD_CACHE = {}


def _stopword_set():
    """Return the combined stopword set, building it once instead of on every call."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english')).union(_EXTRA_STOPWORDS)
    return _STOPWORD_CACHE['english']


def text_process(mess):
    """
    Clean one message for bag-of-words vectorization.

    Steps:
    1. Remove every punctuation character listed in string.punctuation
    2. Split on whitespace and lower-case each word
    3. Drop every word that is in the stopword set

    Parameters
    ----------
    mess : str
        The raw message text.

    Returns
    -------
    str
        The remaining lower-cased words joined by single spaces
        (a string, not a list).
    """
    # Delete all punctuation characters in a single pass.
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))

    # Set membership is O(1); the previous list was rebuilt from the corpus on every call
    # and scanned linearly for every word.
    stop_set = _stopword_set()
    lowered = (word.lower() for word in nopunc.split())
    return ' '.join(word for word in lowered if word not in stop_set)

# Store the cleaned version of every message in a new `clean_msg` column.
cleaned_messages = data_rawSMS['content'].map(text_process)
data_rawSMS['clean_msg'] = cleaned_messages
data_rawSMS.head()

In [None]:
# Unigram Analysis
from collections import Counter

def get_words(content):
    """Flatten an iterable of text rows into one list of whitespace-separated tokens.

    Tokens keep the order in which they appear, row by row.
    """
    return [token.strip() for row in content for token in row.split()]

# Count every token of the cleaned messages, then keep the 20 most frequent as a
# {word: count} dict. `most_common` is read again by the bar-plot cell that follows.
counter = Counter(get_words(data_rawSMS['clean_msg']))
most_common = dict(counter.most_common(20))
print(most_common)


In [None]:
# Unigram Analysis

import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal bar chart of the 20 most frequent words; axes are labelled so the
# figure can be read on its own when the notebook is skimmed.
ax = sns.barplot(x=list(most_common.values()), y=list(most_common.keys()))
ax.set(xlabel='Occurrences', ylabel='Word', title='Top 20 words in cleaned messages')
plt.show()

In [None]:
from collections import Counter

# Tokenise each cleaned ham message on whitespace -> one list of words per message.
is_ham = data_rawSMS.label == 'ham'
words = data_rawSMS.loc[is_ham, 'clean_msg'].apply(str.split)

# Tally every token across all ham messages, in message order.
ham_words = Counter(token for tokens in words for token in tokens)

print(ham_words.most_common(50))

In [None]:
from collections import Counter

# Tokenise each cleaned spam message on whitespace -> one list of words per message.
is_spam = data_rawSMS.label == 'spam'
words = data_rawSMS.loc[is_spam, 'clean_msg'].apply(str.split)

# Tally every token across all spam messages, in message order.
spam_words = Counter(token for tokens in words for token in tokens)

print(spam_words.most_common(50))

# [Example] Vectorization
* We'll do that in 3 steps using the `bag-of-words model`:
  1. Count how many times a word occurs in each message (**term frequency**)
  2. Weigh the counts, so that frequent tokens get lower weight (**inverse document frequency**)
  3. Normalize the vectors to unit length, to abstract from the original text length (**L2 norm**)

In [None]:
# Use `CountVectorizer` to "convert text into a matrix of token counts"
from sklearn.feature_extraction.text import CountVectorizer


# The raw data, a sequence of symbols cannot be fed directly to the machine learning algorithms.
# They require numerical feature vectors with a fixed size.
simple_train = ['call you tonight call', 'Call me a cab', 'Please call me... PLEASE!']

# 1. instantiate CountVectorizer (with the default parameters)
vect = CountVectorizer()

# 2. learn the 'vocabulary' of the training data (occurs in-place)
vect.fit(simple_train)

# 3. examine the fitted vocabulary
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
# .tolist() keeps the printed output a plain Python list.
print(vect.get_feature_names_out().tolist())

In [None]:
# 4. transform training data into a 'document-term matrix'
simple_train_dtm = vect.transform(simple_train)
print(simple_train_dtm)

# How to read the sparse printout for document 0:
#
#    0       1      2       3        4         5
# ['cab', 'call', 'me', 'please', 'tonight', 'you']   <- fitted vocabulary
#
# -> 'call you tonight call'
# ->    1    5     4      1
#
# (docID, wordID)  word count
#   (0, 1)  2        <- 'call' occurs twice in document 0
#   (0, 4)  1
#   (0, 5)  1
#
# index  0 1 2 3 4 5
# ====> [0 2 0 0 1 1]
print()
print(simple_train_dtm.toarray())

# 5. examine the vocabulary and document-term matrix together
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())

In [None]:
# 6. example text for model testing
simple_test = ["please don't call me"]

# 7. transform testing data into a document-term matrix (using existing vocabulary)
simple_test_dtm = vect.transform(simple_test)

# Words that are not in the fitted vocabulary are ignored:
#
#    0       1      2       3        4         5
# ['cab', 'call', 'me', 'please', 'tonight', 'you']   <- fitted vocabulary
#
# -> "please don't call me"
# ->     3    x     1   2
# vector -> [[0, 1, 1, 1, 0, 0]]

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names_out())

# Machine Learning workflow with Vectorization
## 1. Divided into training set and testing set

In [None]:
# Encode the text label as a number: ham -> 0, spam -> 1.
label_to_num = {'ham': 0, 'spam': 1}
data_rawSMS['label_num'] = data_rawSMS['label'].map(label_to_num)
data_rawSMS.head()

In [None]:
# Features: the cleaned message text. Target: the numeric label.
X, y = data_rawSMS['clean_msg'], data_rawSMS['label_num']
for part in (X, y):
    print(part.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Keep 80% of the messages for training; random_state fixes the shuffle.
split_parts = train_test_split(X, y, train_size=0.8, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Print the size and content of each part, with a separator line between the two sets.
report_sections = [
    ('【Training set】', 'Row_count: {}\n\nData content:\n{}\n', X_train, y_train),
    ('【Testing set】', 'Row_count: {}\n\nData:\n{}\n', X_test, y_test),
]
label_template = 'Row_count: {}\n\nData label:\n{}\n'
for position, (title, feature_template, features, labels) in enumerate(report_sections):
    if position:
        print('------------')
    print(title)
    print(feature_template.format(features.shape, features))
    print(label_template.format(labels.shape, labels))

* `train_test_split()`
    1. `random_state`
        * This ensures that if I have to rerun my code, I’ll get the exact same train-test split, so my results won’t change.
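    2. `stratify=y` (note: the split in the cell above does **not** pass this argument; it is only used in the Pipeline cell at the end of this notebook)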
        * This tells train_test_split to make sure that the training and test datasets contain examples of each class **in the same proportions as in the original dataset**. 
        * 由於 classes 的不平衡性(imbalanced)，因此特別重要！
        * 若完全隨機地拆成 Train 和 Test，容易造成某個小類別在 Test 有、但在 Train 沒有的情況發生，使得 model 無法辨識那個小類別(因為 Train 沒有，所以沒辦法學習)！
            > A random split could easily end up with all examples of the smallest class in the test set and none in the training set, and then the model would be unable to identify that class.

## 2. Vectorization (+ Feature Engineering)
### Training data
* CountVectorizer
* TfidfTransformer

In [None]:
### Method1 ###
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()

# learn the training-data vocabulary ...
vect.fit(X_train)
# print(len(vect.get_feature_names_out()))
# print(vect.get_feature_names_out())


# ... then use it to create a document-term matrix
X_train_dtm = vect.transform(X_train)

# word's TF (raw term counts, one row per training message)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names_out())

In [None]:
### Method2 ###
from sklearn.feature_extraction.text import CountVectorizer

# Fresh vectorizer with default parameters (replaces the `vect` fitted in Method1).
vect = CountVectorizer()

# equivalently: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)

In [None]:
# examine the vocabulary and document-term matrix together
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names_out())

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the IDF weights on the training counts, then display the TF-IDF matrix of the training set.
tfidf_transformer = TfidfTransformer().fit(X_train_dtm)
tfidf_transformer.transform(X_train_dtm)

In [None]:
# Show floats with 10 decimal places from here on.
pd.options.display.float_format = "{:,.10f}".format

# word's TF-IDF weight
TFIDF = tfidf_transformer.transform(X_train_dtm).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
pd.DataFrame(TFIDF, columns=vect.get_feature_names_out())

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
pd.DataFrame(tfidf_transformer.transform(X_train_dtm).toarray(), columns=vect.get_feature_names_out()) #.iloc[4170]     #TF-IDF (L2 norm)

## TF-IDF Explain

In [None]:
# Term-count table of the training set, filtered to the messages that contain 'ìï'.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
df = pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names_out()) #['ìï'].iloc[4170]
df.loc[df['ìï'] != 0]

In [None]:
# Count of 'ìï' in training document 4170.
# Index the sparse matrix directly through the fitted term -> column mapping: this avoids
# get_feature_names() (removed in scikit-learn 1.2) and avoids building a full dense
# DataFrame just to read one cell.
TF = X_train_dtm[4170, vect.vocabulary_['ìï']]
TF

In [None]:
# Total number of occurrences of 'ìï' across the whole training set.
# Summing the single sparse column avoids get_feature_names() (removed in scikit-learn 1.2)
# and the full dense DataFrame.
X_train_dtm[:, vect.vocabulary_['ìï']].sum()

In [None]:
# Show floats with 4 decimal places from here on.
pd.options.display.float_format = "{:,.4f}".format

# word's IDF weight (one value per vocabulary term, shown as a single-row table)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
pd.DataFrame([tfidf_transformer.idf_], columns=vect.get_feature_names_out())

In [None]:
# CountVectorizer keeps a term -> column-index mapping in `vocabulary_`, so look the index
# up directly. The previous loop rebuilt the whole feature-name list twice per iteration
# and relied on get_feature_names(), which was removed in scikit-learn 1.2.
term_index = vect.vocabulary_.get('ìï')
if term_index is not None:
    print(term_index)
    # 'ìï' => index 8147

In [None]:
# examine the vocabulary and document-term matrix together
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
pd.DataFrame(tfidf_transformer.transform(X_train_dtm).toarray(), columns=vect.get_feature_names_out()).iloc[4170]



### TF-IDF (L2 norm) ###
# ìï             0.4197

### TF ###
# "ìï" appears only once in document 4170.

### IDF ###
# import math
# TF = 1
# DF = 36
# total_Doc = 4179
# math.log((4179 + 1)/(36 + 1)) + 1 = 5.727148612874577

### Testing data
* CountVectorizer
    * transform testing data (using fitted vocabulary) into a document-term matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# --- start --- Use Method2 --- #
from sklearn.feature_extraction.text import TfidfTransformer

vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer().fit(X_train_dtm)
tfidf_transformer.transform(X_train_dtm)
# --- end --- Use Method2 --- #


# Encode the test messages with the vocabulary learned from the training set.
X_test_dtm = vect.transform(X_test)
print(X_test_dtm.toarray())

## 3. Building and Evaluating a model (依據特徵資料訓練分類器)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 20 trees and a fixed random_state.
rf = RandomForestClassifier(n_estimators=20, random_state=0)

# train the model using X_train_dtm (timing it with an IPython "magic command")
# The fitted result is also bound to `clf`, which the AUC cell uses for predict_proba.
%time clf = rf.fit(X_train_dtm, y_train)


In [None]:

import matplotlib.pyplot as plt

# Extract a single tree from the fitted forest
TREE_INDEX = 5
estimator = rf.estimators_[TREE_INDEX]

# Report the node count of that same tree (it was previously read from tree index 4,
# so the printed estimator and node count described two different trees).
n_nodes = estimator.tree_.node_count
print(estimator, n_nodes)

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# rf = RandomForestClassifier(n_estimators=20, oob_score=True, random_state=0)

# # train the model using X_train_dtm (timing it with an IPython "magic command")
# %time clf = rf.fit(X_train_dtm, y_train)

# ### OOB ###
# print(rf.oob_score_)

In [None]:
# rf.feature_importances_.shape
# rf.feature_importances_


# One importance value per vocabulary term: index them by term and keep the 20 largest.
# (The previous one-row DataFrame with nlargest(20, 'ìï') simply returned that single row
# and ranked nothing; get_feature_names() was also removed in scikit-learn 1.2.)
feature_importances = pd.Series(rf.feature_importances_, index=vect.get_feature_names_out())
feature_importances.nlargest(20)

## Evaluate (Testing)
* accuracy_score
* precision_score
* recall_score
* f1_score

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict a class for every message in the test document-term matrix.
y_predTest_class = rf.predict(X_test_dtm)

# Print each test-set metric with its own message template.
print("【 Testing 】")
metric_reports = (
    ('Accuracy score: {}', accuracy_score),
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
)
for template, metric_fn in metric_reports:
    print(template.format(metric_fn(y_test, y_predTest_class)))

In [None]:
### Supplement: calculate AUC (a performance metric for machine-learning classifiers) ###
from sklearn.metrics import roc_auc_score

# Score with column 1 of predict_proba, i.e. the predicted probability of label 1 (spam).
roc_auc_score(y_test, clf.predict_proba(X_test_dtm)[:,1])


# AUC = 1 means the classifier is perfect, but that is only the ideal case.
# AUC > 0.5 means the classifier does better than random guessing, so the model has predictive value.
# https://ithelp.ithome.com.tw/articles/10229049
# https://medium.com/marketingdatascience/%E5%88%86%E9%A1%9E%E5%99%A8%E8%A9%95%E4%BC%B0%E6%96%B9%E6%B3%95-roc%E6%9B%B2%E7%B7%9A-auc-accuracy-pr%E6%9B%B2%E7%B7%9A-d3a39977022c

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 report, a blank line, then the raw confusion matrix
# for the test-set predictions.
print(classification_report(y_test, y_predTest_class))
print()
print(confusion_matrix(y_test, y_predTest_class))

In [None]:
# Draw the confusion matrix as a labelled heatmap
cm = pd.DataFrame(confusion_matrix(y_test, y_predTest_class), index=['Ham','Spam'] , columns=['Ham','Spam'])
plt.figure(figsize=(3,3))
sns.heatmap(cm, cmap="Blues", linecolor='black', linewidth=1, annot=True, fmt='', xticklabels=['Ham','Spam'], yticklabels=['Ham','Spam'])

# x axis: predicted class
# y axis: actual class

# Spam (1) is the positive class, so the two error types are:

### Type II error = false negative (rated "serious" in this notebook) ###
### Predict: Ham (0) & Actual: Spam (1)
print(len(X_test[y_predTest_class < y_test]), '個\t=> false negatives (spam incorrectly classified)\n') # X_test[y_predTest_class < y_test]

### Type I error = false positive (rated "minor" in this notebook) ###
### Predict: Spam (1) & Actual: Ham (0)
print(len(X_test[y_predTest_class > y_test]), '個\t=> false positives (ham incorrectly classified)\n') # X_test[y_predTest_class > y_test]
print(X_test[(y_predTest_class==1) & (y_test==0)]) # same result
print()
print(X_test.shape)

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score

# Predict a class for every message in the test document-term matrix.
y_predTest_class = rf.predict(X_test_dtm)

# Same report twice: first with label 0 as the positive class, then with label 1.
report_template = 'Precision : {} / Recall : {} / fscore : {} / Accuracy: {}'
for positive_label in (0, 1):
    precision, recall, fscore, support = score(
        y_test, y_predTest_class, pos_label=positive_label, average='binary')
    accuracy = round((y_predTest_class==y_test).sum()/len(y_test), 3)
    print(report_template.format(round(precision,3), round(recall,3), round(fscore,3), accuracy))

## Input SMS and Predict

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

# vect = CountVectorizer()
# X_train_dtm = vect.fit_transform(X_train)

# tfidf_transformer = TfidfTransformer()
# tfidf_transformer.fit(X_train_dtm)
# tfidf_transformer.transform(X_train_dtm)
    
# rf = RandomForestClassifier() #n_estimators=20
# rf.fit(X_train_dtm, y_train)

SMS = 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005.'
clean_text = text_process(SMS)
# print(clean_text)

# Transforming a one-element list already yields a 1 x n_features matrix,
# so no reshape is needed before predicting.
simple_test_dtm = vect.transform([clean_text])
print(simple_test_dtm.toarray(), simple_test_dtm.shape)

y_predSimpleTest_class = rf.predict(simple_test_dtm)
# predict() returns a length-1 array: read element 0 explicitly, because calling int()
# on an array with ndim > 0 is deprecated since NumPy 1.25.
if y_predSimpleTest_class[0] == 1:
    print('SPAM: {}'.format(SMS))
else:
    print('ham: {}'.format(SMS))

In [None]:
### Pipeline (補充) ###
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Re-split the data, this time stratified on the label.
X = data_rawSMS.clean_msg
y = data_rawSMS.label_num
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Chain counting, TF-IDF weighting and the classifier so that raw text goes in directly.
pipe = Pipeline([('bow', CountVectorizer()),
                 ('tfid', TfidfTransformer()),
                 ('model', RandomForestClassifier(n_estimators=20, bootstrap=True, oob_score=False, random_state=1))])
pipe.fit(X_train, y_train)

# Evaluate the pipeline on the new test split only.
# The earlier `rf` / `X_test_dtm` pair belongs to the previous split, so the stale
# rf.predict(X_test_dtm) call (whose result was never used here) has been dropped, as has
# the pipe.score(...) call whose return value was discarded.
y_pred = pipe.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

# calculate the test-set metrics of the pipeline predictions
print("【 Testing 】")
print('Accuracy score: {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('Precision score: {}'.format(metrics.precision_score(y_test, y_pred)))
print('Recall score: {}'.format(metrics.recall_score(y_test, y_pred)))
print('F1 score: {}'.format(metrics.f1_score(y_test, y_pred)))

print(classification_report(y_test, y_pred))