In [1]:
import os
import jieba
import string

### 1. Load Text Data

In [2]:
# Read both corpora. Every line of a file becomes one list element and keeps
# its trailing newline; later cells treat each element as one document.
with open('ham_5000.utf8', encoding='utf-8') as ham_file:
    ham_docs = list(ham_file)
with open('spam_5001.utf8', encoding='utf-8') as spam_file:
    spam_docs = list(spam_file)

### 2. Clean Text Data
- We can filter out punctuation from tokens.
- We can remove tokens that are just punctuation or contain numbers by using an isalpha() check on each token.
- We can remove stop words.

In [3]:
# Load the stop-word list: the whole file is read and split on newlines,
# giving one list entry per line of the file.
with open('stopwords.txt', encoding='utf-8') as stopword_file:
    stopwords = stopword_file.read().split('\n')
    
def cleanText(docs):
    """Tokenize and filter every document in ``docs``.

    Each document is segmented with ``jieba.cut``. A token is kept only when
    ``str.isalpha()`` is true for it (which drops punctuation, whitespace and
    anything containing digits) and it is not in the module-level
    ``stopwords`` collection.

    Parameters
    ----------
    docs : iterable of str
        Raw documents, one string per document.

    Returns
    -------
    list of str
        One string per input document, made of the surviving tokens joined
        by single spaces (an empty string when no token survives).
    """
    # Build the lookup set once per call: ``stopwords`` is a list, so testing
    # every token against it directly would be a linear scan per token.
    stopword_set = set(stopwords)

    cleanedText = []
    for doc in docs:
        cleanedWords = [word for word in jieba.cut(doc)
                        if word.isalpha() and word not in stopword_set]
        cleanedText.append(' '.join(cleanedWords))
    return cleanedText

# Clean the ham corpus first, then the spam corpus.
clean_ham, clean_spam = (cleanText(corpus) for corpus in (ham_docs, spam_docs))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Russe\AppData\Local\Temp\jieba.cache
Loading model cost 0.766 seconds.
Prefix dict has been built successfully.


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

def transformTextToSparseMat(text):
    """Turn documents into a token-count DataFrame.

    ``text`` is fitted and transformed with ``CountVectorizer(binary=False)``.
    Despite the function's name, the matrix is converted with ``toarray()``,
    so the returned DataFrame is dense. Its columns are named after the
    vocabulary tokens, in the column order of the count matrix.
    """
    vectorizer = CountVectorizer(binary=False)
    counts = vectorizer.fit_transform(text)
    # vocabulary_ maps token -> column index; ordering the tokens by that
    # index yields the column names in matrix order.
    token_to_column = vectorizer.vocabulary_
    ordered_tokens = sorted(token_to_column, key=token_to_column.get)
    return pd.DataFrame(counts.toarray(), columns=ordered_tokens)

# Stack the two corpora: ham documents first, then spam. The labels built at
# the end of this cell rely on exactly this row order.
data = [*clean_ham, *clean_spam]

full_features = transformTextToSparseMat(data)

# Total number of occurrences of every token across the whole corpus.
# The vectorised column sum replaces apply(sum, axis=0), which ran the Python
# built-in sum over each column one element at a time.
features = pd.DataFrame(full_features.sum(axis=0))
# keep tokens that occur more than 5 times in total
useful_features = features.loc[features[0] > 5].index.to_list()

X = full_features[useful_features]

# Labels follow the row order of `data`: 1 for ham, 0 for spam. The counts are
# taken from the cleaned corpora instead of being hard-coded, so the labels
# cannot get out of step with the number of documents actually loaded.
y = [*np.ones(len(clean_ham)), *np.zeros(len(clean_spam))]

In [5]:
# Wrap the labels in a single-column DataFrame and store them as int8.
y = pd.DataFrame(y).astype('int8')

### 3. Save Prepared Data

In [68]:
# Persist the selected features. The row index is not written: a plain
# pd.read_csv('prepared.csv') would otherwise load it back as an extra
# "Unnamed: 0" column, i.e. as a spurious feature.
X.to_csv('prepared.csv', index=False)

In [None]:
# Reload the feature matrix from prepared.csv, replacing the in-memory X.
# NOTE(review): read_csv does not use any column as the index by default, so
# if the CSV was written together with its index, that column comes back as an
# extra "Unnamed: 0" feature — confirm how the file was written.
X = pd.read_csv('prepared.csv')

### 4. Split into Train and Test Sets

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# (and therefore every downstream metric) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
# Show the shapes of the feature splits on one line, then the label splits.
print(*(part.shape for part in (X_train, X_test)))
print(*(part.shape for part in (y_train, y_test)))

(8000, 6647) (2001, 6647)
(8000, 1) (2001, 1)


### 5. Fit Neural Network

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Number of feature columns in the training set; used as the network's input width.
n_voc = X_train.shape[1]

# Build and compile the classifier network.
def define_model(n_voc):
    """Return a compiled feed-forward binary classifier.

    The network takes ``n_voc`` inputs, has one hidden ReLU layer of 100
    units and a single sigmoid output unit. It is compiled with binary
    cross-entropy loss, the Adam optimizer and an accuracy metric. The model
    summary is printed before the model is returned.
    """
    layers = [
        Dense(100, input_shape=(n_voc,), activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Print the layer/parameter table so the architecture shows in the output.
    model.summary()
    return model

model = define_model(n_voc)
# Fit the network. The History object returned by fit() is kept so the
# per-epoch loss/accuracy stay available after training, and so the cell does
# not end by echoing a bare History repr.
history = model.fit(X_train, y_train, epochs=10, verbose=2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               664800    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 664,901
Trainable params: 664,901
Non-trainable params: 0
_________________________________________________________________
Train on 8000 samples
Epoch 1/10
8000/8000 - 1s - loss: 0.0991 - accuracy: 0.9824
Epoch 2/10
8000/8000 - 0s - loss: 0.0144 - accuracy: 0.9965
Epoch 3/10
8000/8000 - 0s - loss: 0.0075 - accuracy: 0.9984
Epoch 4/10
8000/8000 - 0s - loss: 0.0053 - accuracy: 0.9987
Epoch 5/10
8000/8000 - 0s - loss: 0.0042 - accuracy: 0.9989
Epoch 6/10
8000/8000 - 0s - loss: 0.0036 - accuracy: 0.9991
Epoch 7/10
8000/8000 - 0s - loss: 0.0032 - accuracy: 0.9992
Epoch 8/10
8000/8000 - 0s - loss: 0.0029 - accuracy: 0.9992
Epoch 9/10

<tensorflow.python.keras.callbacks.History at 0x22a3ba1c688>