In [25]:
import os
import jieba
import string

### 1. Load Text Data

In [45]:
# Read each corpus file into a list with one entry per line; the trailing
# newline of every line is kept.
with open('ham_5000.utf8', encoding='utf-8') as f:
    ham_docs = list(f)
with open('spam_5001.utf8', encoding='utf-8') as f:
    spam_docs = list(f)

### 2. Clean Text Data
- We can filter out punctuation from tokens.
- We can remove tokens that are just punctuation or contain numbers by using an isalpha() check on each token.
- We can remove stop words.

In [58]:
# Load the stop-word list: the whole file is read and split on '\n', giving
# one list entry per line (plus an empty string if the file ends in a newline).
with open('stopwords.txt', encoding='utf-8') as f:
    stopword_text = f.read()
stopwords = stopword_text.split('\n')
    
def cleanText(docs):
    """Tokenize each document with jieba and drop unwanted tokens.

    A token is kept only when ``str.isalpha()`` is true for it (this rejects
    punctuation, whitespace and anything containing digits) and it does not
    appear in the module-level ``stopwords`` collection.

    Parameters
    ----------
    docs : iterable of str
        Raw documents; each item is passed to ``jieba.cut`` as-is.

    Returns
    -------
    list of str
        One string per input document, in input order, made of the surviving
        tokens joined by single spaces. A document with no surviving tokens
        yields an empty string.
    """
    # `stopwords` is loaded as a list, so testing membership against it
    # directly costs a linear scan per token. Build a set once per call so
    # every lookup is constant-time; the filtering result is unchanged.
    stop_set = set(stopwords)

    cleanedText = []
    for doc in docs:
        kept_tokens = [token for token in jieba.cut(doc)
                       if token.isalpha() and token not in stop_set]
        cleanedText.append(' '.join(kept_tokens))
    return cleanedText

# Run the same cleaning pass over both corpora (ham first, then spam).
clean_ham, clean_spam = cleanText(ham_docs), cleanText(spam_docs)

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

def transformTextToSparseMat(text):
    """Build a document-term count table from an iterable of strings.

    A fresh ``CountVectorizer(binary=False)`` is fitted on ``text`` and the
    resulting counts are returned as a ``pandas.DataFrame`` with one row per
    input string and one column per vocabulary term, labelled by the term.

    Note that, despite the function name, the matrix is densified with
    ``toarray()`` before being wrapped, so the returned frame is dense.
    """
    counter = CountVectorizer(binary=False)
    counts = counter.fit_transform(text)

    # vocabulary_ maps each term to its column index in `counts`; ordering the
    # terms by that index lines the labels up with the matrix columns.
    term_to_column = counter.vocabulary_
    ordered_terms = sorted(term_to_column, key=term_to_column.get)

    table = pd.DataFrame(counts.toarray())
    table.columns = ordered_terms
    return table

# Stack the two cleaned corpora: all ham rows first, then all spam rows.
data = []
data.extend(clean_ham)
data.extend(clean_spam)

full_features = transformTextToSparseMat(data)

# Total occurrences of every token across the whole corpus. DataFrame.sum is
# the vectorised form of the column-wise total; passing the Python builtin
# `sum` to DataFrame.apply is deprecated in newer pandas.
features = pd.DataFrame(full_features.sum(axis=0))
# Keep only tokens whose total count is strictly greater than this threshold.
MIN_TOKEN_COUNT = 5
useful_features = features.loc[features[0] > MIN_TOKEN_COUNT]
useful_features = useful_features.index.to_list()

X = full_features[useful_features]

# Labels follow the row order of `data`: 1.0 for each ham document, 0.0 for
# each spam document. The counts come from the corpora themselves rather than
# hard-coded sizes, so the labels stay aligned with X even if an input file
# does not contain exactly the number of lines its name suggests.
y = []
y.extend(np.ones(len(clean_ham)))
y.extend(np.zeros(len(clean_spam)))
assert len(y) == len(X), "number of labels must equal number of feature rows"

### 3. Save Prepared Data

In [68]:
# Write the filtered count table to disk.
# NOTE(review): only X is saved here; the label list `y` is not written by
# this cell — confirm that downstream code rebuilds the labels itself.
X.to_csv(path_or_buf='prepared.csv')