In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF, non_negative_factorization
from sklearn.feature_extraction.text import TfidfVectorizer


# ---------- classification using linear least squares ----------------
def LSTSQDistance(spam, ham, test):
    """Classify documents by least-squares distance to two column spaces.

    Every column of ``test`` is fitted, in the least-squares sense, by the
    columns of ``spam`` and by the columns of ``ham``.  The Euclidean norm of
    each residual is the distance from that document to the corresponding
    subspace, and the document is counted as spam only when it is strictly
    closer to the spam subspace.

    Parameters
    ----------
    spam : ndarray, shape (n_rows, k_spam)
        Matrix whose columns span the spam subspace.
    ham : ndarray, shape (n_rows, k_ham)
        Matrix whose columns span the ham subspace.
    test : ndarray, shape (n_rows, n_docs) or (n_rows,)
        Documents to classify, one per column.  A 1-D vector is treated as a
        single document (previously this raised on the shape unpacking).

    Returns
    -------
    pos : int
        Number of documents classified as spam (distance to spam < distance to ham).
    neg : int
        Number of documents classified as not spam (ties are counted here).
    """
    test = np.asarray(test)
    if test.ndim == 1:
        # Promote a single document to a one-column matrix.
        test = test[:, np.newaxis]

    def _residual_norms(basis):
        # Column-wise distance from each test document to span(basis).
        coef = np.linalg.lstsq(basis, test, rcond = None)[0]
        return np.linalg.norm(np.dot(basis, coef) - test, axis=0)

    d_spam = _residual_norms(spam)
    d_ham = _residual_norms(ham)

    # Same decision rule as the original loop: "d_spam >= d_ham" means not spam.
    neg = int(np.count_nonzero(d_spam >= d_ham))
    pos = test.shape[1] - neg

    return pos, neg

In [3]:
# 3. use different folds: every fold is the test set exactly once --------------------
emails = pd.read_csv("emails.csv", encoding= 'unicode_escape')
df = pd.DataFrame(emails)
spam = df['spam']
flod = df['fold']  # fold id of every email (the name is spelled "flod" throughout this notebook)

# initialize: TF-IDF document-term matrix, one row per email, at most 20000 terms
cv = TfidfVectorizer(stop_words='english', max_features=20000, token_pattern=r"(?u)\b[a-zA-Z]\w+\b")
dt_matrix = cv.fit_transform(df['text'])
[m, n] = dt_matrix.shape

# For each email, we have two tags: spam and fold.
# spam = 1 means the email is spam; spam = 0 means the email is not spam (ham).
# Each email also belongs to a fold; per the original notes there are 5 folds, numbered 0 to 4.
# The loop below holds out fold t as test data and trains on the remaining folds.

# Boolean label vectors, built once so that row selection is vectorised instead of
# scanning the Series element by element four times per fold.
is_spam = spam.to_numpy() == 1
is_ham = spam.to_numpy() == 0
fold_id = flod.to_numpy()

for t in range(0,5):

    print("take fold", t, "as testing data")

    in_test = fold_id == t

    # Transposed so that every column is one email and every row is one term.
    spam_train = dt_matrix[np.flatnonzero(~in_test & is_spam), :].toarray().transpose()
    spam_test  = dt_matrix[np.flatnonzero(in_test & is_spam), :].toarray().transpose()
    ham_train = dt_matrix[np.flatnonzero(~in_test & is_ham), :].toarray().transpose()
    ham_test  = dt_matrix[np.flatnonzero(in_test & is_ham), :].toarray().transpose()

    # Compute the confusion matrix
    tp, fn = LSTSQDistance(spam_train, ham_train, spam_test)
    fp, tn = LSTSQDistance(spam_train, ham_train, ham_test)

    N = spam_test.shape[1] + ham_test.shape[1]
    #print(N, " " , tp+tn)
    print("--------------------------------------------------")
    print("\t\t\tIs a spam\tIs a ham")
    print("Predicted as a spam\t", tp, "(TP)\t", fp, "(FP)")
    print("\nPredicted as a ham\t", fn, "(FN)\t", tn, "(TN)")
    print("--------------------------------------------------")
    # Each ratio is reported as nan instead of raising ZeroDivisionError when its
    # denominator is zero (e.g. nothing was predicted as spam in this fold).
    print("Accuracy = " , round((tp+tn) / N, 3) if N else float("nan"))
    print("Precision = " , round((tp)/(tp + fp), 3) if (tp + fp) else float("nan"))
    print("Recall = " , round((tp)/(tp + fn), 3) if (tp + fn) else float("nan"))
    print()





take fold 0 as testing data
--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 202 (TP)	 0 (FP)

Predicted as a ham	 67 (FN)	 870 (TN)
--------------------------------------------------
Accuracy =  0.941
Precision =  1.0
Recall =  0.751

take fold 1 as testing data
--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 220 (TP)	 0 (FP)

Predicted as a ham	 68 (FN)	 877 (TN)
--------------------------------------------------
Accuracy =  0.942
Precision =  1.0
Recall =  0.764

take fold 2 as testing data
--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 199 (TP)	 1 (FP)

Predicted as a ham	 70 (FN)	 877 (TN)
--------------------------------------------------
Accuracy =  0.938
Precision =  0.995
Recall =  0.74

take fold 3 as testing data
--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 194 (TP)	 0 (FP)

Predicted as a h

In [2]:
# 4. SVD decomposition --------------------
emails = pd.read_csv("emails.csv", encoding= 'unicode_escape')
df = pd.DataFrame(emails)
spam = df['spam']
flod = df['fold']  # fold id of every email (the name is spelled "flod" throughout this notebook)

# initialize: TF-IDF document-term matrix, one row per email, at most 20000 terms
cv = TfidfVectorizer(stop_words='english', max_features=20000, token_pattern=r"(?u)\b[a-zA-Z]\w+\b")
dt_matrix = cv.fit_transform(df['text'])
[m, n] = dt_matrix.shape

# For each email, we have two tags: spam and fold.
# spam = 1 means the email is spam; spam = 0 means the email is not spam (ham).
# Each email also belongs to a fold; per the original notes there are 5 folds, numbered 0 to 4.
# Fold t is the test data, all other folds are the training data.

t = 4 # 0, 1, 2, 3, 4

print("take fold", t, "as testing data and doing SVD decomposition")

# Vectorised row selection (replaces four element-by-element scans of the Series).
is_spam = spam.to_numpy() == 1
is_ham = spam.to_numpy() == 0
in_test = flod.to_numpy() == t

# Transposed so that every column is one email and every row is one term.
spam_train = dt_matrix[np.flatnonzero(~in_test & is_spam), :].toarray().transpose()
spam_test  = dt_matrix[np.flatnonzero(in_test & is_spam), :].toarray().transpose()
ham_train = dt_matrix[np.flatnonzero(~in_test & is_ham), :].toarray().transpose()
ham_test  = dt_matrix[np.flatnonzero(in_test & is_ham), :].toarray().transpose()

# Thin SVD: only the first min(rows, cols) left singular vectors are computed.
# Only the leading `rank` columns of U are used below, so the square
# rows x rows U of the default full SVD is never needed.
spam_U, spam_sigma, spam_vt = np.linalg.svd(spam_train, full_matrices=False)
ham_U, ham_sigma, ham_vt = np.linalg.svd(ham_train, full_matrices=False)

# Orthonormal basis of the column space of each training matrix.
s_rank = np.linalg.matrix_rank(spam_train)
spam_new = spam_U[:, :s_rank]

h_rank = np.linalg.matrix_rank(ham_train)
ham_new = ham_U[:, :h_rank]


# Compute the confusion matrix
tp, fn = LSTSQDistance(spam_new, ham_new, spam_test)
fp, tn = LSTSQDistance(spam_new, ham_new, ham_test)

N = spam_test.shape[1] + ham_test.shape[1]
#print(N, " " , tp+tn)
print("--------------------------------------------------")
print("\t\t\tIs a spam\tIs a ham")
print("Predicted as a spam\t", tp, "(TP)\t", fp, "(FP)")
print("\nPredicted as a ham\t", fn, "(FN)\t", tn, "(TN)")
print("--------------------------------------------------")
# Each ratio is reported as nan instead of raising ZeroDivisionError when its
# denominator is zero.
print("Accuracy = " , round((tp+tn) / N, 3) if N else float("nan"))
print("Precision = " , round((tp)/(tp + fp), 3) if (tp + fp) else float("nan"))
print("Recall = " , round((tp)/(tp + fn), 3) if (tp + fn) else float("nan"))
print()





take fold 4 as testing data and doing SVD decomposition
--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 209 (TP)	 1 (FP)

Predicted as a ham	 73 (FN)	 861 (TN)
--------------------------------------------------
Accuracy =  0.935
Precision =  0.995
Recall =  0.741



In [None]:
# Load the labelled emails and build the TF-IDF document-term matrix.
emails = pd.read_csv("emails.csv", encoding= 'unicode_escape')
df = pd.DataFrame(emails)
spam = df['spam']
flod = df['fold']

# initialize
cv = TfidfVectorizer(stop_words='english', max_features=20000, token_pattern=r"(?u)\b[a-zA-Z]\w+\b")
dt_matrix = cv.fit_transform(df['text'])
[m, n] = dt_matrix.shape

# Fold 4 is the test set; every other fold is training data.
held_out = (flod == 4).to_numpy()
spam_rows = (spam == 1).to_numpy()
ham_rows = (spam == 0).to_numpy()

# Each matrix is transposed so that its columns are emails.
spam_train = dt_matrix[np.flatnonzero(~held_out & spam_rows), :].toarray().transpose()
spam_test  = dt_matrix[np.flatnonzero(held_out & spam_rows), :].toarray().transpose()
ham_train = dt_matrix[np.flatnonzero(~held_out & ham_rows), :].toarray().transpose()
ham_test  = dt_matrix[np.flatnonzero(held_out & ham_rows), :].toarray().transpose()

# Full SVD of both training matrices.
spam_U, spam_sigma, spam_vt = np.linalg.svd(spam_train)
ham_U, ham_sigma, ham_vt = np.linalg.svd(ham_train)

In [8]:
# Numerical rank of the ham training matrix and the matching leading
# left singular vectors; then show rank vs. number of columns.
h_rank = np.linalg.matrix_rank(ham_train)
ham_new = ham_U[:, :h_rank]
print(h_rank)
print(ham_train.shape[1])

3336
3496


In [7]:
# Numerical rank of the spam training matrix and the matching leading
# left singular vectors; then show rank vs. number of columns.
s_rank = np.linalg.matrix_rank(spam_train)
spam_new = spam_U[:, :s_rank]
print(s_rank)
print(spam_train.shape[1])

1032
1091


In [2]:
# 5. LSA Try different low rank approximation using SVD for S and H------------------------------------------------
emails = pd.read_csv("emails.csv", encoding= 'unicode_escape')
df = pd.DataFrame(emails)
spam = df['spam']
flod = df['fold']  # fold id of every email (the name is spelled "flod" throughout this notebook)

# initialize: TF-IDF document-term matrix, one row per email, at most 20000 terms
cv = TfidfVectorizer(stop_words='english', max_features=20000, token_pattern=r"(?u)\b[a-zA-Z]\w+\b")
dt_matrix = cv.fit_transform(df['text'])
[m, n] = dt_matrix.shape

# Fold 4 is the test set; vectorised row selection replaces four
# element-by-element scans of the label Series.
is_spam = spam.to_numpy() == 1
is_ham = spam.to_numpy() == 0
in_test = flod.to_numpy() == 4

# Transposed so that every column is one email and every row is one term.
spam_train = dt_matrix[np.flatnonzero(~in_test & is_spam), :].toarray().transpose()
spam_test  = dt_matrix[np.flatnonzero(in_test & is_spam), :].toarray().transpose()
ham_train = dt_matrix[np.flatnonzero(~in_test & is_ham), :].toarray().transpose()
ham_test  = dt_matrix[np.flatnonzero(in_test & is_ham), :].toarray().transpose()

# Truncation ranks for the two subspaces (trailing comments list other values from the original cell).
h_rank = 600 #200, 200, 100
s_rank = 200 #600, 200, 600

# Thin SVD: only the leading columns of U are sliced below, so the square
# rows x rows U produced by the default full SVD is never needed.
spam_U, spam_sigma, spam_vt = np.linalg.svd(spam_train, full_matrices=False)
ham_U, ham_sigma, ham_vt = np.linalg.svd(ham_train, full_matrices=False)
ham_new = ham_U[:, :h_rank]
spam_new = spam_U[:, :s_rank]

tp, fn = LSTSQDistance(spam_new, ham_new, spam_test)
fp, tn = LSTSQDistance(spam_new, ham_new, ham_test)
print("tp=",tp, "fn=",fn)
print("fp=",fp, "tn=",tn)
print()
N = spam_test.shape[1] + ham_test.shape[1]
#print(N, " " , tp+tn)
print("--------------------------------------------------")
print("\t\t\tIs a spam\tIs a ham")
print("Predicted as a spam\t", tp, "(TP)\t", fp, "(FP)")
print("\nPredicted as a ham\t", fn, "(FN)\t", tn, "(TN)")
print("--------------------------------------------------")
# Each ratio is reported as nan instead of raising ZeroDivisionError when its
# denominator is zero.
print("Accuracy = " , round((tp+tn) / N, 3) if N else float("nan"))
print("Precision = " , round((tp)/(tp + fp), 3) if (tp + fp) else float("nan"))
print("Recall = " , round((tp)/(tp + fn), 3) if (tp + fn) else float("nan"))
print()

tp= 255 fn= 27
fp= 1 tn= 861

--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 255 (TP)	 1 (FP)

Predicted as a ham	 27 (FN)	 861 (TN)
--------------------------------------------------
Accuracy =  0.976
Precision =  0.996
Recall =  0.904



In [6]:
# Repeat the low-rank classification with another pair of truncation ranks,
# reusing the SVD factors (spam_U, ham_U) and test matrices already in memory.
h_rank = 100 #200, 200, 100
s_rank = 600 #600, 200, 600
ham_new = ham_U[:, :h_rank]
spam_new = spam_U[:, :s_rank]

# Confusion-matrix counts: spam test documents first, then ham test documents.
spam_counts = LSTSQDistance(spam_new, ham_new, spam_test)
tp, fn = spam_counts
ham_counts = LSTSQDistance(spam_new, ham_new, ham_test)
fp, tn = ham_counts
print("tp=",tp, "fn=",fn)
print("fp=",fp, "tn=",tn)
print()

# Total number of test documents (columns of the two test matrices).
N = spam_test.shape[1] + ham_test.shape[1]
#print(N, " " , tp+tn)
rule = "--------------------------------------------------"
print(rule)
print("\t\t\tIs a spam\tIs a ham")
print("Predicted as a spam\t", tp, "(TP)\t", fp, "(FP)")
print("\nPredicted as a ham\t", fn, "(FN)\t\t", tn, "(TN)")
print(rule)
accuracy = round((tp+tn) / N, 3)
print("Accuracy = " , accuracy)
precision = round((tp)/(tp + fp), 3)
print("Precision = " , precision)
recall = round((tp)/(tp + fn), 3)
print("Recall = " , recall)
print()

tp= 281 fn= 1
fp= 104 tn= 758

--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 281 (TP)	 104 (FP)

Predicted as a ham	 1 (FN)		 758 (TN)
--------------------------------------------------
Accuracy =  0.908
Precision =  0.73
Recall =  0.996



In [5]:
# 6. use NMF decomposition: every fold is the test set exactly once --------------------
emails = pd.read_csv("emails.csv", encoding= 'unicode_escape')
df = pd.DataFrame(emails)
spam = df['spam']
flod = df['fold']  # fold id of every email (the name is spelled "flod" throughout this notebook)

# initialize: TF-IDF document-term matrix, one row per email, at most 20000 terms
cv = TfidfVectorizer(stop_words='english', max_features=20000, token_pattern=r"(?u)\b[a-zA-Z]\w+\b")
dt_matrix = cv.fit_transform(df['text'])
[m, n] = dt_matrix.shape

# For each email, we have two tags: spam and fold.
# spam = 1 means the email is spam; spam = 0 means the email is not spam (ham).
# Each email also belongs to a fold; per the original notes there are 5 folds, numbered 0 to 4.
# The loop below holds out fold t as test data and trains on the remaining folds.

# Boolean label vectors, built once so that row selection is vectorised instead of
# scanning the Series element by element four times per fold.
is_spam = spam.to_numpy() == 1
is_ham = spam.to_numpy() == 0
fold_id = flod.to_numpy()

for t in range(0,5):

    print("take fold", t, "as testing data and doing NMF decomposition")

    in_test = fold_id == t

    # Transposed so that every column is one email and every row is one term.
    spam_train = dt_matrix[np.flatnonzero(~in_test & is_spam), :].toarray().transpose()
    spam_test  = dt_matrix[np.flatnonzero(in_test & is_spam), :].toarray().transpose()
    ham_train = dt_matrix[np.flatnonzero(~in_test & is_ham), :].toarray().transpose()
    ham_test  = dt_matrix[np.flatnonzero(in_test & is_ham), :].toarray().transpose()

    # Rank-100 non-negative factorisation X ~ W @ H of each training matrix;
    # the columns of W are used as the basis of the class subspace.
    s_W, s_H, s_n_iter = non_negative_factorization(spam_train, n_components=100, init='random', random_state=0)
    h_W, h_H, h_n_iter = non_negative_factorization(ham_train, n_components=100, init='random', random_state=0)


    # Compute the confusion matrix
    tp, fn = LSTSQDistance(s_W, h_W, spam_test)
    fp, tn = LSTSQDistance(s_W, h_W, ham_test)

    N = spam_test.shape[1] + ham_test.shape[1]
    #print(N, " " , tp+tn)
    print("--------------------------------------------------")
    print("\t\t\tIs a spam\tIs a ham")
    print("Predicted as a spam\t", tp, "(TP)\t", fp, "(FP)")
    print("\nPredicted as a ham\t", fn, "(FN)\t\t", tn, "(TN)")
    print("--------------------------------------------------")
    # Each ratio is reported as nan instead of raising ZeroDivisionError when its
    # denominator is zero (e.g. nothing was predicted as spam in this fold).
    print("Accuracy = " , round((tp+tn) / N, 3) if N else float("nan"))
    print("Precision = " , round((tp)/(tp + fp), 3) if (tp + fp) else float("nan"))
    print("Recall = " , round((tp)/(tp + fn), 3) if (tp + fn) else float("nan"))
    print()


take fold 0 as testing data and doing NMF decomposition
--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 265 (TP)	 11 (FP)

Predicted as a ham	 4 (FN)		 859 (TN)
--------------------------------------------------
Accuracy =  0.987
Precision =  0.96
Recall =  0.985

take fold 1 as testing data and doing NMF decomposition
--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 278 (TP)	 26 (FP)

Predicted as a ham	 10 (FN)		 851 (TN)
--------------------------------------------------
Accuracy =  0.969
Precision =  0.914
Recall =  0.965

take fold 2 as testing data and doing NMF decomposition
--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 261 (TP)	 17 (FP)

Predicted as a ham	 8 (FN)		 861 (TN)
--------------------------------------------------
Accuracy =  0.978
Precision =  0.939
Recall =  0.97

take fold 3 as testing data and doing NMF decomposition




--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 263 (TP)	 37 (FP)

Predicted as a ham	 2 (FN)		 834 (TN)
--------------------------------------------------
Accuracy =  0.966
Precision =  0.877
Recall =  0.992

take fold 4 as testing data and doing NMF decomposition




--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 275 (TP)	 19 (FP)

Predicted as a ham	 7 (FN)		 843 (TN)
--------------------------------------------------
Accuracy =  0.977
Precision =  0.935
Recall =  0.975



In [6]:
# Training matrices are stored as rows = terms, columns = emails.
print(spam_train.shape[1]) # 1091 documents, 20000 terms
print(spam_train.shape[0])

print(ham_train.shape[1]) # 3496 documents, 20000 terms
print(ham_train.shape[0])

1091
20000
3496
20000


In [3]:
# Full SVD of the spam training matrix. np.linalg.svd defaults to
# full_matrices=True, so spam_U is square (rows of spam_train x rows of spam_train)
# and spam_sigma holds the singular values in descending order.
spam_U, spam_sigma, spam_vt = np.linalg.svd(spam_train)

In [4]:
# Show the left singular vectors of the spam matrix and their dimensions.
print(spam_U)
print(spam_U.shape[0])
print(spam_U.shape[1])
print(spam_U.shape)

[[-2.60208116e-04  2.75916139e-04 -2.48116643e-04 ...  4.23653853e-05
   0.00000000e+00  3.02419073e-05]
 [ 2.49502025e-17 -1.24615487e-17  8.63634793e-18 ... -1.55085105e-04
   0.00000000e+00 -2.62581413e-03]
 [ 1.01643954e-17 -8.14845695e-18  4.73999637e-18 ...  5.22148727e-04
   0.00000000e+00 -2.27343755e-04]
 ...
 [-1.92112741e-03  2.49278188e-03 -2.67662716e-03 ...  8.65656584e-01
   0.00000000e+00 -1.18697607e-03]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   1.00000000e+00  0.00000000e+00]
 [-2.59008585e-03  1.39807412e-03 -1.28690724e-03 ... -1.27844699e-03
   0.00000000e+00  6.81510685e-01]]
20000
20000
(20000, 20000)


In [10]:
# Inspect the spam singular values and place them on the diagonal of a square matrix.
print(spam_sigma)
print(len(spam_sigma))
# Zero-pad up to the number of rows of spam_U (the term dimension) instead of a
# hard-coded 20000, so the cell keeps working if the vocabulary size changes.
k = np.pad(spam_sigma, (0, spam_U.shape[0] - len(spam_sigma)), mode = 'constant')

print(len(k))

# Square matrix with the padded singular values on its diagonal.
spam_new = np.diag(k)
print(spam_new[:,0:2])
# Last (smallest) singular value. The original indexed position 1086, which is
# only the last entry when there are exactly 1087 singular values.
print(spam_sigma[-1])


[6.09537426e+00 5.21277509e+00 4.82273242e+00 ... 2.50009141e-16
 2.44853169e-16 1.91687379e-16]
1087
20000
[[6.09537426 0.        ]
 [0.         5.21277509]
 [0.         0.        ]
 ...
 [0.         0.        ]
 [0.         0.        ]
 [0.         0.        ]]
1.916873790483511e-16


In [11]:
# Full SVD of the ham training matrix. np.linalg.svd defaults to
# full_matrices=True, so ham_U is square (rows of ham_train x rows of ham_train)
# and ham_sigma holds the singular values in descending order.
ham_U, ham_sigma, ham_vt = np.linalg.svd(ham_train)

In [12]:
# Show the left singular vectors of the ham matrix and their dimensions.
print(ham_U)
print(ham_U.shape[0])
print(ham_U.shape[1])
print(ham_U.shape)

[[ 6.88990011e-03  7.33286204e-04 -5.34759903e-03 ...  0.00000000e+00
  -1.19270964e-04  0.00000000e+00]
 [ 6.24185292e-04 -3.24674226e-04 -1.52483019e-04 ...  0.00000000e+00
  -4.06435105e-05  0.00000000e+00]
 [ 3.66213508e-04  1.09030773e-03  6.63871666e-04 ...  0.00000000e+00
   2.59193786e-04  0.00000000e+00]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  1.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 5.37650772e-05  1.23554667e-04 -2.40157510e-05 ...  0.00000000e+00
   9.84906727e-01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  1.00000000e+00]]
20000
20000
(20000, 20000)


In [14]:
# Inspect the ham singular values and place them on the diagonal of a square matrix.
print(ham_sigma)
print(len(ham_sigma))
# Zero-pad up to the number of rows of ham_U (the term dimension) instead of a
# hard-coded 20000, so the cell keeps working if the vocabulary size changes.
l = np.pad(ham_sigma, (0, ham_U.shape[0] - len(ham_sigma)), mode = 'constant')

print(len(l))

# Square matrix with the padded singular values on its diagonal.
ham_new = np.diag(l)
print(ham_new[:,0:4])


[1.20562173e+01 7.08860748e+00 5.31320601e+00 ... 7.21198736e-16
 3.47141150e-16 2.70496958e-16]
3496
20000
[[12.05621729  0.          0.          0.        ]
 [ 0.          7.08860748  0.          0.        ]
 [ 0.          0.          5.31320601  0.        ]
 ...
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]]


In [23]:
# Classify the test documents against whatever spam_new / ham_new currently hold
# (the preceding cells set them to the diagonal singular-value matrices).
spam_counts = LSTSQDistance(spam_new, ham_new, spam_test)
ham_counts = LSTSQDistance(spam_new, ham_new, ham_test)
tp, fn = spam_counts
fp, tn = ham_counts

print("tp=",tp, "fn=",fn)
print("fp=",fp, "tn=",tn)

tp= 208 fn= 73
fp= 1 tn= 861


In [32]:
# Load the labelled emails and build the TF-IDF document-term matrix.
# NOTE(review): unlike the other cells, no encoding='unicode_escape' is passed
# here, so the decoded text (and therefore the results) may differ — confirm
# which encoding is intended.
emails = pd.read_csv("emails.csv")
df = pd.DataFrame(emails)
spam = df['spam']
flod = df['fold']

# initialize
cv = TfidfVectorizer(stop_words='english', max_features=20000, token_pattern=r"(?u)\b[a-zA-Z]\w+\b")
dt_matrix = cv.fit_transform(df['text'])
[m, n] = dt_matrix.shape

# Fold 4 is the test set; every other fold is training data.
held_out = (flod == 4).to_numpy()
spam_rows = (spam == 1).to_numpy()
ham_rows = (spam == 0).to_numpy()

# Each matrix is transposed so that its columns are emails.
spam_train = dt_matrix[np.flatnonzero(~held_out & spam_rows), :].toarray().transpose()
spam_test  = dt_matrix[np.flatnonzero(held_out & spam_rows), :].toarray().transpose()
ham_train = dt_matrix[np.flatnonzero(~held_out & ham_rows), :].toarray().transpose()
ham_test  = dt_matrix[np.flatnonzero(held_out & ham_rows), :].toarray().transpose()
#spam_U, spam_sigma, spam_vt = np.linalg.svd(spam_train)
#ham_U, ham_sigma, ham_vt = np.linalg.svd(ham_train)

In [33]:
# Non-negative factorisation of each training matrix into W and H with
# 100 components; random_state=0 fixes the random initialisation.
# The third return value is the number of iterations performed.
s_W, s_H, s_n_iter = non_negative_factorization(spam_train, n_components=100, init='random', random_state=0)
h_W, h_H, h_n_iter = non_negative_factorization(ham_train, n_components=100, init='random', random_state=0)

In [34]:
# Dimensions of the NMF factors: spam W and H, then ham W and H.
for factor in (s_W, s_H, h_W, h_H):
    print(factor.shape)

(20000, 100)
(100, 1087)
(20000, 100)
(100, 3496)


In [35]:
# Classify the test documents using the NMF bases (columns of s_W and h_W)
# and print the confusion matrix with the derived metrics.
spam_counts = LSTSQDistance(s_W, h_W, spam_test)
tp, fn = spam_counts
ham_counts = LSTSQDistance(s_W, h_W, ham_test)
fp, tn = ham_counts
print("tp=",tp, "fn=",fn)
print("fp=",fp, "tn=",tn)
print()

# Total number of test documents (columns of the two test matrices).
N = spam_test.shape[1] + ham_test.shape[1]
#print(N, " " , tp+tn)
rule = "--------------------------------------------------"
print(rule)
print("\t\t\tIs a spam\tIs a ham")
print("Predicted as a spam\t", tp, "(TP)\t", fp, "(FP)")
print("\nPredicted as a ham\t", fn, "(FN)\t\t", tn, "(TN)")
print(rule)
accuracy = round((tp+tn) / N, 3)
print("Accuracy = " , accuracy)
precision = round((tp)/(tp + fp), 3)
print("Precision = " , precision)
recall = round((tp)/(tp + fn), 3)
print("Recall = " , recall)
print()

tp= 275 fn= 6
fp= 20 tn= 842

--------------------------------------------------
			Is a spam	Is a ham
Predicted as a spam	 275 (TP)	 20 (FP)

Predicted as a ham	 6 (FN)	 842 (TN)
--------------------------------------------------
Accuracy =  0.977
Precision =  0.932
Recall =  0.979

