### 載入所需的Libraries

In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from typing import List

### 載入資料

In [2]:
# read data from spam.csv

sms_data = pd.read_csv("spam.csv", encoding = "ISO-8859-1", engine='python')
sms_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# change label to from string to number
# "ham" --> 0, "spam" --> 1
sms_data.loc[sms_data["v1"]=="ham", "v1"]=0
sms_data.loc[sms_data["v1"]=="spam", "v1"]=1
sms_data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,"Go until jurong point, crazy.. Available only ...",,,
1,0,Ok lar... Joking wif u oni...,,,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,0,U dun say so early hor... U c already then say...,,,
4,0,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,,,
5568,0,Will Ì_ b going to esplanade fr home?,,,
5569,0,"Pity, * was in mood for that. So...any other s...",,,
5570,0,The guy did some bitching but I acted like i'd...,,,


In [7]:
# check how many spams and hams
counted_data = sms_data[["v1", "v2"]].groupby("v1").count()
counted_data = counted_data.reset_index()
counted_data.columns = ["label", "count"]
counted_data


Unnamed: 0,label,count
0,0,4825
1,1,747


### 切分資料
將資料依據label比例切分為training data與testing data

In [10]:
counted_data["ratio"]=counted_data["count"]/counted_data["count"].sum()
counted_data

sms_data = sms_data.rename(columns={"v1":"label"})
sms_train = sms_data.sample(frac=counted_data[counted_data["label"]==0]["ratio"][0])
sms_test = sms_data.drop(sms_train.index)

print(sms_train.groupby("label").count())
print(sms_test.groupby("label").count())

         v2  Unnamed: 2  Unnamed: 3  Unnamed: 4
label                                          
0      4176          38          10           6
1       649           4           2           0
        v2  Unnamed: 2  Unnamed: 3  Unnamed: 4
label                                         
0      649           7           0           0
1       98           1           0           0


In [11]:
sms_train.fillna('', inplace=True)
sms_test.fillna('', inplace=True)

### 資料預處理
* 將所有字詞轉為小寫
* 移除所有數字、標點符號

In [12]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    
    for i in range(len(df)):
        # make all content to lowercase
        df.at[i, "v2"] = df.iloc[i]["v2"].lower()
        df.at[i, "Unnamed: 2"] = df.iloc[i]["Unnamed: 2"].lower()
        df.at[i, "Unnamed: 3"] = df.iloc[i]["Unnamed: 3"].lower()
        df.at[i, "Unnamed: 4"] = df.iloc[i]["Unnamed: 4"].lower()
        
        # remove all numbers
        pattern = r"\d+"
        df.at[i, "v2"] = re.sub(pattern, " ", df.iloc[i]["v2"])
        df.at[i, "Unnamed: 2"] = re.sub(pattern, " ", df.iloc[i]["Unnamed: 2"])
        df.at[i, "Unnamed: 3"] = re.sub(pattern, " ", df.iloc[i]["Unnamed: 3"])
        df.at[i, "Unnamed: 4"] = re.sub(pattern, " ", df.iloc[i]["Unnamed: 4"])
            
        # remove all punctuations
        punctuation_list = ['.', ',', '!', '?']
        for punc in punctuation_list:
            df.at[i, "v2"] = df.iloc[i]["v2"].replace(punc, " ")
            df.at[i, "Unnamed: 2"] = df.iloc[i]["Unnamed: 2"].replace(punc, " ")
            df.at[i, "Unnamed: 3"] = df.iloc[i]["Unnamed: 3"].replace(punc, " ")
            df.at[i, "Unnamed: 4"] = df.iloc[i]["Unnamed: 4"].replace(punc, " ")
    
    return df

processed_train = preprocess(sms_train)
processed_test = preprocess(sms_test)

print("Train:")
print(processed_train.head())
print("Test:")
print(processed_test.head())

Train:
      label                                                 v2 Unnamed: 2  \
120     1.0            Ok, be careful ! Don't text and drive !              
3489    0.0  I probably won't eat at all today. I think I'm...              
3775    0.0                            Did you see that film:)              
3722    0.0  I want to sent  &lt;#&gt; mesages today. Thats...              
3258    1.0  Designation is software developer and may be s...              

     Unnamed: 3 Unnamed: 4  
120                         
3489                        
3775                        
3722                        
3258                        
Test:
    label                                                 v2 Unnamed: 2  \
10    0.0          Ela kano.,il download, come wen ur free..              
17    0.0  Hi frnd, which is best way to avoid missunders...              
29    0.0  Hey I am really horny want to chat or see me n...              
36    0.0  Hurt me... Tease me... Make me cry... B

In [13]:
# split data into x_train, y_train, x_test, y_test
label_train,v2_train,u2_train,u3_train,u4_train = zip(*processed_train.values)
label_test,v2_test,u2_test,u3_test,u4_test = zip(*processed_test.values)
# check numbers of unique word in the corpus
corpos_train = list(set(" ".join(list(v2_train + u2_train + u3_train + u4_train)).split()))
corpos_test = list(set(" ".join(list(v2_test + u2_test + u3_test + u4_test)).split()))
len(set(corpos_train+corpos_test))

11655

In [14]:
punctuation_list = ['.', ',', '!', '?']
pattern = r"\d+"
new_corpos_train = []
for punc in punctuation_list:
    for word in corpos_train:
        tmp_w = re.sub(pattern, "", word.replace(punc, "")).strip()
        if tmp_w:
            new_corpos_train.append(tmp_w)
corpos_train = new_corpos_train
corpos_train
        

['Dino',
 'allows',
 'bawling',
 'Saturday,',
 'side',
 'wtc',
 'finish',
 'GOIN',
 'u',
 'telly',
 'mon',
 'wish!',
 'Lr',
 'morphine',
 'forgets',
 'mental',
 'BT-national-rate',
 'especially',
 'ARE',
 'others',
 'intelligent,',
 'misss',
 'Helloooo',
 'against',
 'keeps',
 'Theory:',
 'Free-message:',
 'match',
 'Kay',
 '@"',
 'Unsubscribe',
 'current',
 'dismay',
 'FRND',
 'meso',
 'weird,',
 'Hol',
 'ready,',
 'stars',
 'battery',
 'nuclear',
 'goigng',
 'words',
 'p',
 'building',
 'shitload',
 '>>>More',
 'half',
 'prometazine',
 'Hav',
 'Gnarls',
 'NITE!!',
 'wks',
 'vomiting',
 'bringing',
 'simple',
 'control',
 'generally',
 'allowed',
 'pimpleseven',
 'BBD(pooja)',
 'should',
 'way!',
 'surprise',
 'haul',
 'anyone',
 'Serious',
 'nw,',
 'nÌÂte?',
 'bathe',
 'Okey',
 'msgs,',
 'frndship',
 'Ee',
 '(/M)',
 'PROBPOP',
 'cut',
 'later',
 'IMAT',
 'msgsometext',
 'stock',
 'thou',
 'yo',
 'pissed',
 'shuhui',
 'Dave',
 'treacle?',
 'effects',
 'mofo',
 'philosophy',
 'finally'

In [None]:
del tmp

### TF-IDF
可以發現種共有7708個字詞，這裡使用TF-IDF將來選取最高分的前2000個字詞
(若忘記的學員可參考先前TF-IDF課程章節或[此篇教學](https://ithelp.ithome.com.tw/articles/10228815?sc=iThelpR))

In [36]:
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import math
from sklearn.preprocessing import normalize


n = 2000

vectorizer = TfidfVectorizer(sublinear_tf=False, stop_words=None,  smooth_idf=True, norm='l2')

tfidf = vectorizer.fit_transform(corpos_train+corpos_test)
print(f"Number of unique word of tfidf_train: {len(vectorizer.vocabulary_)}")
tfidf_sorting = np.argsort(tfidf.toarray()).flatten()[::-1]
top_n = np.array(vectorizer.get_feature_names())[tfidf_sorting][:n]
print(top_n)

# tfidf_train = vectorizer.fit_transform(corpos_train)
# print(f"Number of unique word of tfidf_train: {len(vectorizer.vocabulary_)}")
# tfidf_train_sorting = np.argsort(tfidf_train.toarray()).flatten()[::-1]
# top_n_train = np.array(vectorizer.get_feature_names())[tfidf_train_sorting][:n]
# print(top_n_train)

# tfidf_test = vectorizer.fit_transform(corpos_test)
# print(f"Number of unique word of tfidf_test: {len(vectorizer.vocabulary_)}")
# tfidf_test_sorting = np.argsort(tfidf_test.toarray()).flatten()[::-1]
# top_n_test = np.array(vectorizer.get_feature_names())[tfidf_test_sorting][:n]
# print(top_n_test)


Number of unique word of tfidf_train: 6828
['dabbles' 'ûò' 'frnds' ... 'bored' 'borin' 'boring']


### 建立共現矩陣

In [62]:
def create_co_matrix(corpus: List[str], vocab_list: List[str], word2idx: dict,
                     window_size: int=1, use_weighting: bool=False, verbose: bool=False) -> np.ndarray:
    '''Function to create co-occurrence matrix
    '''
    #initialize co-occurrence matrix
    co_matrix = np.zeros(shape=(len(vocab_list), len(vocab_list)), dtype=np.int32)
    sms_ids = []
    for idx, sms in enumerate(corpus):
        if sms and sms in word2idx:
#             print(f"idx:{idx}, sms:{sms}")
            sms_ids.insert(idx, word2idx[sms])
        

    for idx, sms in enumerate(corpus):
        for center_i, center_word_id in enumerate(sms_ids):
            if center_word_id:

                left_idx = idx - window_size if idx - window_size >= 0 else 0
                context_ids = sms[left_idx:idx]
                print(f"center_word_id: {center_word_id}, left_id:{left_id}")
                for left_i, left_id in enumerate(context_ids):
                    co_matrix[center_word_id, left_id] += 1
                    co_matrix[left_id, center_word_id] += 1
        
        if verbose:
            if idx != 0 and idx%500 == 0:
                    print(f"finishing {idx+1}/{len(corpus)}")
    print("Done")
#     if use_weighting:
#         if use weighting, then we set the co-occurrence with the word itself to 1.0
        ##<your code>###

    return co_matrix

co_matrix = create_co_matrix(corpos_train, vectorizer.get_feature_names(), vectorizer.vocabulary_,
                            window_size=3, use_weighting=True, verbose=True)

co_matrix
print(vectorizer.vocabulary_["allows"])
sms_ids=[]
sms_ids.insert(1,vectorizer.vocabulary_["allows"])
for center_i, center_word_id in enumerate(sms_ids):
    print(f"center_i: {center_i}, center_word_id: {center_word_id}")

TypeError: create_co_matrix() got multiple values for argument 'window_size'

### 建立PPMI矩陣

In [None]:
#定義正向點間互資訊

def ppmi(co_matrix: np.ndarray, eps: float=1e-8, verbose: bool=False):
    M = np.zeros_like(co_matrix, dtype=np.float32)
    N = np.sum(co_matrix)
    S = np.sum(co_matrix, axis=0)
    total = co_matrix.shape[0]*co_matrix.shape[1]

    cnt = 0
    
    for i in range(co_matrix.shape[0]):
        for j in range(co_matrix.shape[1]):
            pmi = np.log2(co_matrix[i, j]*N / (S[i]*S[j] + eps))
            M[i, j] = max(0, pmi)
            
            if verbose:
                cnt += 1
                if cnt % 10 == 0 or cnt == total:
                    print(f"{cnt}/{total} Done")
    
    return M

ppmi_matrix = ppmi(co_matrix, verbose=False)
ppmi_matrix

### 使用SVD降維
利用sklearn中的TruncatedSVD對co-occurrence matrix進行降維，並利用variance來找出最適合的維度
[參考文獻](https://medium.com/swlh/truncated-singular-value-decomposition-svd-using-amazon-food-reviews-891d97af5d8d)

(讀者可以嘗試使用SVD對PPMI進行降維)

In [None]:
# Program to find the optimal number of components for Truncated SVD
n_comp = range(10,150,10) # list containing different values of components
variance_sum = [] # explained variance ratio for each component of Truncated SVD

for dim in n_comp:
    U, S, V = np.linalg.svd(ppmi_matrix)
    variance_sum.append(V)

plt.plot(n_comp, variance_sum)
plt.xlabel('Number of components')
plt.ylabel("Explained Variance")
plt.title("Plot of Number of components v/s explained variance")
plt.show()

In [None]:
# choose 140 as final dimension to reduce to 
# 利用上述找到的最適合dimension來對co-occurrence matrix進行降維
U_reduce = U[:, 0:140]
U_reduce

### 使用KNN模型進行分類
在進行分類之前，先利用簡單的詞向量平均來計算文本向量

[參考文獻](https://medium.com/ai-academy-taiwan/nlp-%E4%B8%8D%E5%90%8C%E8%A9%9E%E5%90%91%E9%87%8F%E5%9C%A8%E6%96%87%E6%9C%AC%E5%88%86%E9%A1%9E%E4%B8%8A%E7%9A%84%E8%A1%A8%E7%8F%BE%E8%88%87%E5%AF%A6%E4%BD%9C-e72a2daecfc)

In [None]:
# get doc vector via take mean of all word vectors inside the corresponding document

def make_doc_vectors(corpus: List[str], word2idx: dict, vocab_list: List) -> List[np.ndarray]:
    
    # vectorizing data 
    # and make document vector by take mean to all word vecto
    doc_vec = []
    empty_doc_list = []
    for i, sms_msg in enumerate(corpus):
        sms_msg = [word2idx[word] for word in sms_msg.split() if word in vocab_list] #tokenize
        if len(sms_msg) > 0:
            sms_msg = np.array([re_co_matrix[ids] for ids in sms_msg]) #vectorize
            doc_vec.append(sms_msg.mean(axis=0))
        else:
            empty_doc_list.append(i)
            print(f"document {i} doesn't contain word in vocab_list")
            print(corpus[i])
            print("\n")
        
    return np.vstack(doc_vec), empty_doc_list

word2idx = vectorizer.vocabulary_
vocab_list = vectorizer.get_feature_names()

doc_vec_train, missing_train_list = make_doc_vectors(x_train, word2idx, vocab_list)
print("="*50)
doc_vec_test, missing_test_list = make_doc_vectors(x_test, word2idx, vocab_list)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# training
y_train_filter = np.delete(np.array(y_train), missing_train_list)

###<your code>###

# testing
y_test_filter = np.delete(np.array(y_test), missing_test_list)
###<your code>###

In [None]:
print(f"train acc: {np.sum(train_pred == y_train_filter) / len(y_train_filter)}")
print(f"train acc: {np.sum(test_pred == y_test_filter) / len(y_test_filter)}")