In [228]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import collections, functools, itertools

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import f1_score, classification_report

# Read Data

In [229]:
with open("spam.csv", encoding="latin-1") as f:
    df = pd.read_csv(f)
df.columns = ["label", "text", "dummy1", "dummy2", "dummy3"]

In [230]:
collections.Counter(df["label"])

Counter({'ham': 4825, 'spam': 747})

In [231]:
for col in ["dummy1", "dummy2", "dummy3"]:
    print(np.sum(~df[col].isnull()))

50
12
6


In [232]:
df.drop(["dummy1", "dummy2", "dummy3"], axis=1, inplace=True)
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## train test split

In [233]:
train_idx, test_idx = train_test_split(np.arange(df.shape[0]), test_size=0.2, random_state=42, shuffle=True, 
                                       stratify=df.label.values)

In [234]:
collections.Counter(df.loc[train_idx].label)

Counter({'ham': 3859, 'spam': 598})

In [235]:
collections.Counter(df.loc[test_idx].label)

Counter({'ham': 966, 'spam': 149})

In [236]:
df_train = df.loc[train_idx]
df_test = df.loc[test_idx]

In [237]:
labelbinarizer = LabelBinarizer()
Y = labelbinarizer.fit_transform(df["label"])

Y_train = Y[train_idx]
Y_test = Y[test_idx]

len(Y_train), len(Y_test)

(4457, 1115)

# data cleaning

In [238]:
import re
import nltk
#from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [239]:
def preProcess(text):
    text = text.lower() # lower case
    text = re.sub('<[^<>]+>', ' ', text);  # contain '<' or '>')... replace with a space
    text = re.sub('[0-9]+', 'number', text)  # mask for numbers
    text = re.sub('(http|https)://[^\s]*', 'httpaddr', text)  # mask for https header
    text = re.sub('[$]+', 'dollar', text)  # nask for '$' sign
    return text

def text2token(raw_text):
    text = preProcess(raw_text)
    text = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', text)
    tokens = []
    for subtext in text:
        # Remove any non alphanumeric characters
        subtext = re.sub('[^a-zA-Z0-9]', '', subtext);
        if not len(subtext): 
            continue
        tokens += word_tokenize(subtext)
    return tokens

In [240]:
df_train["clean_text"] = df_train["text"].apply(text2token)
df_test["clean_text"] = df_test["text"].apply(text2token)

In [241]:
df_train["clean_text"].head()

184                      [going, on, nothing, great, bye]
2171                [i, wont, so, wat, s, wit, the, guys]
5422    [ok, k, sry, i, knw, number, siva, tats, y, i,...
4113    [where, are, you, what, do, you, do, how, can,...
4588    [have, you, not, finished, work, yet, or, some...
Name: clean_text, dtype: object

# count-based SVM

In [242]:
corpus = list(itertools.chain(*df_train["clean_text"]))
vocab_dict = collections.Counter(corpus)

In [243]:
threshold = 5

len([v for _,v in vocab_dict.items() if v >= threshold])

1552

In [244]:
word2idx = {}
idx2word = {}

cnt = 0
for w, v in vocab_dict.items():
    if v >= threshold: # record the word which appears more than threshold times
        word2idx[w] = cnt
        idx2word[w] = w
        cnt +=1

len(word2idx), len(idx2word)

(1552, 1552)

In [245]:
def getfeature(wordlist):
    feature = np.zeros(len(word2idx))
    for w in wordlist:
        if w not in word2idx:
            continue
        if feature[word2idx[w]] == 0:
            feature[word2idx[w]] = 1
    return feature

In [246]:
X_train = np.array(df_train["clean_text"].apply(getfeature).values.tolist())
X_test = np.array(df_test["clean_text"].apply(getfeature).values.tolist())
# save data frame for random forest model
X_train_RF = X_train
X_test_RF = X_test
Y_train_RF = Y_train
Y_test_RF = Y_test

In [247]:
from sklearn import svm 

SVM = svm.SVC(C=0.1, kernel='rbf')
SVM.fit(X_train, Y_train.ravel())

SVC(C=0.1)

In [248]:
train_predictions = SVM.predict(X_train)
train_acc = np.sum(train_predictions == Y_train.ravel())/Y_train.shape[0]
print(f'Training accuracy = {100*train_acc:.2f}%')

test_predictions = SVM.predict(X_test)
test_acc = np.sum(test_predictions == Y_test.ravel())/Y_test.shape[0]
print(f'Testing accuracy = {100*test_acc:.2f}%')

Training accuracy = 94.73%
Testing accuracy = 94.08%


In [249]:
print(f1_score(Y_test, test_predictions))

0.7155172413793103


In [250]:
print(classification_report(Y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       966
           1       1.00      0.56      0.72       149

    accuracy                           0.94      1115
   macro avg       0.97      0.78      0.84      1115
weighted avg       0.94      0.94      0.93      1115



# Word2Vec

In [251]:
from gensim.models import Word2Vec

dimension = 100
model = Word2Vec(sentences=itertools.chain(df_train["clean_text"]), 
                 size=dimension, window=5, min_count=1, workers=2)

In [252]:
df_train["text_len"] = df_train["clean_text"].map(len)
df_train["text_len"].describe(percentiles=[0.05,0.95])

count    4457.000000
mean       16.054745
std        11.725931
min         0.000000
5%          4.000000
50%        12.000000
95%        34.000000
max       190.000000
Name: text_len, dtype: float64

In [253]:
def getembedding(wordlist, maxlen=35):
    normalized_wordlist = wordlist[:maxlen]
    if len(normalized_wordlist) < maxlen: # pad to maxlen
        normalized_wordlist += [" "] * (maxlen-len(normalized_wordlist))
    return np.array([model.wv[w] if w in model.wv else np.zeros(dimension) for w in normalized_wordlist])

In [254]:
X_train = np.array(df_train["clean_text"].apply(getembedding).values.tolist())
X_test = np.array(df_test["clean_text"].apply(getembedding).values.tolist())
X_train.shape, X_test.shape

((4457, 35, 100), (1115, 35, 100))

# count-based Random Forest

In [255]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

Random_forest = RandomForestClassifier(n_estimators=50)
Random_forest.fit(X_train_RF, Y_train_RF.ravel())

randomForest_predict = Random_forest.predict(X_test_RF)
randomForest_score = metrics.accuracy_score(Y_test_RF.ravel(), randomForest_predict)
print("(Testing) Random Forest Score :", randomForest_score)

Y_hat = Random_forest.predict(X_test_RF)
n = np.size(Y_test_RF)
print('Testing Accuarcy: {:.6f}％ ({})'.format(sum(np.int_(Y_hat==Y_test_RF.ravel()))*100./n, Random_forest.__module__))

n=np.size(Y_train_RF.ravel())
Y_hat_RF = Random_forest.predict(X_train_RF)
print('Training Accuarcy RF: {:.6f}％'.format(sum(np.int_(Y_hat_RF==Y_train_RF.ravel()))*100./n))


(Testing) Random Forest Score : 0.9811659192825112
Testing Accuarcy: 98.116592％ (sklearn.ensemble._forest)
Training Accuarcy RF: 99.977563％


# Random Forest 2

In [256]:
import string
import nltk
stopwords= nltk.corpus.stopwords.words('english')

def clean_to_word(sentence):
    s = "".join(x for x in sentence if x not in string.punctuation)
    temp = s
    temp = s.lower().split(' ')
    temp2 = [x for x in temp if x not in stopwords]
    return temp2

In [257]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(analyzer=clean_to_word)
vector_output = vect.fit_transform(df['text'])
# print(vect.get_feature_names()[0:3])
# print (vector_output [0:3])

In [258]:
pd.DataFrame(vector_output.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9339,9340,9341,9342,9343,9344,9345,9346,9347,9348
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.184905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.147150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [259]:
# Feature Engineering, can explore further later
import re
df['len'] = df['text'].apply(lambda x : len(x) - x.count(" "))
df['long_number'] = df['text'].apply(lambda x : len(re.findall('\d{7,}',x)))

In [260]:
import string
def count_punct (text):
    count = sum([1 for x in text if x in string.punctuation])
    pp = round(100*count/(len(text)-text.count(" ")),3)
    return pp

df['punct'] = df['text'].apply(lambda x : count_punct(x))

testlink = "hello buddwwy http how com are you.co ww ww."

def  website (text):
    if (len(re.findall('www|http|com|\.co',text))>0):
        return 1
    else:
        return 0
df['website'] = df['text'].apply(lambda x : website(x))

In [261]:
x_features = pd.concat([df['len'],df['long_number'],df['punct'],df['website'],pd.DataFrame(vector_output.toarray())],axis=1)

In [264]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score

x_train, x_test, y_train, y_test = train_test_split(x_features,df['label'])
rf = RandomForestClassifier(n_estimators=100,max_depth=None,n_jobs=-1)
rf_model = rf.fit(x_train,y_train)
sorted(zip(rf_model.feature_importances_,x_train.columns),reverse=True)[0:10]

[(0.0663983964291311, 'long_number'),
 (0.03513170729545177, 'len'),
 (0.028647347861842486, 8514),
 (0.027537843805422076, 2208),
 (0.025376108828532428, 1932),
 (0.01944541435765632, 3555),
 (0.012696884982000707, 5471),
 (0.012604581945657198, 7805),
 (0.012492501311375826, 6540),
 (0.012094708202485586, 6880)]

In [266]:
y_pred=rf_model.predict(x_test)
precision,recall,fscore,support =score(y_test,y_pred,pos_label='spam', average ='binary')
print('Precision : {} / Recall : {} / fscore : {} / Accuracy: {}'.format(round(precision,6),round(recall,6),round(fscore,6),round((y_pred==y_test).sum()/len(y_test),6)))

Precision : 1.0 / Recall : 0.842391 / fscore : 0.914454 / Accuracy: 0.979182


# TextCNN

In [21]:
import torch
import torch.nn as nn
from torch.utils import data
from torchsummary import summary
from utility_torch import Timer, MyDataset

from IPython import display

## BaseLine

In [112]:
class BaseModel(nn.Module):
    def __init__(self,**kwargs):
        super(BaseModel,self).__init__(**kwargs)
        self.Flatten = nn.Flatten()
        self.FC = nn.Sequential(
            nn.Linear(in_features=100*35,out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128,out_features=32),
            nn.ReLU(),
            nn.Linear(in_features=32,out_features=1)
        )
        
        self.sigmoid = nn.Sigmoid() 
        
    def forward(self, X):
        # concat + fully connected
        X = self.Flatten(X)
        output = self.FC(X)
        output = self.sigmoid(output)
        return output

In [23]:
Net = BaseModel()
summary(Net,input_size=(1,35,100))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                 [-1, 3500]               0
            Linear-2                  [-1, 128]         448,128
              ReLU-3                  [-1, 128]               0
            Linear-4                   [-1, 32]           4,128
              ReLU-5                   [-1, 32]               0
            Linear-6                    [-1, 1]              33
           Sigmoid-7                    [-1, 1]               0
Total params: 452,289
Trainable params: 452,289
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.03
Params size (MB): 1.73
Estimated Total Size (MB): 1.77
----------------------------------------------------------------


## Conv1D Block

In [24]:
class Conv1dBlock(nn.Module):
    """1-dimension convolution"""
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, **kwargs):
        super(Conv1dBlock, self).__init__(**kwargs)
        self.conv1d = nn.Conv1d(in_channels,out_channels,kernel_size, stride, padding)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)
        
    def forward(self, X):
        # batch_size seq_len embedding_size -> batch_size embedding_size seq_len
        X = X.transpose(2,1)
        X = self.conv1d(X)
        X = self.relu(X)
        output = torch.squeeze(self.pool(X),axis=-1) 
        return output

In [25]:
Net = Conv1dBlock(in_channels=100, out_channels=16, kernel_size=2, stride=1, padding="valid")
summary(Net, input_size=(35,100))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1               [-1, 16, 34]           3,216
              ReLU-2               [-1, 16, 34]               0
 AdaptiveMaxPool1d-3                [-1, 16, 1]               0
Total params: 3,216
Trainable params: 3,216
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.01
Params size (MB): 0.01
Estimated Total Size (MB): 0.03
----------------------------------------------------------------


## TextCNN

In [26]:
class TextCNN(nn.Module):
    """TEXTCNN"""
    def __init__(self, embedding_dim, num_channels, kernel_sizes, **kwargs):
        super(TextCNN, self).__init__(**kwargs)
        self.convs = nn.Sequential()
        for idx, (c, k) in enumerate(zip(num_channels, kernel_sizes)):
            self.convs.add_module(f'conv{idx+1}', 
                                  Conv1dBlock(in_channels=embedding_dim, out_channels=c, kernel_size=k,
                                              stride=1, padding="valid")
                                )
        self.dropout = nn.Dropout(0.5)
        self.FC = nn.Sequential(
            nn.Linear(in_features=c*len(kernel_sizes),out_features=32),
            nn.ReLU(),
            nn.Linear(in_features=32,out_features=16),
            nn.ReLU(),
            nn.Linear(in_features=16,out_features=1)
        )
        self.sigmoid = nn.Sigmoid() 
        
    def forward(self, X):
        encoding = torch.cat([conv(X) for conv in self.convs],
                             axis=1)
        output = self.dropout(self.FC(encoding))
        output = self.sigmoid(output)
        return output

In [27]:
Net = TextCNN(embedding_dim=100, num_channels=[16, 16, 16, 16], kernel_sizes=[2, 5, 10, 20])
summary(Net, input_size=(35,100))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1               [-1, 16, 34]           3,216
              ReLU-2               [-1, 16, 34]               0
 AdaptiveMaxPool1d-3                [-1, 16, 1]               0
       Conv1dBlock-4                   [-1, 16]               0
            Conv1d-5               [-1, 16, 31]           8,016
              ReLU-6               [-1, 16, 31]               0
 AdaptiveMaxPool1d-7                [-1, 16, 1]               0
       Conv1dBlock-8                   [-1, 16]               0
            Conv1d-9               [-1, 16, 26]          16,016
             ReLU-10               [-1, 16, 26]               0
AdaptiveMaxPool1d-11                [-1, 16, 1]               0
      Conv1dBlock-12                   [-1, 16]               0
           Conv1d-13               [-1, 16, 16]          32,016
             ReLU-14               [-1,

# LSTM

## LSTM block

In [28]:
class LSTMBlock(nn.Module):
    def __init__(self,embedding_dim, hidden_size, num_layers, bi, **kwargs):
        super(LSTMBlock, self).__init__(**kwargs)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, 
                            num_layers=num_layers, 
                            batch_first=True,bidirectional=bi)
        
    def forward(self, X):
        # output: batch_size,seq_len,hidden_size*num_directions
        # h_n, c_n: num_layers*num_directions,batch_size,hidden_size
        output, (h_n, c_n) = self.lstm(X)  
        #print(output.shape,h_n.shape,c_n.shape)
        #torch.cat([output[:,0,:],output[:,-1,:]],axis=1)
        return h_n

In [39]:
Net = LSTMBlock(embedding_dim=100, hidden_size=16, num_layers=2, bi=False)
_input = torch.randn(size=(32,35,100))
_output = Net(_input)
print(_output.shape)

torch.Size([2, 32, 16])


## LSTM + FC

In [43]:
class LSTM_FC(nn.Module):
    def __init__(self, embedding_dim, hidden_size, num_layers, bi=False, **kwargs):
        super(LSTM_FC,self).__init__(**kwargs)
        self.lstmblock = LSTMBlock(embedding_dim=embedding_dim, 
                                   hidden_size=hidden_size, 
                                   num_layers=num_layers, bi=bi)
        self.Flatten = nn.Flatten()
        self.FC = nn.Sequential(
            nn.Linear(in_features=128,out_features=32),
            nn.Linear(in_features=32,out_features=1)
        )
        
        self.sigmoid = nn.Sigmoid() 
    
    def forward(self, X):
        X = self.lstmblock(X)
        # num_layers*num_directions,batch_size,hidden_size --> 
        # batch_size, num_layers*num_directions, hidden_size
        X = X.transpose(1,0)  
        X = self.Flatten(X)
        output = self.FC(X)
        output = self.sigmoid(output)
        return output

In [44]:
Net = LSTM_FC(100, 64, 2)

_input = torch.randn(size=(32,35,100))
_output = Net(_input)
print(_output.shape)

torch.Size([32, 1])


# Train

## metrics

In [45]:
def vanila_acc(y_hat, y, threshold):
    """Compute the number of correct predictions."""
    y_pred = torch.zeros(size=y_hat.shape,dtype=y_hat.dtype)
    y_pred[y_hat > threshold] = 1
    cmp = y_pred == y
    return float(torch.sum(cmp))/y.numel()

In [46]:
#sanity check
y_hat = torch.tensor([[0.57],[0.38],[0.49],[0.55],[0.40]])
y = torch.tensor([[1],[0],[0],[0],[0]])
vanila_acc(y_hat,y,threshold=0.5)

0.8

In [47]:
def evaluate(net, data_iter, loss, acc_func, threshold):
    """Compute the accuracy for a model on a dataset"""
    net.eval()  # Set the model to evaluation mode
    metric = [0]*4 # loss acc recall, batch_num
    with torch.no_grad(): # stop updating the model
        for i, (X, y) in enumerate(data_iter):
            y_hat = net(X.type(torch.float32))
            metric[0] += loss(y_hat,y.type(torch.float32))
            metric[1] += acc_func(y_hat, y.type(torch.float32),threshold=threshold)
            metric[2] += torch.sum(torch.mul(y_hat, y.type(torch.float32))) / torch.sum(y_hat)
            metric[3] += 1
    return metric[0]/metric[3], metric[1]/metric[3], metric[2]/metric[3]

## main function for training

In [48]:
def train_model(net, train_iter, val_iter, num_epochs, lr, patience, savepath):
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv1d or type(m) == nn.Conv2d:
            torch.nn.init.xavier_uniform_(m.weight)
    
    net.apply(init_weights)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr, betas=(0.9, 0.999))
    loss = nn.BCELoss()
    since_last_best, best_val = 0, (float('-inf'),0)
    
    for epoch in range(num_epochs):
        timer = Timer() # record time each epoch
        metric = [0]*5 # loss acc recall batch_num
        net.train()
        for i, (X, y) in enumerate(train_iter):
            print(f'{i+1}/{len(train_iter)}'+'\r',end='')
            timer.start()
            optimizer.zero_grad()
            y_hat = net(X.type(torch.float32))
            l = loss(y_hat, y.type(torch.float32))
            l.backward()
            optimizer.step()
            # evaluate on train_iter
            with torch.no_grad():
                metric[0] += l
                metric[1] += vanila_acc(y_hat,y,threshold=0.5)
                metric[2] += torch.sum(torch.mul(y_hat, y.type(torch.float32))) / torch.sum(y_hat)
                metric[3] += 1  # tally batch num
                metric[4] += X.shape[0]  # tally observation
            timer.stop()
        
        # evaluate on val_iter
        val_l, val_acc, val_recall = evaluate(net, val_iter, loss=loss, acc_func=vanila_acc, threshold=0.5)
        val_f1 = 2 / (1/val_acc + 1/val_recall)
        
        # early stopping
        if val_f1 > best_val[0]: 
            best_val = (val_f1, epoch+1)
            since_last_best = 0
            torch.save(net.state_dict(), savepath) # Save model
        else:
            since_last_best +=1
            if since_last_best > patience:
                print(f'early stopping! best_val:{best_val[0]:.3f} epoch{best_val[1]}')
                break
        
        # on_epoch_end
        print(f'epoch{epoch+1} --> ',
              f'{metric[4] / timer.sum():.1f} examples/sec',
              f'train_loss {metric[0]/metric[3]:.3f} train acc {metric[1]/metric[3]:.3f} train recall {metric[2]/metric[3]:.3f}',
              f'val_loss {val_l:.3f} val acc {val_acc:.3f} val recall {val_recall:.3f} ')
    return net

## load data and train model

In [49]:
batch_size, num_epochs, lr, seed, patience = 32, 100, 0.001, 42, 5

In [50]:
train_idx_NN, val_idx_NN = train_test_split(np.arange(X_train.shape[0]), test_size=0.2, random_state=42, 
                                            shuffle=True, stratify=Y_train)

train_iter = data.DataLoader(MyDataset(X_train[train_idx_NN],Y_train[train_idx_NN]),
                             batch_size, shuffle=True,num_workers=0)
val_iter = data.DataLoader(MyDataset(X_train[val_idx_NN],Y_train[val_idx_NN]), 
                           batch_size, shuffle=False,num_workers=0)
test_iter = data.DataLoader(MyDataset(X_test,Y_test), batch_size, shuffle=False,num_workers=0)

for train, val, test in zip(train_iter, val_iter, test_iter):
    print(train[0].shape,val[0].shape,test[0].shape)
    print(train[1].shape,val[1].shape, test[1].shape)
    print(type(train[0]))
    print(collections.Counter(train[1].numpy().ravel()))
    break

torch.Size([32, 35, 100]) torch.Size([32, 35, 100]) torch.Size([32, 35, 100])
torch.Size([32, 1]) torch.Size([32, 1]) torch.Size([32, 1])
<class 'torch.Tensor'>
Counter({0: 28, 1: 4})


### BaseLine

In [113]:
savepath = "FC.pth"

In [114]:
Net = BaseModel()
trained_net = train_model(Net, train_iter, val_iter, num_epochs, lr, patience,
                          savepath=savepath)

epoch1 -->  4429.2 examples/sec train_loss 0.447 train acc 0.865 train recall 0.159 val_loss 0.315 val acc 0.866 val recall 0.233 
epoch2 -->  4660.1 examples/sec train_loss 0.222 train acc 0.907 train recall 0.407 val_loss 0.182 val acc 0.941 val recall 0.575 
epoch3 -->  4594.3 examples/sec train_loss 0.108 train acc 0.970 train recall 0.696 val_loss 0.136 val acc 0.959 val recall 0.739 
epoch4 -->  4753.4 examples/sec train_loss 0.062 train acc 0.984 train recall 0.825 val_loss 0.131 val acc 0.963 val recall 0.774 
epoch5 -->  4684.5 examples/sec train_loss 0.040 train acc 0.990 train recall 0.876 val_loss 0.132 val acc 0.962 val recall 0.795 
epoch6 -->  4594.5 examples/sec train_loss 0.027 train acc 0.993 train recall 0.910 val_loss 0.134 val acc 0.963 val recall 0.793 
epoch7 -->  3954.3 examples/sec train_loss 0.019 train acc 0.995 train recall 0.932 val_loss 0.138 val acc 0.963 val recall 0.806 
epoch8 -->  3847.8 examples/sec train_loss 0.011 train acc 0.998 train recall 0.942

### TextCNN

In [70]:
savepath = "textcnn.pth"
embedding_dim, num_channels, kernel_sizes = 100, [16,16,16,16], [2, 5, 10, 20]

In [71]:
Net = TextCNN(embedding_dim, num_channels, kernel_sizes)
trained_net = train_model(Net, train_iter, val_iter, num_epochs, lr, patience,
                         savepath=savepath)

epoch1 -->  2533.4 examples/sec train_loss 0.554 train acc 0.867 train recall 0.136 val_loss 0.443 val acc 0.866 val recall 0.141 
epoch2 -->  2529.8 examples/sec train_loss 0.500 train acc 0.866 train recall 0.156 val_loss 0.337 val acc 0.886 val recall 0.227 
epoch3 -->  2569.8 examples/sec train_loss 0.429 train acc 0.907 train recall 0.227 val_loss 0.206 val acc 0.951 val recall 0.417 
epoch4 -->  2790.5 examples/sec train_loss 0.398 train acc 0.919 train recall 0.262 val_loss 0.174 val acc 0.960 val recall 0.493 
epoch5 -->  2614.9 examples/sec train_loss 0.390 train acc 0.922 train recall 0.273 val_loss 0.180 val acc 0.959 val recall 0.473 
epoch6 -->  2724.5 examples/sec train_loss 0.397 train acc 0.921 train recall 0.269 val_loss 0.149 val acc 0.963 val recall 0.569 
epoch7 -->  1690.9 examples/sec train_loss 0.380 train acc 0.923 train recall 0.282 val_loss 0.162 val acc 0.963 val recall 0.511 
epoch8 -->  1610.2 examples/sec train_loss 0.375 train acc 0.925 train recall 0.287

### LSTM

In [66]:
savepath = "lstm.pth"
embedding_dim, hidden_size, num_layers = 100, 64, 2

In [53]:
Net = LSTM_FC(embedding_dim=embedding_dim, hidden_size=hidden_size, num_layers=num_layers)
trained_net = train_model(Net, train_iter, val_iter, num_epochs, lr, patience,
                          savepath=savepath)

epoch1 -->  787.0 examples/sec train_loss 0.408 train acc 0.859 train recall 0.134 val_loss 0.396 val acc 0.866 val recall 0.135 
epoch2 -->  747.4 examples/sec train_loss 0.395 train acc 0.867 train recall 0.134 val_loss 0.394 val acc 0.866 val recall 0.135 
epoch3 -->  577.6 examples/sec train_loss 0.396 train acc 0.866 train recall 0.135 val_loss 0.394 val acc 0.866 val recall 0.135 
epoch4 -->  573.4 examples/sec train_loss 0.394 train acc 0.866 train recall 0.136 val_loss 0.403 val acc 0.866 val recall 0.135 
epoch5 -->  579.9 examples/sec train_loss 0.390 train acc 0.866 train recall 0.143 val_loss 0.377 val acc 0.866 val recall 0.151 
epoch6 -->  580.7 examples/sec train_loss 0.245 train acc 0.919 train recall 0.503 val_loss 0.234 val acc 0.923 val recall 0.561 
epoch7 -->  585.3 examples/sec train_loss 0.180 train acc 0.942 train recall 0.622 val_loss 0.200 val acc 0.933 val recall 0.680 
epoch8 -->  555.3 examples/sec train_loss 0.163 train acc 0.948 train recall 0.688 val_los

# out-of-sample test

In [115]:
# retsore the model(with best performance on validation set)

trained_net = BaseModel()
#trained_net = TextCNN(embedding_dim, num_channels, kernel_sizes)
#trained_net = LSTM_FC(embedding_dim, hidden_size, num_layers)

trained_net.load_state_dict(torch.load('FC.pth'))
trained_net.eval();

In [116]:
_, test_acc, test_recall = evaluate(trained_net, test_iter, loss=nn.BCELoss(), acc_func=vanila_acc, threshold=0.5)
test_acc, test_recall

(0.9669642857142857, tensor(0.9183))

In [117]:
with torch.no_grad():
    test_predictions = trained_net(torch.tensor(X_test, dtype=torch.float32)).detach().numpy().ravel()
test_predictions[test_predictions>=0.5] = 1.0
test_predictions[test_predictions<0.5] = 0.0
len(test_predictions)

1115

In [118]:
print(classification_report(Y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       966
           1       0.94      0.81      0.87       149

    accuracy                           0.97      1115
   macro avg       0.95      0.90      0.92      1115
weighted avg       0.97      0.97      0.97      1115



# Stacking

## wrapper for model based on pytorch

In [72]:
from abc import ABCMeta
from sklearn.base import BaseEstimator, ClassifierMixin

In [73]:
class PytorchWrapper(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
    def __init__(self, net, shape, path):
        self.net = net
        self.shape = shape # how to reshape X from 2D->3D
        self.path = path  # path of pre-trained model
        
    def fit(self, X, y):
        # detech different classes
        self.classes_ = np.argsort(np.argsort(np.unique(y)))
        # laod pre-trained model
        self.net.load_state_dict(torch.load(self.path))
        self.net.eval();
        return self
    
    def predict(self, X):
        probas = self.predict_proba(X)
        return self.classes_[np.argmax(probas, axis=1)].ravel()

    def predict_proba(self, X):
        X_ = X.reshape(X.shape[0], self.shape[0], self.shape[1])
        probas_pos = self.net(torch.tensor(X_, dtype=torch.float32)).detach().numpy().ravel()
        probas_neg = 1 - probas_pos
        return np.c_[probas_neg, probas_pos]

### Sanity Check

In [74]:
Net = TextCNN(embedding_dim, num_channels, kernel_sizes)
textcnn = PytorchWrapper(Net, (35, 100), path="textcnn.pth")
textcnn.fit(X_train.reshape(X_train.shape[0], -1), Y_train.ravel());
print(classification_report(Y_test, textcnn.predict(X_test.reshape(X_test.shape[0], -1))))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       966
           1       0.96      0.87      0.91       149

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [76]:
Net = LSTM_FC(embedding_dim, hidden_size, num_layers)
textcnn = PytorchWrapper(Net, (35, 100), path="lstm.pth")
textcnn.fit(X_train.reshape(X_train.shape[0], -1), Y_train.ravel());
print(classification_report(Y_test, textcnn.predict(X_test.reshape(X_test.shape[0], -1))))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97       966
           1       0.92      0.73      0.81       149

    accuracy                           0.96      1115
   macro avg       0.94      0.86      0.89      1115
weighted avg       0.95      0.96      0.95      1115



## Stack Classifier

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

In [82]:
estimators = [
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('svr', SVC(kernel='rbf', random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)),
    ('textcnn', PytorchWrapper(TextCNN(100,  [16,16,16,16], [2, 5, 10, 20]), (35, 100), "textcnn.pth")),
    ('lstm', PytorchWrapper(LSTM_FC(100, 64, 2), (35, 100), "lstm.pth")),
]

clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(),
    verbose=1
)

In [83]:
clf.fit(X_train.reshape(X_train.shape[0],-1), Y_train.ravel());
clf_predictions = clf.predict(X_test.reshape(X_test.shape[0],-1))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   35.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.9s finished


In [84]:
print(classification_report(Y_test, clf_predictions))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       966
           1       0.96      0.87      0.91       149

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



## Performance Analysis

In [90]:
clf.final_estimator_.coef_

array([[ 0.48764465,  2.22427691,  0.37060004,  7.34373822, -0.40859267]])

In [95]:
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train.reshape(X_train.shape[0],-1), Y_train.ravel())
rf_pred = rf.predict(X_test.reshape(X_test.shape[0],-1))

svc = SVC(kernel='rbf', random_state=42)
svc.fit(X_train.reshape(X_train.shape[0],-1), Y_train.ravel())
svc_pred = svc.predict(X_test.reshape(X_test.shape[0],-1))

xgb = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
xgb.fit(X_train.reshape(X_train.shape[0],-1), Y_train.ravel())
xgb_pred = xgb.predict(X_test.reshape(X_test.shape[0],-1))

In [91]:
trained_net = TextCNN(embedding_dim, num_channels, kernel_sizes)
trained_net.load_state_dict(torch.load('textcnn.pth'))
trained_net.eval();
with torch.no_grad():
    textcnn_pred = trained_net(torch.tensor(X_test, dtype=torch.float32)).detach().numpy().ravel()
textcnn_pred[textcnn_pred>=0.5] = 1.0
textcnn_pred[textcnn_pred<0.5] = 0.0

trained_net = LSTM_FC(embedding_dim, hidden_size, num_layers)
trained_net.load_state_dict(torch.load('lstm.pth'))
trained_net.eval();
with torch.no_grad():
    lstm_pred = trained_net(torch.tensor(X_test, dtype=torch.float32)).detach().numpy().ravel()
lstm_pred[lstm_pred>=0.5] = 1.0
lstm_pred[lstm_pred<0.5] = 0.0

In [109]:
def pred_corr(pred1, pred2):
    print(f"pos prediction for model1: {np.sum(pred1)/pred1.shape[0]:.4f}")
    print(f"pos prediction for model2: {np.sum(pred2)/pred2.shape[0]:.4f}")
    print(f"pos prediction for both model1 & model2: {np.sum(pred1+pred2 == 2)/pred1.shape[0]:.4f}")
    return 

In [111]:
pred_corr(rf_pred, textcnn_pred)
pred_corr(svc_pred, textcnn_pred)
pred_corr(xgb_pred, textcnn_pred)
pred_corr(lstm_pred, textcnn_pred)

pos prediction for model1: 0.0780
pos prediction for model2: 0.1211
pos prediction for both model1 & model2: 0.0780
pos prediction for model1: 0.0834
pos prediction for model2: 0.1211
pos prediction for both model1 & model2: 0.0834
pos prediction for model1: 0.0942
pos prediction for model2: 0.1211
pos prediction for both model1 & model2: 0.0915
pos prediction for model1: 0.1067
pos prediction for model2: 0.1211
pos prediction for both model1 & model2: 0.1013
