In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
os.chdir("/content/drive/My Drive")

In [None]:
import torch
from torchtext import data
import spacy
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv("new_data.csv",index_col=0)

In [None]:
df.loc[1000,"content"]

"i 've lost all hope day to day for the past year i have been thinking of suicide but never really started considering it a an option until recently i do n't know when or if i ever will kill myself but if my life keep heading in this direction then i will have no choice i sit here still procrastinating from overdue assignment that i would n't have done anyways i 've lived a lie for too long and i 'm tired of it i just want it to end now i hate feeling this way when i should be doing something with my life it 's only gon na get worse when i start highschool next year"

In [None]:
df = pd.read_csv("new_data_nostopword.csv",index_col=0)

In [None]:
df["content"][400]

"need way get going slowly working car mod game exam etc stopped feel hard get stuff done 'm procrastinating hard way get back"

# Baseline Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
train_df, test_df = train_test_split(df,test_size = 0.2, random_state=42,shuffle = True)
train_df, valid_df = train_test_split(train_df,test_size=0.05, random_state=42,shuffle = True)
train_df.to_csv("new_train.csv",index = False)
valid_df.to_csv("new_valid.csv",index = False)
test_df.to_csv("new_test.csv",index = False)

In [None]:
X_train, X_valid, X_test = train_df["content"], valid_df["content"], test_df["content"]
y_train, y_valid, y_test = train_df["label"], valid_df["label"], test_df["label"]

In [None]:
np.save("true_label_LR",y_test)

Data Engineered with One-Hot Method

In [None]:
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
train_onehot = vectorizer.transform(X_train)
test_onehot = vectorizer.transform(X_test)
LR = LogisticRegression(max_iter = 5000,penalty='l2',C=1)
LR.fit(train_onehot, y_train)
y_pred_LR0 = LR.predict(test_onehot)
print("Accuracy on test set is",(y_pred_LR0 == y_test).sum()/len(y_test))
np.save("y_pred_LR0", y_pred_LR0)

Accuracy on test set is 0.8751733703190014


Engineering with TF-IDF Method

In [None]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)
train_tfidf = vectorizer.transform(X_train)
test_tfidf = vectorizer.transform(X_test)
LR = LogisticRegression(max_iter = 5000,penalty='l2',C=1)
LR.fit(train_tfidf, y_train)
y_pred_LR1 = LR.predict(test_tfidf)
print("Accuracy on test set is",(y_pred_LR1 == y_test).sum()/len(y_test))
np.save("y_pred_LR1", y_pred_LR1)

Accuracy on test set is 0.8862690707350902


# CNN

In [None]:
SEED = 42

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
TEXT = data.Field(tokenize = 'spacy', batch_first=True)
LABEL = data.LabelField(dtype = torch.float)

In [None]:
fields = [('text',TEXT), ('label',LABEL)]
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = "/content/drive/My Drive/",
                                        train = 'new_train.csv',
                                        validation = "new_valid.csv",
                                        test = 'new_test.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True)

In [None]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = 'glove.6B.200d',
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [None]:
BATCH_SIZE = 100

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    sort_key = lambda x: len(x.text),
    device = device)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim), padding = (fs//2, 0)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
 
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [None]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200
N_FILTERS = 100
FILTER_SIZES = [2, 3, 4]
OUTPUT_DIM = 1
DROPOUT = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [None]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [None]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    prediction_list = []
    true_label= []
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            rounded_preds = torch.round(torch.sigmoid(predictions))
            prediction_list.append(rounded_preds.tolist())
            true_label.append(batch.label.tolist())
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator),np.array([int(i) for batch in prediction_list for i in batch]), np.array([int(i) for batch in true_label for i in batch])

In [None]:
N_EPOCHS = 6

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc,_,_ = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 0.047 | Train Acc: 99.51%
	 Val. Loss: 0.254 |  Val. Acc: 89.71%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.035 | Train Acc: 99.67%
	 Val. Loss: 0.262 |  Val. Acc: 90.04%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.026 | Train Acc: 99.76%
	 Val. Loss: 0.286 |  Val. Acc: 89.25%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.018 | Train Acc: 99.98%
	 Val. Loss: 0.281 |  Val. Acc: 89.63%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.016 | Train Acc: 99.87%
	 Val. Loss: 0.271 |  Val. Acc: 90.42%
Epoch: 06 | Epoch Time: 0m 0s
	Train Loss: 0.013 | Train Acc: 99.91%
	 Val. Loss: 0.279 |  Val. Acc: 90.75%


In [None]:
loss,acc, y_pred_CNN, true_label_CNN = evaluate(model, test_iterator,criterion)
print("Wth CNN, accuracy on the test set is", acc)

Wth CNN, accuracy on the test set is 0.9053333163261413


In [None]:
np.save("y_pred_CNN", y_pred_CNN)
np.save("true_label_CNN", true_label_CNN)