In [0]:
# WARNING: `kill -9 -1` sends SIGKILL to every process this user is allowed to signal,
# which includes the notebook kernel itself, so nothing after this cell runs in the same session.
# NOTE(review): presumably a manual "hard reset" of the Colab runtime to free GPU memory —
# confirm, and keep it out of any "Run All" execution.
!kill -9 -1

In [1]:
import torch

# Hand cached, unused CUDA memory back to the driver before anything else runs.
torch.cuda.empty_cache()

# Select the compute device once; later cells move the model and tensors onto `device`.
if not torch.cuda.is_available():
    print("There is no GPU available, using the CPU instead!")
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print("There are {} GPUs available.".format(torch.cuda.device_count()))
    print("We will use GPU {}".format(torch.cuda.get_device_name(0)))

There are 1 GPUs available.
We will use GPU Tesla K80


In [2]:
# Create (-s) or replace (-f) a symlink so that /usr/bin/nvidia-smi points at /opt/bin/nvidia-smi.
# NOTE(review): presumably needed so GPUtil can locate nvidia-smi on the PATH — confirm.
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# Install the packages imported right below for the memory report (GPUtil, psutil, humanize).
# NOTE(review): versions are unpinned; consider pinning for reproducibility.
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# Colab exposes at most one GPU and does not guarantee even that,
# so fall back to None instead of raising IndexError on an empty list.
gpu = GPUs[0] if GPUs else None


def printm():
    """Print a two-line memory report: host RAM / process size, then GPU memory.

    The GPU figures are re-queried through GPUtil on every call; the objects
    returned by ``GPU.getGPUs()`` are populated when they are created, so
    reusing the module-level ``gpu`` would keep printing the numbers from the
    moment this cell was first run. When no GPU is detected, a placeholder
    line is printed instead of raising.
    """
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))

    current_gpus = GPU.getGPUs()
    if not current_gpus:
        print("GPU RAM Free: n/a (no GPU detected)")
        return

    first_gpu = current_gpus[0]
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(first_gpu.memoryFree, first_gpu.memoryUsed, first_gpu.memoryUtil*100, first_gpu.memoryTotal))


printm()

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7413 sha256=9f676597269e6aefd9dca37c3300dd1c7624d4b2148fda22e8341ce0a2d05415
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 12.7 GB  | Proc size: 281.2 MB
GPU RAM Free: 11430MB | Used: 11MB | Util   0% | Total 11441MB


In [3]:
# Sanity check: load the modified training CSV with pandas and preview its first rows.
import pandas as pd
train_with_1 = pd.read_csv('modified_train.csv')
train_with_1.head()

Unnamed: 0,Id,date,user,text,target
0,0,Fri Jun 05 22:04:23 PDT 2009,JGoldsborough,"@jbtaylor WIth ya. &quot;I'd like a Palm Pre, ...",1.0
1,1,Sat Jun 06 03:12:21 PDT 2009,Psioui,"felt the earthquake this afternoon, it seems t...",1.0
2,2,Sat May 30 19:02:49 PDT 2009,adriville,"Ruffles on shirts are like so in, me Likey",1.0
3,3,Thu Jun 25 05:59:18 PDT 2009,Blondie128,Pretty bad night into a crappy morning....FML!...,0.0
4,4,Sat May 30 11:16:35 PDT 2009,khrabrov,"@dcbriccetti yeah, what a clear view!",1.0


In [0]:
#Defining the generate bigrams method for the Fast_Text class
def generate_bigrams(x):
    """Append the unique bigrams of a token list to that same list.

    Each pair of adjacent tokens is joined with a single space and appended
    once, in order of first occurrence. (Iterating a ``set`` of pairs would
    make the order depend on string hashing, which varies between interpreter
    runs.)

    Args:
        x: list of string tokens. It is extended in place.

    Returns:
        The same list object ``x``, now holding the original tokens followed
        by the de-duplicated bigram strings. Lists with fewer than two tokens
        are returned unchanged.
    """
    seen_pairs = set()
    bigrams = []
    for pair in zip(x, x[1:]):
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            bigrams.append(' '.join(pair))
    # Extend only after the scan so the new bigram strings are never paired themselves.
    x.extend(bigrams)
    return x

In [0]:
#Getting the relevant imports and the fields for reading training data
import torch
from torchtext import data
from torchtext import datasets
import random
import pandas as pd
import numpy as np

SEED = 1234

# Seed every RNG imported in this cell, not just torch, so that any use of
# Python's `random` or NumPy is reproducible as well.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# TEXT runs generate_bigrams on each tokenized example, so bigram strings are
# appended to the token list before numericalization.
TEXT = data.Field(preprocessing = generate_bigrams)
# TARGET holds the label and is produced as a float tensor.
TARGET = data.LabelField(dtype = torch.float)

In [0]:
#Defining the fields for reading train.csv
# Positional column mapping for the CSV: the first three columns are skipped ((None, None)),
# the fourth is parsed with TEXT into `text`, and the fifth with TARGET into `target`.
# NOTE(review): the pandas preview of modified_train.csv in this notebook appears to list six
# columns (Unnamed: 0, Id, date, user, text, target) against five entries here — confirm that
# `text` and `target` line up with the intended CSV columns.
fields_train = [(None, None), (None, None), (None, None), ('text', TEXT),('target', TARGET)]

In [0]:
#Reading train.csv
# Parse modified_train.csv into a torchtext dataset using the column mapping in fields_train.
# skip_header = True drops the first line of the file so the header row is not read as an example.
train_data = data.TabularDataset(path = 'modified_train.csv',
                                 format = 'csv',
                                 fields = fields_train,
                                 skip_header = True
)

In [0]:
#Creating validation set
# random.seed() returns None, so passing its result as random_state handed None to split()
# and determinism relied only on the seeding side effect. Seed explicitly, then pass the
# resulting generator state, which is what random_state expects.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

In [9]:
#Getting the pre-trained word embeddings and building the vocab
# Cap on the number of distinct tokens kept from the training split.
# NOTE(review): presumably the <unk> and <pad> specials are added on top of this cap, which
# would account for the 25002-row embedding used when the model is constructed — confirm.
MAX_VOCAB_SIZE = 25000

# Build the token vocabulary from the training split only and attach GloVe 6B 100-d vectors;
# unk_init fills the vectors of tokens absent from GloVe by sampling a normal distribution.
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

# Build the label vocabulary from the same training split.
TARGET.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
100%|█████████▉| 399951/400000 [00:21<00:00, 17045.34it/s]

In [0]:
#defining the Fast_Text Class
import torch.nn as nn
import torch.nn.functional as F

class FastText(nn.Module):
    """Bag-of-embeddings classifier: embed tokens, average over the sentence, apply one linear layer."""

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        """Create the embedding table (row `pad_idx` is the padding entry) and the output layer."""
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        """Map token indices of shape [sent len, batch size] to outputs of shape [batch size, output_dim]."""
        # [sent len, batch size] -> [sent len, batch size, emb dim] -> [batch size, sent len, emb dim]
        batch_first = self.embedding(text).transpose(0, 1)

        # A pooling window spanning the whole sentence axis averages every position
        # (padding positions included) into one vector per example.
        sent_len = batch_first.shape[1]
        sentence_vec = F.avg_pool2d(batch_first, (sent_len, 1)).squeeze(1)

        # sentence_vec = [batch size, emb dim]
        return self.fc(sentence_vec)

In [0]:
# Instantiate the classifier. The sizes are spelled out by name; they have to agree with the
# tensor shapes stored in the checkpoint that is loaded into this model afterwards.
model = FastText(
    vocab_size=25002,
    embedding_dim=100,
    output_dim=1,
    pad_idx=1,
)

In [12]:
import torch

# map_location remaps the stored tensors onto the selected device, so a checkpoint saved
# from a GPU session still loads when this notebook falls back to the CPU.
model.load_state_dict(torch.load("tut3-model.pt", map_location=device))
model.to(device)

FastText(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (fc): Linear(in_features=100, out_features=1, bias=True)
)

In [0]:
#Inference method
def predict_sentiment(model, sentence):
    """Return the model's sigmoid output for a single sentence as a Python float.

    The sentence is split on whitespace and extended with bigrams via
    ``generate_bigrams``, mapped to indices through ``TEXT.vocab.stoi``, and
    fed to the model as a one-example batch of shape [sent len, 1].

    Args:
        model: module mapping a LongTensor [sent len, batch size] to one logit per example.
        sentence: raw text string.

    Returns:
        float in [0, 1]: the sigmoid of the model's output for this sentence.
    """
    model.eval()
    tokenized = generate_bigrams(sentence.split())
    if not tokenized:
        # An empty or whitespace-only sentence would give a zero-length sequence, which the
        # model's pooling layer cannot handle; feed a single padding token instead.
        tokenized = [TEXT.pad_token]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # unsqueeze(1) adds the batch dimension: [sent len] -> [sent len, 1]
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [0]:
#defining the accuracy calculation method
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (e.g. 8 right out of 10 gives 0.8, not 8).

    `preds` are raw model outputs: they are passed through a sigmoid and rounded
    to the nearest integer before being compared element-wise with `y`.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = torch.round(probabilities)
    # Cast the boolean matches to float so the sum can be divided.
    hits = (hard_labels == y).float()
    return hits.sum() / len(hits)

In [15]:
#Running inference on test
test_data = pd.read_csv('tweet_test.csv')

# predict_sentiment returns a probability; threshold it at 0.5 to get a hard 0/1 label.
# Iterating the column directly avoids label-based lookups, which would break on a
# non-default DataFrame index.
preds = torch.FloatTensor(
    [int(predict_sentiment(model, str(text)) > 0.5) for text in test_data["text"]]
)
labels = torch.tensor(test_data["label"].values, dtype=torch.float)

# preds are already hard labels, so compare them with the targets directly.
# binary_accuracy expects raw logits and would apply sigmoid + round a second time,
# which is only correct by accident of how 0.5 gets rounded.
acc = (preds == labels).float().mean().item()

print(f'\tThe Test. Acc is: {acc*100:.2f}%')

	The Test. Acc is: 81.20%
