In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split


df = pd.read_csv("/home/csgrad/sunilruf/int_point/code/sentiment-analysis/EcoPreprocessed.csv")

df.head()



Unnamed: 0,3870,able play youtube alexa,0.5,positive
0,62,able recognize indian accent really well drop ...,0.2794,positive
1,487,absolute smart device amazon connect external ...,0.1827,positive
2,3204,absolutely amaze new member family control hom...,0.3682,positive
3,1265,absolutely amaze previously sceptical invest m...,0.2333,positive
4,53,absolutely cheat customer if buy amazon produc...,0.135,positive


In [40]:
data = df['able play youtube alexa'].to_list()
labels = df['0.5']
labels = [1 if label> 0 else 0 for label in labels]
print(set(labels))


{0, 1}


In [4]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [24]:
encode = (tokenizer.encode_plus(data[0], truncation=True, return_tensors="pt", max_length=128, pad_to_max_length=True,  add_special_tokens=True, return_attention_mask=True, return_token_type_ids=False))



In [33]:
model(encode['input_ids'], encode['attention_mask']).last_hidden_state

tensor([[[-0.5532, -0.1086,  0.1753,  ..., -0.6397,  0.0961,  0.7571],
         [-0.4816,  0.1920,  0.5954,  ..., -0.5952,  0.4091,  0.1240],
         [-0.4712,  0.5855,  0.3801,  ..., -0.8300, -0.3521,  0.0086],
         ...,
         [-0.0133, -0.1829,  0.4134,  ..., -0.3031, -0.2251,  0.2962],
         [-0.1936, -0.3222,  0.1449,  ..., -0.2284, -0.0152,  0.1049],
         [ 0.0478, -0.2255,  0.3077,  ..., -0.1250,  0.0764,  0.2384]]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
model(encode['input_ids'], encode['attention_mask'])[0][:, 0, :]

In [36]:
def generate_bert_embeddings(data):
    with torch.no_grad():
        encodings = tokenizer.encode_plus(data, truncation=True, return_tensors="pt", max_length=128, pad_to_max_length=True, return_attention_mask=True)
        input_ids = encodings['input_ids']
        attention_mask = encodings['attention_mask']
        
        outputs = model(input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state[:, 0, :]
        return embeddings

In [44]:
from tqdm import tqdm

tqdm.pandas(desc="Generating Embeddings")
df['embeddings'] = df['able play youtube alexa'].progress_apply(lambda x: generate_bert_embeddings(x))

Generating Embeddings: 100%|██████████| 4083/4083 [02:01<00:00, 33.68it/s]


In [45]:
df 

Unnamed: 0,3870,able play youtube alexa,0.5,positive,embeddings
0,62,able recognize indian accent really well drop ...,0.2794,positive,"[[tensor(-0.5532), tensor(-0.1086), tensor(0.1..."
1,487,absolute smart device amazon connect external ...,0.1827,positive,"[[tensor(-0.1641), tensor(0.1520), tensor(0.13..."
2,3204,absolutely amaze new member family control hom...,0.3682,positive,"[[tensor(-0.2341), tensor(-0.1576), tensor(0.1..."
3,1265,absolutely amaze previously sceptical invest m...,0.2333,positive,"[[tensor(-0.0489), tensor(0.3692), tensor(-0.2..."
4,53,absolutely cheat customer if buy amazon produc...,0.1350,positive,"[[tensor(-0.2314), tensor(-0.1587), tensor(0.1..."
...,...,...,...,...,...
4078,852,yo yo yo love go if want one smart speaker val...,0.4571,positive,"[[tensor(-0.3128), tensor(0.1881), tensor(0.05..."
4079,2163,youtube music,0.0000,neutral,"[[tensor(0.2363), tensor(-0.1207), tensor(0.04..."
4080,2488,youtube support nahi kartasong recognise achha...,0.0000,neutral,"[[tensor(-0.5111), tensor(0.0448), tensor(-0.0..."
4081,651,yup proscontrols wipro light amazinglysony bra...,0.0000,neutral,"[[tensor(-0.3297), tensor(-0.0442), tensor(-0...."


In [None]:
encodings = df['embeddings'].to_list()
print(encodings[0])

In [51]:
tensors = torch.stack(encodings)


In [53]:
print((tensors.shape))

torch.Size([4083, 1, 768])


In [120]:
class BertClassifier(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BertClassifier, self).__init__()
        
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.softmax = nn.Sigmoid()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [132]:
classifier = BertClassifier(768, 100, 2)
print(classifier)

BertClassifier(
  (fc1): Linear(in_features=768, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=2, bias=True)
  (relu): ReLU()
  (softmax): Sigmoid()
)


In [133]:
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

X_train, X_test, y_train, y_test = train_test_split(tensors, labels, test_size=0.2, random_state=42)

train_data = TensorDataset(X_train, torch.tensor(y_train))
test_data = TensorDataset(X_test, torch.tensor(y_test))
train_sampler = RandomSampler(train_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=32)

In [136]:
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

def train(dataloader, model, loss_fn, optimizer):
    model.train()
    size = len(dataloader.dataset)
    for i in range(10):
        for batch, (X, y) in enumerate(dataloader):
            X = X.view(X.size(0), -1)
            pred = model(X)
            print(pred.shape, y.shape)
            y = y.float()
            loss = loss_fn(pred, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if batch % 100 == 0:
                loss, current = loss.item(), batch * len(X)
                print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [137]:
train(train_dataloader, classifier, loss_fn, optimizer) 

torch.Size([32, 2]) torch.Size([32])


  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (2) must match the size of tensor b (32) at non-singleton dimension 1

In [105]:
def test(model, dataloader, loss_fn, optimizer):
    model.eval()
    size = len(dataloader.dataset)
    
    for X, y in dataloader:
        print(X.shape)
        X = X.view(X.size(0), -1)
        #print(X.shape)
        pred = model(X)
        #print(pred)
        pred = (torch.argmax(pred,1))
        print(len(y))
        accuracy = (pred == y).sum().item()
        print((accuracy/len(y))*100)

In [106]:
test(classifier, test_dataloader, loss_fn, optimizer)

torch.Size([32, 1, 768])
32
81.25
torch.Size([32, 1, 768])
32
84.375
torch.Size([32, 1, 768])
32
87.5
torch.Size([32, 1, 768])
32
90.625
torch.Size([32, 1, 768])
32
87.5
torch.Size([32, 1, 768])
32
90.625
torch.Size([32, 1, 768])
32
87.5
torch.Size([32, 1, 768])
32
93.75
torch.Size([32, 1, 768])
32
84.375
torch.Size([32, 1, 768])
32
93.75
torch.Size([32, 1, 768])
32
90.625
torch.Size([32, 1, 768])
32
87.5
torch.Size([32, 1, 768])
32
75.0
torch.Size([32, 1, 768])
32
84.375
torch.Size([32, 1, 768])
32
84.375
torch.Size([32, 1, 768])
32
84.375
torch.Size([32, 1, 768])
32
84.375
torch.Size([32, 1, 768])
32
81.25
torch.Size([32, 1, 768])
32
90.625
torch.Size([32, 1, 768])
32
93.75
torch.Size([32, 1, 768])
32
87.5
torch.Size([32, 1, 768])
32
87.5
torch.Size([32, 1, 768])
32
84.375
torch.Size([32, 1, 768])
32
90.625
torch.Size([32, 1, 768])
32
90.625
torch.Size([17, 1, 768])
17
94.11764705882352
