In [1]:
positive = ["Dogecoin to the moon"]
negative = ["I will short Tesla today"]

In [12]:
sentences = positive + negative
print(sentences)

['Dogecoin to the moon', 'I will short Tesla today']


In [18]:
numbers = [sorted(sentence.split(" ")) for sentence in sentences]

In [20]:
sorted_words = []

for sentence in numbers:
    sorted_words.extend(sentence)

In [24]:
sorted_words = sorted(sorted_words)
print(sorted_words)

['Dogecoin', 'I', 'Tesla', 'moon', 'short', 'the', 'to', 'today', 'will']


In [46]:
hmap = {}

unique_val = 1.00
for word in sorted_words:
    if word not in hmap:
        hmap[word] = unique_val
        unique_val += 1

In [47]:
hmap

{'Dogecoin': 1.0,
 'I': 2.0,
 'Tesla': 3.0,
 'moon': 4.0,
 'short': 5.0,
 'the': 6.0,
 'to': 7.0,
 'today': 8.0,
 'will': 9.0}

In [48]:
torch.set_printoptions(precision=2, threshold=None, edgeitems=None, linewidth=None, profile=None, sci_mode=None)

In [51]:
import torch
import torch.nn as nn
from torchtyping import TensorType
positive_tensor = torch.tensor([hmap[word] for word in positive[0].split(" ")]).float()
negative_tensor = torch.tensor([hmap[word] for word in negative[0].split(" ")]).float()
print(positive_tensor.float())
print(negative_tensor.float())

tensor([1., 7., 6., 4.])
tensor([2., 9., 5., 3., 8.])


In [52]:
nn.utils.rnn.pad_sequence([positive_tensor, negative_tensor], batch_first=True)

tensor([[1., 7., 6., 4., 0.],
        [2., 9., 5., 3., 8.]])

In [74]:
positive = ["Dogecoin to the moon", "Dhaya is a bunda"]
negative = ["I will short Tesla today", "Apple beats Tesla tomorrow"]
sentences = []
sentences.extend(positive)
sentences.extend(negative)
final_sentence = " ".join(sentences)

print(final_sentence)

indices = sorted(final_sentence.split(" "))
print(indices)



Dogecoin to the moon Dhaya is a bunda I will short Tesla today Apple beats Tesla tomorrow
['Apple', 'Dhaya', 'Dogecoin', 'I', 'Tesla', 'Tesla', 'a', 'beats', 'bunda', 'is', 'moon', 'short', 'the', 'to', 'today', 'tomorrow', 'will']


In [75]:
hmap = {}

unique_value = 1.00

for index in indices:
    if index not in hmap:
        hmap[index] = unique_value
        unique_value += 1
        
print(hmap)

{'Apple': 1.0, 'Dhaya': 2.0, 'Dogecoin': 3.0, 'I': 4.0, 'Tesla': 5.0, 'a': 6.0, 'beats': 7.0, 'bunda': 8.0, 'is': 9.0, 'moon': 10.0, 'short': 11.0, 'the': 12.0, 'to': 13.0, 'today': 14.0, 'tomorrow': 15.0, 'will': 16.0}


In [88]:
positive_encodings = [[hmap[word] for word in sent.split(" ")] for sent in sentences]

In [92]:
def create_embeddings(sentences):
    encodings = []
    for sent in sentences:
        encodings.append(torch.tensor([hmap[word] for word in sent.split(" ")]))
        
    encode = nn.utils.rnn.pad_sequence(encodings, batch_first=True)
    
    return encode

In [98]:
positive_encodings = create_embeddings(positive + negative)   
print(positive_encodings)


tensor([[ 3., 13., 12., 10.,  0.],
        [ 2.,  9.,  6.,  8.,  0.],
        [ 4., 16., 11.,  5., 14.],
        [ 1.,  7.,  5., 15.,  0.]])


In [101]:
positive_encodings[2:]

tensor([[ 4., 16., 11.,  5., 14.],
        [ 1.,  7.,  5., 15.,  0.]])

In [187]:
class Solution(nn.Module):
    def __init__(self, vocab_size:int):
        super().__init__()
        
        torch.manual_seed(0)
        self.vocab_size = vocab_size
        
        self.embeds = nn.Embedding(self.vocab_size, 16)
        self.layer1 = nn.Linear(16, 1)
        self.sigmoid1 = nn.Sigmoid()
        
    def forward(self, x: TensorType[int]) -> TensorType[int]:
        x = x.long()
        embedded = self.embeds(x)
        print(embedded)
        average = torch.mean(embedded, dim=-1)
        layer1_out = self.layer1(average)
        sigmoid_output = self.sigmoid1(layer1_out)
        
        
        
        return torch.round(sigmoid_output, decimals=4)

In [188]:
x = [
  [2.0, 7.0, 14.0, 8.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0],    
  [1.0, 4.0, 12.0, 3.0, 10.0, 5.0, 15.0, 11.0, 6.0, 9.0, 13.0, 7.0] 
]

In [None]:
sol = Solution(vocab_size=17000)
sol.forward(torch.tensor(x[0]))

In [3]:
raw_dataset = "Hello darkness my old friend"

data = raw_dataset.split(" ")

data[0:3]

['Hello', 'darkness', 'my']

In [21]:
import torch
import torch.nn as nn
from torchtyping import TensorType
from typing import List, Tuple

class Solution:
    
    def __init__(self):
        self.raw_dataset = None
    
    def batch_loader(self, raw_dataset:str, context_length: int, batch_size: int) -> Tuple[List[List[str]]]:
        
        torch.manual_seed(0)
        self.raw_dataset = raw_dataset
        X = []
        Y = []
        for i in range(batch_size):
            data = raw_dataset.split(" ")
            print(len(data)-context_length)
            random_index = torch.randint(0, len(data) - context_length, ())
            
            X.append(data[random_index:random_index + context_length])
            Y.append(data[random_index+1: random_index + context_length +1])
        
        return (X, Y)
        


In [22]:
torch.randint(0,6,())

tensor(5)

In [23]:
sol = Solution()
sol.batch_loader(raw_dataset, context_length=3, batch_size=2)

2
2


([['Hello', 'darkness', 'my'], ['darkness', 'my', 'old']],
 [['darkness', 'my', 'old'], ['my', 'old', 'friend']])

In [24]:
!git clone https://github.com/gptandchill/sentiment-analysis
%cd sentiment-analysis

Cloning into 'sentiment-analysis'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 1), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (6/6), 116.33 KiB | 3.32 MiB/s, done.
Resolving deltas: 100% (1/1), done.
/home/csgrad/sunilruf/int_point/code/sentiment-analysis


In [28]:
import pandas as pd

df = pd.read_csv("/home/csgrad/sunilruf/int_point/code/sentiment-analysis/EcoPreprocessed.csv")

df.head()

Unnamed: 0,3870,able play youtube alexa,0.5,positive
0,62,able recognize indian accent really well drop ...,0.2794,positive
1,487,absolute smart device amazon connect external ...,0.1827,positive
2,3204,absolutely amaze new member family control hom...,0.3682,positive
3,1265,absolutely amaze previously sceptical invest m...,0.2333,positive
4,53,absolutely cheat customer if buy amazon produc...,0.135,positive


In [39]:
df['0.5'][df['positive'] == 'neutral']

7       0.0
9       0.0
11      0.0
18      0.0
24      0.0
       ... 
4073    0.0
4074    0.0
4079    0.0
4080    0.0
4081    0.0
Name: 0.5, Length: 536, dtype: float64

In [41]:
labels = df['0.5'].to_list()
data = df['able play youtube alexa'].to_list()

final_sentence = " ".join(data)
print(final_sentence)

able recognize indian accent really well drop function helpful call device talk person near device smart plug schedule work seamlessly con would sound kindloud but lack clarity mid frequency need tweeked optimum clarity rarely device doesnt respond call alexa absolute smart device amazon connect external sub woofer sound amaze recons voice even close room like almost collection songs english hindi must quite moneys worth absolutely amaze new member family control home voice connect home anywhere world absolutely amaze previously sceptical invest money but arrive worth ityou absolutely buy wont regret cheer absolutely cheat customer if buy amazon product definitely want buy amazon prime members also case if song want play absolutely need amazon prime membership otherwise can not play music app no google apps not work amazon alexa if anybody want amazon alexa go google home everything also free cost app absolutely house hold item fair price intuitive speech recognition superb use hindi e

In [79]:
words = final_sentence.split(" ")   
len(set(words))

5158

In [33]:
hmap = {}
unique_value = 1.00
for word in sorted(final_sentence.split(" ")):
    if word not in hmap:
        hmap[word] = unique_value
        unique_value += 1

In [34]:
print(hmap)

{'aa': 1.0, 'aap': 2.0, 'aaple': 3.0, 'aaps': 4.0, 'aaya': 5.0, 'abc': 6.0, 'abd': 7.0, 'abilitiesprosit': 8.0, 'ability': 9.0, 'able': 10.0, 'abovethe': 11.0, 'abp': 12.0, 'abrupt': 13.0, 'abruptly': 14.0, 'abs': 15.0, 'absent': 16.0, 'absolute': 17.0, 'absolutely': 18.0, 'abundancevof': 19.0, 'ac': 20.0, 'acc': 21.0, 'acccurate': 22.0, 'accecent': 23.0, 'accemtssound': 24.0, 'accent': 25.0, 'accept': 26.0, 'acceptable': 27.0, 'acceptance': 28.0, 'access': 29.0, 'accessibility': 30.0, 'accessories': 31.0, 'accesssound': 32.0, 'accha': 33.0, 'accord': 34.0, 'accordingly': 35.0, 'account': 36.0, 'accountbut': 37.0, 'accountits': 38.0, 'accoustics': 39.0, 'accuracy': 40.0, 'accuracysound': 41.0, 'accurate': 42.0, 'accurately': 43.0, 'acessories': 44.0, 'acha': 45.0, 'achha': 46.0, 'achi': 47.0, 'achievable': 48.0, 'acho': 49.0, 'acknowledge': 50.0, 'across': 51.0, 'acs': 52.0, 'acsent': 53.0, 'act': 54.0, 'action': 55.0, 'activate': 56.0, 'activation': 57.0, 'active': 58.0, 'activities':

In [35]:
data_encoded = []

for sent in data:
    
    data_encoded.append(torch.tensor([hmap[word] for word in sent.split(" ")]))
    
data_encoded = nn.utils.rnn.pad_sequence(data_encoded, batch_first=True)

In [36]:
print(data_encoded)

tensor([[  10., 3734., 2256.,  ...,    0.,    0.,    0.],
        [  17., 4158., 1215.,  ...,    0.,    0.,    0.],
        [  18.,  217., 3049.,  ...,    0.,    0.,    0.],
        ...,
        [5147., 4444., 3003.,  ...,    0.,    0.,    0.],
        [5152., 3582., 5030.,  ...,    0.,    0.,    0.],
        [5154., 2315.,  767.,  ...,    0.,    0.,    0.]])


In [75]:
vocab_size = len(hmap)
print((vocab_size))

5158


In [141]:
labels = [1 if label>0.5 else 2 if label <0.5 else 0 for label in labels]

In [145]:
labels = torch.squeeze(torch.tensor(labels), dim=-1)
print(labels)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1,
        1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1,
        1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  labels = torch.squeeze(torch.tensor(labels), dim=-1)


In [143]:
class EmotionPredictor(nn.Module):
    def __init__(self, vocabulary_size: int, embedding_dimension: int):
        super().__init__()
        print(vocabulary_size)
        self.embedding_layer = nn.Embedding(5159, embedding_dimension)
        self.linear_layer = nn.Linear(embedding_dimension, 3)
        #self.tanh = nn.Tanh()

    def forward(self, x):
        embeddings = self.embedding_layer(x)
        averaged = torch.mean(embeddings, axis = 1)
        projected = self.linear_layer(averaged)
        return (projected)

In [136]:
len(labels)


1000

In [148]:
embedding_dimension = 256
model = EmotionPredictor(vocab_size+1, embedding_dimension)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

for i in range(1000):
  #print(len(data_encoded))
  randperm = torch.randperm(len(data_encoded))
  training_dataset, training_labels = data_encoded[randperm], labels[randperm]
  mini_batch = training_dataset[:64].long()
  mini_batch_labels = training_labels[:64]
  #print(mini_batch_labels)
  #print(len(mini_batch))
  pred = model(mini_batch)
  print(pred)
  optimizer.zero_grad()
  loss = loss_function(pred, mini_batch_labels)
  if i % 100 == 0:
    print(loss)
  loss.backward()
  optimizer.step()

101
tensor([[ 0.3329,  0.0189,  0.5070],
        [ 0.0302, -0.2573,  0.0112],
        [ 0.0427, -0.1594,  0.1955],
        [-0.1313,  0.2155, -0.1901],
        [-0.2414, -0.1832, -0.0121],
        [-0.2454, -0.2620,  0.0818],
        [ 0.0907,  0.0279,  0.0885],
        [-0.4089,  0.0324,  0.0667],
        [-0.0764,  0.1706,  0.3467],
        [ 0.1768, -0.0087,  0.4096],
        [ 0.2132, -0.4284,  0.0441],
        [ 0.0754, -0.0958,  0.3666],
        [ 0.3202, -0.1843,  0.1360],
        [ 0.4117,  0.0079, -0.0294],
        [-0.3179, -0.3955,  0.2711],
        [-0.1916, -0.3879,  0.3344],
        [ 0.3220, -0.0041,  0.0887],
        [ 0.0062, -0.1675,  0.0217],
        [ 0.5740, -0.0946, -0.1841],
        [-0.0777,  0.0404, -0.3025],
        [-0.1330, -0.4683,  0.0481],
        [ 0.2086, -0.1397,  0.4601],
        [-0.3650, -0.4514,  0.2791],
        [ 0.1581, -0.2429,  0.2227],
        [ 0.0033,  0.1400,  0.2767],
        [ 0.2762, -0.4193,  0.1248],
        [ 0.0481,  0.0321,  0.2407

In [104]:
import torch
import torch.nn as nn
import torch.optim as optim

class EmotionPredictor(nn.Module):
    def __init__(self, vocab_size, embedding_dimension, num_classes):
        super(EmotionPredictor, self).__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dimension)
        self.linear_layer = nn.Linear(embedding_dimension, num_classes)
        
    def forward(self, x):
        embeddings = self.embedding_layer(x)
        averaged = torch.mean(embeddings, dim=1)
        logits = self.linear_layer(averaged)
        return logits


In [107]:
# Hyperparameters
vocab_size = 100  # Define your vocabulary size
embedding_dimension = 256
num_classes = 10  # Adjust based on your classification problem
learning_rate = 0.001
num_epochs = 1000
batch_size = 64

# Initialize model, loss function, and optimizer
model = EmotionPredictor(vocab_size, embedding_dimension, 3)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Example data (replace this with your actual data)
data_encoded = torch.randint(0, vocab_size, (1000, 10))  # 1000 samples, each with 10 indices
labels = torch.randint(0, num_classes, (1000,))  # Example labels for classification

# Training loop
for epoch in range(num_epochs):
    randperm = torch.randperm(len(data_encoded))
    training_dataset, training_labels = data_encoded[randperm], labels[randperm]
    
    for i in range(0, len(training_dataset), batch_size):
        mini_batch = training_dataset[i:i+batch_size].long()
        mini_batch_labels = training_labels[i:i+batch_size]
        
        # Forward pass
        logits = model(mini_batch)
        
        # Compute loss
        loss = loss_function(logits, mini_batch_labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if epoch % 100 == 0:
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")


IndexError: Target 9 is out of bounds.