In [2]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import numpy as np
import torch.nn as nn
from nltk.corpus import stopwords

In [3]:
train = pd.read_csv('dataset/train.csv')

In [8]:
class TwitterDataset(Dataset):

    def __init__(self, data_dir, split='train', feature ='tf-idf'):
        """Initialize the attributes of the object of the class."""
        
        self.data_dir = data_dir
        self.split = split
        self.df = pd.read_csv(f'{data_dir}/{split}.csv')
        self.feature = feature
        self.extractor = self._feature_extractor(self.df,feature)

    def __len__(self):
        """Return the size of the dataset."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return a data sample for a given index, along with the lable of the corresponding tweet"""
        
        
        # - get the image path corresponding to 'index' (use the list 'self.image_path_list')
        data_sample = self.df.iloc[index]
        
        #remove noise        
        cleaned_data = self._clean_data(data_sample['tweet'])
        
        #extract features
        features = self._extract_feature([cleaned_data])
        
        #get one-hot encoded label
        if self.split =='train':
            label = torch.nn.functional.one_hot(torch.tensor(self.df['label'].values))[index]
            
            return cleaned_data, features, label
        
        else:
            return cleaned_data, features

    def _clean_data(self, data):
        
        """ Remove hyperlinks, mentions, hashtag from the data
            convert to lowercase
            remove non-ASCII characters
            remove the @user tag
            removing stopwords
        """
        
        #TO DO: add more
        data = data.lower()
        data = re.sub(r'[^\x00-\x7F]+',' ', data)
        data = re.sub(r"@\S*|#\S*",'',data)
        lines = filter(lambda x: x[0]!= '@' , data.split())
        lines = [word for word in lines if word not in set(stopwords.words('english'))]
        data = " ".join(lines)
        return data
              
    
    def _extract_feature(self,data):
        
        #To DO add glove and BERT feature extractor
        
        if self.feature =='tf-idf':
            
            features = self.extractor.transform(data).toarray()
            

            return torch.tensor(features)

        
    def _feature_extractor(self,df,feature):
        
        
        if feature =='tf-idf':
            extractor = TfidfVectorizer()
            return extractor.fit(df['tweet'].values)
            

         

In [9]:
dataset_train = TwitterDataset('dataset','train','tf-idf')
dataset_test = TwitterDataset('dataset','test','tf-idf')

In [10]:
len(dataset_train)

31962

In [11]:
text, feature , label = dataset_train[158]
feature.shape

torch.Size([1, 41392])

In [12]:
batch_size = 8
train_dataloader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset_test, batch_size=batch_size, shuffle=True)

In [13]:
batch_example = next(iter(train_dataloader))
tweet_batch_example = batch_example[1]
labels_batch_example = batch_example[2]

print(tweet_batch_example.shape)
print(labels_batch_example.shape)

torch.Size([8, 1, 41392])
torch.Size([8, 2])


In [14]:
class RNN(nn.Module):
    def __init__(self, input_size, number_layers ,hidden_size, output_size, act_fn):
        super(Classifier, self).__init__()
        
        self.input_layer = nn.Sequential(nn.Linear(input_size, hidden_size), act_fn)
        self.hidden_layers = nn.ModuleList([nn.Sequential(nn.Linear(hidden_size, hidden_size), act_fn) for i in range(number_layers)])
        self.output_layer = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        
        y = self.input_layer(x)
        
        for hidden in self.hidden_layers:
            y = hidden(y)
            
        out = self.output_layer(y)
        return out


In [15]:
rnn = nn.LSTM(input_size=tweet_batch_example.shape[-1], hidden_size = 10, num_layers = 5, bidirectional=True)
input = tweet_batch_example 
h0 = torch.randn(2*5, 1, 10) ## total layers = 5, bidirectional = True,hence 2*5
c0 = torch.randn(2*5, 1, 10) ## total layers = 5, bidirectional = True,hence 2*5
output, (hn, cn) = rnn(input, (h0, c0))

output.shape

RuntimeError: expected scalar type Double but found Float