In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from tqdm import tqdm 

In [2]:
# Load the labelled tweets and show how many rows each value of `label` has.
df_train = pd.read_csv("./Twitter Sentiment Analysis/train.csv")
df_train['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

В данных большой дисбаланс классов, поэтому подготовим 3 выборки:
- X_train_samp — продублируем объекты класса 1 для сбалансированности; на этой выборке будем обучать сеть
- X_train — выборку оставим без изменений, чтобы при обучении считать метрику на train
- X_val — валидационная выборка

In [3]:
# Hold out 15% of the rows for validation; the fixed random_state keeps the split reproducible.
# NOTE(review): no `stratify=` is passed, so the class ratio of the two parts is left to chance.
X_train, X_val = train_test_split(df_train, test_size=0.15, random_state=42)

In [4]:
# Class balance of the training part before any oversampling.
X_train['label'].value_counts()

0    25252
1     1915
Name: label, dtype: int64

In [5]:
# Build a roughly balanced training sample by repeating the minority class.
class0 = X_train.loc[X_train['label'] == 0]
class1 = X_train.loc[X_train['label'] == 1]

# Repeat every class-1 row as many whole times as class 1 fits into class 0
# (25252 // 1915 == 13 for the split above); never fewer than once.
oversample_factor = max(1, len(class0) // max(1, len(class1)))
class1_repeated = class1.iloc[np.repeat(np.arange(len(class1)), oversample_factor)]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Selecting rows positionally (instead of rebuilding a frame from `.values`)
# also keeps the original column dtypes rather than turning them into object.
X_train_samp = pd.concat([class0, class1_repeated], ignore_index=True)

In [6]:
# Split the oversampled frame into target (cast to int32) and features,
# then show the class balance after oversampling.
# NOTE: re-running this cell fails, because `label` has already been dropped from X_train_samp.
y_train_samp = X_train_samp['label'].astype(np.int32)
X_train_samp = X_train_samp.drop(columns=['label'])
y_train_samp.value_counts()

0    25252
1    24895
Name: label, dtype: int64

In [7]:
# Same separation for the untouched (imbalanced) training part, which is used
# later only to report metrics on train.
# NOTE: re-running this cell fails, because `label` has already been dropped from X_train.
y_train = X_train['label'].astype(np.int32)
X_train = X_train.drop(columns=['label'])
y_train.value_counts()

0    25252
1     1915
Name: label, dtype: int64

In [8]:
# Separate target and features of the validation part.
# The label is cast to int32 exactly like y_train / y_train_samp so that all
# three targets share one dtype.
y_val = X_val['label'].astype(np.int32)
X_val = X_val.drop(columns=['label'])
y_val.value_counts()

0    4468
1     327
Name: label, dtype: int64

## Предпроцессинг текста

In [9]:
# Size of the embedding table: the most frequent (max_words - 1) tokens plus index 0 for padding.
max_words = 2000
# Every tweet is encoded as exactly this many token ids (truncated or zero-padded).
max_len = 20
# NOTE(review): not passed to Net in the cells visible here (Net falls back to its own default of 1).
num_classes = 1

In [10]:
# Lookup sets and the morphological analyser are built once and shared by every call.
sw = set(get_stop_words("en"))
puncts = set(punctuation)

morpher = MorphAnalyzer()


def preprocess_text(txt):
    """Normalise one tweet into a space-separated string of lemmas.

    The input is converted to ``str``, stripped of ASCII punctuation characters
    and lower-cased. Whitespace-separated words found in the stop-word set are
    dropped, and each remaining word is replaced by the normal form of its
    first pymorphy2 parse.
    """
    cleaned = "".join(ch for ch in str(txt) if ch not in puncts).lower()

    lemmas = []
    for token in cleaned.split():
        if token in sw:
            continue
        lemmas.append(morpher.parse(token)[0].normal_form)
    return " ".join(lemmas)

In [11]:
# Register Series.progress_apply, then normalise the `tweet` column of each
# split in turn (oversampled train, plain train, validation).
tqdm.pandas()
for frame in (X_train_samp, X_train, X_val):
    frame['tweet'] = frame['tweet'].progress_apply(preprocess_text)

100%|██████████████████████████████████████████████████████████████████████████| 50147/50147 [00:10<00:00, 4577.23it/s]
100%|██████████████████████████████████████████████████████████████████████████| 27167/27167 [00:05<00:00, 4801.94it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4795/4795 [00:01<00:00, 4740.55it/s]


In [12]:
# One lower-cased string holding every tweet of the plain (non-oversampled) training part.
train_corpus = " ".join(X_train["tweet"]).lower()

In [13]:
# Make sure the Punkt tokenizer data is available, then tokenize the whole training corpus.
# NOTE(review): newer NLTK releases may look for the "punkt_tab" resource instead — confirm against the installed version.
nltk.download("punkt")
tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\spvag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# Keep purely alphanumeric tokens and count how often each one occurs.
tokens_filtered = [tok for tok in tokens if tok.isalnum()]
dist = FreqDist(tokens_filtered)
# Only max_words - 1 tokens are kept: one index is reserved for padding.
tokens_filtered_top = [word for word, _count in dist.most_common(max_words - 1)]
len(tokens_filtered_top)

1999

In [15]:
# Token -> 1-based frequency rank; index 0 is left free for padding.
vocabulary = {word: rank for rank, word in enumerate(tokens_filtered_top, 1)}

In [16]:
import numpy as np

def text_to_sequence(text, maxlen):
    """Encode ``text`` as a list of vocabulary indices of length ``maxlen``.

    The text is lower-cased and tokenized; tokens that are not alphanumeric or
    not present in ``vocabulary`` are skipped. When more than ``maxlen`` ids
    remain, the last ``maxlen`` are kept; shorter sequences are padded with
    zeros on the right.
    """
    ids = [
        vocabulary[token]
        for token in word_tokenize(text.lower())
        if token.isalnum() and token in vocabulary
    ]
    # A negative pad count yields an empty list, so long inputs get no padding.
    pad_count = maxlen - len(ids)
    return ids[-maxlen:] + [0] * pad_count

In [17]:
%%time
# Encode every split as an array with one row of max_len token ids per tweet.
x_train = np.asarray([text_to_sequence(text, max_len) for text in X_train["tweet"]])
x_valid = np.asarray([text_to_sequence(text, max_len) for text in X_val["tweet"]])
x_train_samp = np.asarray([text_to_sequence(text, max_len) for text in X_train_samp["tweet"]])

Wall time: 8.82 s


## Модель

In [18]:
class Net(nn.Module):
    """1-D convolutional text classifier over sequences of token indices.

    Pipeline: embedding -> three (Conv1d -> LeakyReLU -> BatchNorm1d) stages,
    the first two followed by max-pooling -> global max over the sequence axis
    -> three linear layers with LeakyReLU and dropout in between -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        out_channel: channel count of the first convolution; the following
            convolutions use 2x and 4x this value.
        num_classes: width of the output layer.

    Input:  integer tensor of shape (batch, seq_len). With the kernel and
            pooling sizes used here, seq_len must be at least 17 for the last
            convolution to have a valid input (for seq_len == 20 the sequence
            axis shrinks 20 -> 19 -> 9 -> 7 -> 3 -> 1).
    Output: float tensor of shape (batch, num_classes) with values in [0, 1].
    """

    def __init__(self, vocab_size=2000, embedding_dim=256, out_channel=128, num_classes=1):
        super().__init__()
        # Attribute names and creation order are kept as-is so that state_dict
        # keys, print(model) output and seeded initialisation stay unchanged.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv_1 = nn.Conv1d(embedding_dim, out_channel, kernel_size=2)
        self.conv_2 = nn.Conv1d(out_channel, out_channel * 2, kernel_size=3)
        self.conv_3 = nn.Conv1d(out_channel * 2, out_channel * 4, kernel_size=3)
        self.pool = nn.MaxPool1d(2)
        # Historical name: the activation is a LeakyReLU, not a plain ReLU.
        self.relu = nn.LeakyReLU()
        self.linear_1 = nn.Linear(out_channel * 4, out_channel * 8)
        self.linear_2 = nn.Linear(out_channel * 8, out_channel * 4)
        self.linear_3 = nn.Linear(out_channel * 4, num_classes)
        self.dp = nn.Dropout(0.65)
        self.bn_1 = nn.BatchNorm1d(out_channel)
        self.bn_2 = nn.BatchNorm1d(out_channel * 2)
        self.bn_3 = nn.BatchNorm1d(out_channel * 4)

    def forward(self, x):
        """Return per-sample probabilities for a batch of token-index sequences."""
        output = self.embedding(x)          # B, L, E
        output = output.permute(0, 2, 1)    # B, E, L — Conv1d expects channels first

        # Convolutional stages: conv -> activation -> batch norm (-> pooling).
        output = self.pool(self.bn_1(self.relu(self.conv_1(output))))
        output = self.pool(self.bn_2(self.relu(self.conv_2(output))))
        output = self.bn_3(self.relu(self.conv_3(output)))

        # Global max over the remaining sequence positions -> B, out_channel * 4.
        output = torch.max(output, dim=2).values

        # Classifier head.
        output = self.dp(self.relu(self.linear_1(output)))
        output = self.dp(self.relu(self.linear_2(output)))
        output = self.linear_3(output)

        # torch.sigmoid replaces the deprecated F.sigmoid.
        return torch.sigmoid(output)

In [19]:
from torch.utils.data import DataLoader, Dataset


class DataWrapper(Dataset):
    """Dataset over a pair of NumPy arrays (features, targets).

    Both arrays are converted to int64 tensors once, at construction time.
    An optional ``transform`` callable is applied to the feature tensor of a
    sample when it is fetched.
    """

    def __init__(self, data, target, transform=None):
        self.transform = transform
        self.data = torch.from_numpy(data).long()
        self.target = torch.from_numpy(target).long()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        label = self.target[index]
        if self.transform:
            sample = self.transform(sample)
        return sample, label

In [20]:
# Number of passes over the oversampled training set.
epochs = 20
# Mini-batch size passed to the DataLoaders.
batch_size = 542
# NOTE(review): not referenced by any cell visible here — presumably a leftover logging interval; confirm before removing.
print_batch_n = 100

In [21]:
# Plain (imbalanced) training set: it is only passed to metrics(), so it is
# iterated in a fixed order.
train_dataset = DataWrapper(x_train, y_train.values)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

# Validation set: evaluation does not depend on batch order or batch size, so
# use the large batch size (far fewer forward passes than batches of 8) and no shuffling.
val_dataset = DataWrapper(x_valid, y_val.values)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Oversampled set: this is what the optimiser is fed, so it is reshuffled every epoch.
train_dataset_samp = DataWrapper(x_train_samp, y_train_samp.values)
train_loader_samp = DataLoader(train_dataset_samp, batch_size=batch_size, shuffle=True)

In [22]:
# Build the network with an embedding table sized to the vocabulary (including the padding index).
model = Net(vocab_size=max_words)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [23]:
# Show the layer layout followed by the total number of parameters.
print(model)
print("Parameters:", sum(p.nelement() for p in model.parameters()))

Net(
  (embedding): Embedding(2000, 256)
  (conv_1): Conv1d(256, 128, kernel_size=(2,), stride=(1,))
  (conv_2): Conv1d(128, 256, kernel_size=(3,), stride=(1,))
  (conv_3): Conv1d(256, 512, kernel_size=(3,), stride=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (relu): LeakyReLU(negative_slope=0.01)
  (linear_1): Linear(in_features=512, out_features=1024, bias=True)
  (linear_2): Linear(in_features=1024, out_features=512, bias=True)
  (linear_3): Linear(in_features=512, out_features=1, bias=True)
  (dp): Dropout(p=0.65, inplace=False)
  (bn_1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_3): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
Parameters: 2122369


In [24]:
# Adam with a small learning rate over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
# Binary cross-entropy on probabilities: the model already ends with a sigmoid.
criterion = nn.BCELoss()

In [25]:
def metrics(model, loader, threshold=None):
    """Compute precision, recall, F1 and accuracy of a binary classifier.

    Args:
        model: module that returns one probability per sample.
        loader: iterable of batches whose first two items are the inputs and
            the 0/1 labels.
        threshold: probability above which a sample is predicted as class 1.
            When omitted, the notebook-level ``th`` is used, as before.

    Returns:
        ``(precision, recall, f1_score, accuracy)`` as 0-dim float tensors.
        A ratio with a zero denominator (no predicted positives, no actual
        positives, or an empty loader) is ``nan``.

    The model is evaluated in eval mode without gradient tracking, and its
    previous train/eval mode is restored afterwards.
    """
    if threshold is None:
        threshold = th  # notebook-level decision threshold

    # Run on the device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device("cpu")

    def _counter():
        return torch.zeros((), dtype=torch.float32, device=eval_device)

    tp, tp_fp, tp_fn, running_right = _counter(), _counter(), _counter(), _counter()
    running_total = 0.0

    was_training = model.training
    model.eval()
    try:
        # No autograd graph is needed for evaluation.
        with torch.no_grad():
            for data in loader:
                labels = data[1].to(eval_device)
                outputs = model(data[0].to(eval_device))
                running_total += len(data[1])
                # Align predictions with the label shape so that (B, 1) outputs
                # never broadcast against (B,) or (B, 1) labels.
                pred_labels = (outputs > threshold).int().reshape(labels.shape)
                tp += (labels * pred_labels).sum()
                tp_fp += pred_labels.sum()
                tp_fn += labels.sum()
                running_right += (labels == pred_labels).sum()
    finally:
        model.train(was_training)

    precision = tp / tp_fp
    recall = tp / tp_fn
    f1_score = 2 * (precision * recall) / (precision + recall)
    accuracy = running_right / running_total
    return precision, recall, f1_score, accuracy
    

In [26]:
# Move the model to the selected device and switch it to training mode.
model = model.to(device)
model.train()
# Decision threshold; metrics() reads this notebook-level name.
th = 0.5

# Per-epoch [precision, recall, f1, accuracy] on the plain train set and on validation.
train_history = []
test_history = []

for epoch in range(epochs):  
    # NOTE(review): these two counters are never updated or read below.
    running_items, running_right = 0.0, 0.0
    # Optimise on the oversampled (balanced) training set.
    for i, data in enumerate(train_loader_samp, 0):
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        # BCELoss needs float targets with the same (B, 1) shape as the outputs.
        loss = criterion(outputs, labels.float().view(-1, 1))
        loss.backward()
        optimizer.step()
        # Rebinds `loss` to a plain float; after the loop it holds the last batch's loss only.
        loss = loss.item()
    # Evaluate once per epoch on validation and on the non-oversampled training set.
    precision_val, rerall_val, f1_score_val, acc_val = metrics(model, val_loader)
    precision_train, rerall_train, f1_score_train, acc_train = metrics(model, train_loader)    
    # The reported step is always the last one and the loss is that of the final batch.
    print(f'Epoch [{epoch + 1}/{epochs}]. ' \
            f'Step [{i + 1}/{len(train_loader_samp)}].  ' \
            f'Loss: {loss:.3f}.  ' \
            f'Train f1_score: {f1_score_train:.3f}.  ' \
            f'Test f1_score: {f1_score_val:.3f}.'         
         )
    train_history.append([precision_train, rerall_train, f1_score_train, acc_train])
    test_history.append([precision_val, rerall_val, f1_score_val, acc_val])    
        
print('Training is finished!')



Epoch [1/20]. Step [93/93].  Loss: 0.688.  Train f1_score: 0.213.  Test f1_score: 0.210.
Epoch [2/20]. Step [93/93].  Loss: 0.672.  Train f1_score: 0.246.  Test f1_score: 0.221.
Epoch [3/20]. Step [93/93].  Loss: 0.638.  Train f1_score: 0.264.  Test f1_score: 0.240.
Epoch [4/20]. Step [93/93].  Loss: 0.595.  Train f1_score: 0.285.  Test f1_score: 0.253.
Epoch [5/20]. Step [93/93].  Loss: 0.523.  Train f1_score: 0.317.  Test f1_score: 0.285.
Epoch [6/20]. Step [93/93].  Loss: 0.474.  Train f1_score: 0.358.  Test f1_score: 0.302.
Epoch [7/20]. Step [93/93].  Loss: 0.395.  Train f1_score: 0.414.  Test f1_score: 0.335.
Epoch [8/20]. Step [93/93].  Loss: 0.367.  Train f1_score: 0.441.  Test f1_score: 0.343.
Epoch [9/20]. Step [93/93].  Loss: 0.299.  Train f1_score: 0.486.  Test f1_score: 0.355.
Epoch [10/20]. Step [93/93].  Loss: 0.225.  Train f1_score: 0.564.  Test f1_score: 0.381.
Epoch [11/20]. Step [93/93].  Loss: 0.202.  Train f1_score: 0.614.  Test f1_score: 0.398.
Epoch [12/20]. Step

## Выводы:

В данных наблюдался большой дисбаланс классов. Поскольку в задании не были указаны конкретные метрики, для оценки качества обучения сети я выбрал метрику F1:
accuracy — неудачная метрика в данном случае.


Для повышения метрики сети можно:
-	выбрать более сложный алгоритм предобработки текста. 
-	увеличить объём обучающих данных. Количество «твитов» 1-го класса слишком мало, поэтому сеть при обучении очень быстро переходит в режим переобучения, даже при больших значениях дропаута. 

Также думаю, что увеличение количества слоёв или параметров сети не окажет значительного влияния на улучшение её характеристик.
