# Фреймворк PyTorch для разработки искусственных нейронных сетей

## Урок 6. Нейросети в обработке текста

### Самостоятельно обучить классификатор текстов на примере 20newsgroups
### На примере 20 newsgroups попробовать разные параметры для сверток для классификации текстов

In [None]:
import torch
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.datasets import fetch_20newsgroups

In [None]:
# The 20 newsgroup names requested from the dataset loader, grouped by topic
# and kept in alphabetical order.
categories = [
    # religion / atheism
    'alt.atheism',
    # computing
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    # marketplace
    'misc.forsale',
    # recreation
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    # science
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    # society / politics
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

In [None]:
# Fetch the train and test splits, restricted to the selected categories.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)

# NOTE: despite the "words" label, len(.data) is the number of documents
# in each split, not a word count.
train_size = len(newsgroups_train.data)
test_size = len(newsgroups_test.data)
print('words in train:', train_size)
print('words in test:', test_size)

words in train: 11314
words in test: 7532


In [None]:
# Count every lower-cased, single-space-delimited token across BOTH splits.
# The test split is included so that get_batch never meets a token that is
# missing from the index.
vocab = Counter()
for corpus in (newsgroups_train.data, newsgroups_test.data):
    for text in corpus:
        vocab.update(word.lower() for word in text.split(' '))

# Number of distinct tokens; used as the bag-of-words vector length.
total_words = len(vocab)

def get_word_2_index(vocab):
    """Return a dict mapping each lower-cased entry of *vocab* to its position.

    Positions follow the iteration order of *vocab*. If two entries differ
    only by case, the later position overwrites the earlier one.
    """
    return {token.lower(): position for position, token in enumerate(vocab)}

# Token -> column index lookup used by get_batch when filling the
# bag-of-words vectors.
word2index = get_word_2_index(vocab)

In [None]:
def get_batch(df, i, batch_size, vocab_index=None, vocab_size=None):
    """Build the bag-of-words features and class labels for one mini-batch.

    Args:
        df: Object exposing ``data`` (sequence of document strings) and
            ``target`` (sequence of integer class ids), sliced in parallel.
        i: Zero-based batch number; the batch covers items
            ``[i * batch_size, (i + 1) * batch_size)``.
        batch_size: Maximum number of documents in the batch. The last batch
            may be shorter because slicing stops at the end of the data.
        vocab_index: Optional token -> column mapping. Defaults to the
            module-level ``word2index``.
        vocab_size: Optional feature-vector length. Defaults to the
            module-level ``total_words`` (or ``len(vocab_index)`` when an
            explicit ``vocab_index`` is supplied).

    Returns:
        A ``(features, labels)`` tuple: ``features`` is a float array of
        shape ``(n_docs, vocab_size)`` holding per-document token counts, and
        ``labels`` is an int64 array of length ``n_docs`` with the original
        class ids from ``df.target``.

    Raises:
        KeyError: If a token of a document is missing from the vocabulary.
    """
    if vocab_size is None:
        vocab_size = total_words if vocab_index is None else len(vocab_index)
    if vocab_index is None:
        vocab_index = word2index

    start = i * batch_size
    stop = start + batch_size
    texts = df.data[start:stop]

    # Pre-allocating keeps the result 2-D even when the batch is empty.
    features = np.zeros((len(texts), vocab_size), dtype=float)
    for row, text in zip(features, texts):
        # Same tokenisation as the vocabulary: split on single spaces and
        # lower-case. `row` is a view, so the update lands in `features`.
        for word in text.split(' '):
            row[vocab_index[word.lower()]] += 1

    # Pass the true class ids through unchanged. The previous code collapsed
    # every id >= 2 into class 2, which turned the 20-way task into a 3-way
    # one even though the network has 20 outputs.
    labels = np.array(df.target[start:stop], dtype=np.int64)

    return features, labels

In [None]:
# --- Optimisation settings ---
num_epochs = 10        # full passes over the training split
batch_size = 150       # documents per mini-batch
learning_rate = 1e-2   # step size handed to the optimiser
display_step = 1       # not referenced in the cells visible here

# --- Network dimensions ---
input_size = total_words   # one input feature per vocabulary entry
hidden_size = 100          # width of both hidden layers
num_classes = 20           # one output per newsgroup category

In [None]:
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class pyNet(nn.Module):
    """Fully connected classifier: two ReLU hidden layers plus a linear head.

    The forward pass returns raw, un-normalised scores of shape
    ``(..., num_classes)``; no softmax is applied inside the module.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Layers are registered in the order they are applied in forward().
        self.layer_1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map input features *x* to per-class scores."""
        hidden = self.relu(self.layer_1(x))
        hidden = self.relu(self.layer_2(hidden))
        return self.output_layer(hidden)

In [None]:
# Build the classifier from the configured input, hidden and output sizes.
net = pyNet(input_size, hidden_size, num_classes)

In [None]:
# Cross-entropy over the network's raw class scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
valid_loss_min = np.inf  # lowest mean test loss seen so far

n_train = len(newsgroups_train.data)
n_test = len(newsgroups_test.data)
# Ceiling division so the final, shorter batch is not silently dropped.
train_batches = (n_train + batch_size - 1) // batch_size
test_batches = (n_test + batch_size - 1) // batch_size

for epoch in range(num_epochs):
    # ---- training pass ----
    net.train()
    running_loss = 0.0
    for i in range(train_batches):
        batch_x, batch_y = get_batch(newsgroups_train, i, batch_size)
        articles = torch.FloatTensor(batch_x)
        labels = torch.LongTensor(batch_y)
        optimizer.zero_grad()
        outputs = net(articles)
        batch_loss = criterion(outputs, labels)
        batch_loss.backward()
        optimizer.step()
        # The criterion returns a per-batch mean; weight it by batch length
        # so the epoch figure is a true per-document average.
        running_loss += batch_loss.item() * len(batch_y)
    loss_train = running_loss / max(n_train, 1)

    # ---- evaluation pass ----
    # No backward()/step() here: the previous version updated the weights on
    # the test split (with gradients never zeroed), leaking test data into
    # training. no_grad() also avoids building the autograd graph.
    net.eval()
    running_loss = 0.0
    with torch.no_grad():
        for i in range(test_batches):
            batch_x, batch_y = get_batch(newsgroups_test, i, batch_size)
            articles = torch.FloatTensor(batch_x)
            labels = torch.LongTensor(batch_y)
            outputs = net(articles)
            running_loss += criterion(outputs, labels).item() * len(batch_y)
    loss_test = running_loss / max(n_test, 1)

    print(loss_train, loss_test)

    # Checkpoint on the epoch-mean test loss (a plain float, so no autograd
    # graph is kept alive between epochs).
    if loss_test <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            loss_test))
        # NOTE(review): the file name is misspelled ("wieghts"); kept as-is
        # because other cells may load the checkpoint by this exact path.
        torch.save(net.state_dict(), 'wieghts.pt')
        valid_loss_min = loss_test

tensor(0.0892, grad_fn=<NllLossBackward0>) tensor(4.9438, grad_fn=<NllLossBackward0>)
Validation loss decreased (inf --> 4.943755).  Saving model ...
tensor(1.5637, grad_fn=<NllLossBackward0>) tensor(0.3826, grad_fn=<NllLossBackward0>)
Validation loss decreased (4.943755 --> 0.382610).  Saving model ...
tensor(8.8738, grad_fn=<NllLossBackward0>) tensor(0.6185, grad_fn=<NllLossBackward0>)
tensor(0.5959, grad_fn=<NllLossBackward0>) tensor(1.5276, grad_fn=<NllLossBackward0>)
tensor(0.5962, grad_fn=<NllLossBackward0>) tensor(0.4543, grad_fn=<NllLossBackward0>)
tensor(0.2020, grad_fn=<NllLossBackward0>) tensor(0.3038, grad_fn=<NllLossBackward0>)
Validation loss decreased (0.382610 --> 0.303805).  Saving model ...
tensor(0.1863, grad_fn=<NllLossBackward0>) tensor(0.5992, grad_fn=<NllLossBackward0>)
tensor(0.3114, grad_fn=<NllLossBackward0>) tensor(0.2575, grad_fn=<NllLossBackward0>)
Validation loss decreased (0.303805 --> 0.257459).  Saving model ...
tensor(0.2589, grad_fn=<NllLossBackward0>