# Classification in AOT

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import re
from sklearn.model_selection import train_test_split
import numpy as np
from gensim.models import word2vec
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
import itertools
from pymorphy2 import MorphAnalyzer
from nltk.corpus import stopwords
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
!wget https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
df = pd.read_csv('Constraint_Train.csv')

In [4]:
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


## Наивные методы

### OneHotEncoding

Убираем ошибку, которая может возникнуть при работе с pymorphy2

In [5]:
def pymorphy2_311_hotfix():
    from inspect import getfullargspec
    from pymorphy2.units.base import BaseAnalyzerUnit

    def _get_param_names_311(klass):
        if klass.__init__ is object.__init__:
            return []
        args = getfullargspec(klass.__init__).args
        return sorted(args[1:])

    setattr(BaseAnalyzerUnit, '_get_param_names', _get_param_names_311)

pymorphy2_311_hotfix()

Проводим разбиение на токены, леммы и one hot кодирование

In [6]:
def get_one_hot(df):
    lemms = []
    morph = MorphAnalyzer()    
    stops = stopwords.words("english")
    
    for row in df.tweet:
        row = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", " ", row)
        row = re.sub("[^a-zA-Z]"," ", row)
        lemms.append([morph.parse(token)[0].normal_form for token in word_tokenize(row.lower()) if token not in ' )(.,1/\?#-@:;&^%$_+=' and token not in stops and len(token) > 2])
    df['lemms'] = [' '.join(lemm) for lemm in lemms]
    
    unique_words = set()
    for sentence in lemms:
        for word in sentence:
            unique_words.add(word)

    word_to_index = {}
    for i, word in enumerate(unique_words):
        word_to_index[word] = i

    one_hot_vectors = []
    for sentence in lemms:
        sentence_vectors = []
        for word in sentence:
            vector = np.zeros(len(unique_words), dtype='int')
            vector[word_to_index[word]] = 1
            sentence_vectors.append(vector)
        one_hot_vectors.append(sentence_vectors)
        
    print('ok')
    return one_hot_vectors

In [7]:
one_hot_vectors = get_one_hot(df)

ok


Суммируем векторы слов по текстам для получения векторов текстов

In [8]:
text_one_hot = [sum(sentences) for sentences in one_hot_vectors]

Приводим метки к числовому типу

In [9]:
df.label = pd.get_dummies(df.label, drop_first=True, dtype=int)
df

Unnamed: 0,id,tweet,label,lemms
0,1,The CDC currently reports 99031 deaths. In gen...,1,cdc currently reports deaths general discrepan...
1,2,States reported 1121 deaths a small rise from ...,1,states reported deaths small rise last tuesday...
2,3,Politically Correct Woman (Almost) Uses Pandem...,0,politically correct woman almost uses pandemic...
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,1,indiafightscorona covid testing laboratories i...
4,5,Populous states can generate large case counts...,1,populous states generate large case counts loo...
...,...,...,...,...
6415,6416,A tiger tested positive for COVID-19 please st...,0,tiger tested positive covid please stay away p...
6416,6417,???Autopsies prove that COVID-19 is??� a blood...,0,autopsies prove covid blood clot pneumonia oug...
6417,6418,_A post claims a COVID-19 vaccine has already ...,0,post claims covid vaccine already developed ca...
6418,6419,Aamir Khan Donate 250 Cr. In PM Relief Cares Fund,0,aamir khan donate relief cares fund


Обучим модель линейной регрессии на основе нашего one hot кодирования. Вероятность ~91%

In [10]:
model = LogisticRegression(random_state=42)
model.fit(text_one_hot[:6000], df.label[:6000])
model.score(text_one_hot[6000:], df.label[6000:])

0.9166666666666666

In [11]:
print(classification_report(df.label[6000:], model.predict(text_one_hot[6000:])))

              precision    recall  f1-score   support

           0       0.88      0.95      0.92       202
           1       0.95      0.89      0.92       218

    accuracy                           0.92       420
   macro avg       0.92      0.92      0.92       420
weighted avg       0.92      0.92      0.92       420



Обучим RandomForestClassifier, получим вероятность ~86%

In [12]:
forest = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=20)
forest.fit(text_one_hot[:6000], df.label[:6000])
forest.score(text_one_hot[6000:], df.label[6000:])

0.85

In [13]:
print(classification_report(df.label[6000:], forest.predict(text_one_hot[6000:])))

              precision    recall  f1-score   support

           0       0.80      0.93      0.86       202
           1       0.92      0.78      0.84       218

    accuracy                           0.85       420
   macro avg       0.86      0.85      0.85       420
weighted avg       0.86      0.85      0.85       420



### CountVectorizer

In [14]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.lemms)

Обучим модель линейной регрессии на основе CountVectorizer. Вероятность ~91%

In [15]:
count_vect_model = LogisticRegression(random_state=42)
count_vect_model.fit(X[:6000], df.label[:6000])
count_vect_model.score(X[6000:], df.label[6000:])

0.9166666666666666

In [16]:
print(classification_report(df.label[6000:], count_vect_model.predict(X[6000:])))

              precision    recall  f1-score   support

           0       0.88      0.95      0.92       202
           1       0.95      0.89      0.92       218

    accuracy                           0.92       420
   macro avg       0.92      0.92      0.92       420
weighted avg       0.92      0.92      0.92       420



Обучим RandomForestClassifier, получим вероятность ~87%

In [17]:
count_vect_forest = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=20)
count_vect_forest.fit(X[:6000], df.label[:6000])
count_vect_forest.score(X[6000:], df.label[6000:])

0.8595238095238096

In [18]:
print(classification_report(df.label[6000:], count_vect_forest.predict(X[6000:])))

              precision    recall  f1-score   support

           0       0.80      0.94      0.87       202
           1       0.93      0.78      0.85       218

    accuracy                           0.86       420
   macro avg       0.87      0.86      0.86       420
weighted avg       0.87      0.86      0.86       420



### Word2Vec + TfIdf weighted

Делаем векторизацию с помощью word2vec

In [19]:
texts = [[token for token in row.split()] for row in df.lemms]

%time model_en = word2vec.Word2Vec(texts, workers=1, vector_size=300, min_count=10, window=5, sample=1e-3, alpha=0.07, min_alpha=0.001, sg=1, seed=42)

CPU times: total: 1.33 s
Wall time: 1.29 s


In [20]:
model_en.init_sims()

Считаем TfIdf

In [21]:
tfidf = TfidfVectorizer()
tfidf.fit(df.lemms) 
word_idf_weight = defaultdict(lambda: max(tfidf.idf_), [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()])

Получаем взвешенный эмбендинг за счет весов TfIdf

In [22]:
def get_embedding(text):
    result = []
    
    for word in text:
        if word in model_en.wv:
            result.append(model_en.wv[word] * word_idf_weight[word])
    
    if len(result):
        result = np.sum(result, axis=0)
    else:
        result = np.zeros(300)
    
    return result

In [23]:
features = [get_embedding(text) for text in texts]

Обучим модель LogisticRegression

In [24]:
model2 = LogisticRegression(random_state=42)
model2.fit(features[:6000], df.label[:6000])

Получили результат хуже чем при OneHot энкодинг

In [25]:
model2.score(features[6000:], df.label[6000:])

0.9023809523809524

In [26]:
print(classification_report(df.label[6000:], model2.predict(features[6000:])))

              precision    recall  f1-score   support

           0       0.89      0.92      0.90       202
           1       0.92      0.89      0.90       218

    accuracy                           0.90       420
   macro avg       0.90      0.90      0.90       420
weighted avg       0.90      0.90      0.90       420



Обучим модель RandomForestClassifier. Вероятность ~91%. Но немного хуже чем у OneHot энкодинг

In [27]:
forest2 = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=20)
forest2.fit(features[:6000], df.label[:6000])
forest2.score(features[6000:], df.label[6000:])

0.9119047619047619

In [28]:
print(classification_report(df.label[6000:], forest2.predict(features[6000:])))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       202
           1       0.93      0.89      0.91       218

    accuracy                           0.91       420
   macro avg       0.91      0.91      0.91       420
weighted avg       0.91      0.91      0.91       420



Обучим модель KNeighborsClassifier. Вероятность ~90%. Хуже чем у OneHot энкодинг и RandomForestClassifier

In [29]:
knn = KNeighborsClassifier()
knn.fit(features[:6000], df.label[:6000])
knn.score(features[6000:], df.label[6000:])

0.9

In [30]:
print(classification_report(df.label[6000:], knn.predict(features[6000:])))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       202
           1       0.92      0.88      0.90       218

    accuracy                           0.90       420
   macro avg       0.90      0.90      0.90       420
weighted avg       0.90      0.90      0.90       420



**Почему модель LogisticRegression показала значения хуже чем при OneHotEncoding ведь обработка в этот раз лучше?**

### Thematic modeling

In [31]:
import gensim.corpora as corpora
from gensim.models import ldamodel
import random

Сделаем корпус для обучения модели LdaModel и обучим ее

In [32]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
random.seed(42)

In [33]:
lda = ldamodel.LdaModel(corpus=corpus,
                        id2word=dictionary,
                        num_topics=25,
                        alpha='auto',
                        eta='auto',
                        iterations = 20,
                        passes = 5)

In [34]:
lda.show_topics(5)

[(11,
  '0.027*"coronavirus" + 0.024*"covid" + 0.019*"took" + 0.019*"cures" + 0.016*"one" + 0.015*"canada" + 0.014*"tips" + 0.014*"tweet" + 0.013*"safety" + 0.013*"nashville"'),
 (17,
  '0.041*"covid" + 0.032*"coronavirus" + 0.027*"children" + 0.024*"people" + 0.022*"old" + 0.017*"staff" + 0.017*"early" + 0.015*"get" + 0.014*"could" + 0.013*"place"'),
 (6,
  '0.135*"cases" + 0.063*"covid" + 0.051*"total" + 0.049*"new" + 0.046*"number" + 0.044*"confirmed" + 0.032*"case" + 0.027*"active" + 0.026*"report" + 0.019*"today"'),
 (2,
  '0.060*"covid" + 0.029*"people" + 0.025*"spread" + 0.022*"amp" + 0.016*"health" + 0.014*"help" + 0.013*"learn" + 0.013*"facility" + 0.013*"others" + 0.012*"face"'),
 (4,
  '0.065*"covid" + 0.029*"amp" + 0.016*"health" + 0.010*"child" + 0.009*"continue" + 0.009*"available" + 0.009*"use" + 0.009*"work" + 0.009*"working" + 0.009*"cdc"')]

К каким топикам и с какой вероятностью относится первый текст в датасете

In [35]:
lda.get_document_topics(corpus)[0]

[(2, 0.12732011),
 (5, 0.23361748),
 (12, 0.15188138),
 (15, 0.086752415),
 (16, 0.3440987)]

Сделаем эмбендинг на основе тематического моделирования. Так как lda.get_document_topics предоставляет не 25 топиков с вероятностями, а зачастую меньше, то сначала создадим список с 25 нулевыми значениями, а потом будет заполнять значения в порядке выданных тем.

In [36]:
def get_embedding_lda(text):
    result = []
    
    for bow_text in lda.get_document_topics(corpus):
        vector = [0.0]*25
        for i, elem in bow_text:
            vector[i] = elem 
        result.append(vector)
    return result

In [37]:
bow_features = get_embedding_lda(texts)

In [40]:
model3 = LogisticRegression(random_state=42)
model3.fit(bow_features[:6000], df.label[:6000])
model3.score(bow_features[6000:], df.label[6000:])

0.7880952380952381

In [41]:
print(classification_report(df.label[6000:], model3.predict(bow_features[6000:])))

              precision    recall  f1-score   support

           0       0.80      0.75      0.77       202
           1       0.78      0.82      0.80       218

    accuracy                           0.79       420
   macro avg       0.79      0.79      0.79       420
weighted avg       0.79      0.79      0.79       420



In [42]:
bow_forest = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=20)
bow_forest.fit(bow_features[:6000], df.label[:6000])
bow_forest.score(bow_features[6000:], df.label[6000:])

0.830952380952381

In [43]:
print(classification_report(df.label[6000:], bow_forest.predict(bow_features[6000:])))

              precision    recall  f1-score   support

           0       0.82      0.83      0.83       202
           1       0.84      0.83      0.84       218

    accuracy                           0.83       420
   macro avg       0.83      0.83      0.83       420
weighted avg       0.83      0.83      0.83       420



Достаточно неплохие показатели. ~81%

**Есть какие-то советы/возможности для улучшения показателей тематического моделирования?**

## CNN и RNN

### PyTorch Simple CNN

In [44]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.input_layer = nn.Linear(300, 150)
        self.out = nn.Linear(150, 1)

    def forward(self, x):
        x = F.relu(self.input_layer(x))
        return torch.sigmoid(self.out(x))
    
net = Net()
print(net)

Net(
  (input_layer): Linear(in_features=300, out_features=150, bias=True)
  (out): Linear(in_features=150, out_features=1, bias=True)
)


In [45]:
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()

In [46]:
def get_torch_embedding(text):
    result = []
    
    for word in text.split():
        if word in model_en.wv:
            result.append(model_en.wv[word])
    
    if len(result):
        result = np.average(result, axis=0)
    else:
        result = np.zeros(300)
    
    return result

In [47]:
features = [get_torch_embedding(text) for text in tqdm(df.lemms)]

100%|██████████| 6420/6420 [00:00<00:00, 26593.29it/s]


In [48]:
X_train, X_test, y_train, y_test = train_test_split(features, df.label.to_list(), test_size=0.25)

In [49]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

In [52]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.squeeze(), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [53]:
for i in range(10):
    train_one_epoch(in_data, targets)

100%|██████████| 301/301 [00:00<00:00, 1725.73it/s]


tensor(0.5519, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1797.39it/s]


tensor(0.5518, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1800.84it/s]


tensor(0.5518, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1822.40it/s]


tensor(0.5518, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1837.74it/s]


tensor(0.5517, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1780.89it/s]


tensor(0.5517, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1768.16it/s]


tensor(0.5517, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1785.64it/s]


tensor(0.5517, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1799.90it/s]


tensor(0.5517, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1836.41it/s]

tensor(0.5516, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)





In [54]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [55]:
with torch.no_grad():
    output = net(in_data_test).squeeze(1)

In [56]:
result = (output.cpu() > 0.5) == targets_test

In [57]:
result.sum().item() / len(result)

0.8722741433021807

### PyTorch + LSTM

In [94]:
class NetLSTM(nn.Module):

    def __init__(self):
        super(NetLSTM, self).__init__()
        self.lstm = nn.LSTM(300, 150)
        self.out = nn.Linear(150, 1)

    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x.transpose(0, 1))
        prediction = torch.sigmoid(self.out(longterm))
        return prediction


Net_LSTM = NetLSTM()
print(Net_LSTM)

NetLSTM(
  (lstm): LSTM(300, 150)
  (out): Linear(in_features=150, out_features=1, bias=True)
)


Возьмем взвешенный эмбединг на основе Word2Vec с TfIdf

In [95]:
features_LSTM = [get_embedding(text) for text in texts]

In [96]:
X_train, X_test, y_train, y_test = train_test_split(features_LSTM, df.label.to_list(), test_size=0.25)

In [105]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

In [106]:
in_data.shape

torch.Size([4815, 300])

In [110]:
optimizer = optim.SGD(Net_LSTM.parameters(), lr=0.01)
criterion = nn.MSELoss()

In [111]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer.zero_grad()
        output = Net_LSTM(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [113]:
Net_LSTM.train()
for i in range(5):
    train_one_epoch(in_data, targets)

100%|██████████| 301/301 [00:03<00:00, 84.01it/s]


tensor(0.2507, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:03<00:00, 84.65it/s]


tensor(0.2509, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:03<00:00, 83.41it/s]


tensor(0.2509, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:03<00:00, 83.94it/s]


tensor(0.2510, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:03<00:00, 82.31it/s]

tensor(0.2509, grad_fn=<MseLossBackward0>)





In [114]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

with torch.no_grad():
    output = Net_LSTM(in_data_test).reshape(-1)
    
result = (output.cpu() > 0.5) == targets_test
result.sum().item() / len(result)

0.5383177570093458

**Ниже попробовал использовать выходной слой LSTM, тут вроде более менее результат. Правда нужно его еше улучшить немного чтобы был больше 0.91%**

In [65]:
class NewNetLSTM(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=300, hidden_size=128, num_layers=1, batch_first=True)
        self.lin = nn.Linear(128, 64)
        self.out = nn.Linear(64, 1)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = F.relu(self.lin(x))
        prediction = torch.sigmoid(self.out(x))
        return prediction
    

New_Net_LSTM = NewNetLSTM()
print(New_Net_LSTM)

NewNetLSTM(
  (lstm): LSTM(300, 128, batch_first=True)
  (lin): Linear(in_features=128, out_features=64, bias=True)
  (out): Linear(in_features=64, out_features=1, bias=True)
)


In [66]:
def new_train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer.zero_grad()
        output = New_Net_LSTM(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

optimizer = optim.SGD(New_Net_LSTM.parameters(), lr=0.01)
criterion = nn.MSELoss()

In [67]:
New_Net_LSTM.train()
for i in range(10):
    new_train_one_epoch(in_data, targets)

100%|██████████| 301/301 [00:00<00:00, 594.05it/s]


tensor(0.1630, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 677.73it/s]


tensor(0.0926, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 666.09it/s]


tensor(0.0757, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 661.38it/s]


tensor(0.0744, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 699.58it/s]


tensor(0.0739, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 711.92it/s]


tensor(0.0727, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 721.83it/s]


tensor(0.0719, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 679.46it/s]


tensor(0.0722, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 723.54it/s]


tensor(0.0718, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 691.73it/s]

tensor(0.0717, grad_fn=<MseLossBackward0>)





In [68]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

with torch.no_grad():
    output = New_Net_LSTM(in_data_test).reshape(-1)
    
result = (output.cpu() > 0.5) == targets_test
result.sum().item() / len(result)

0.897196261682243

**Решил поэксперементировать LSTM в связке с Conv слоями, но как-то не оч. По функции потерь видно что модель не обучается особо.**

**+ проблема с batch_size. Для Conv1d первый параметр равен batch_size и если при тренировке мы его можем задать равным batch_size, то при оценке на тестовых данных мы туда закидываем сразу все данные (1000+) и модель ломается (ошибка есть ниже)**

**Как в реальных задачах решается проблема с первым параметром для Conv1d? Или обучают и тестируют батчами одного и того же размера? (В этой работе в первый трогаю torch, но в Keras таких проблем не встречал)**

**Куда копать? Или Conv слои с LSTM лучше не использовать и пробовать что-то другое?**

In [69]:
class NewNetLSTM(nn.Module):
    def __init__(self, batch_size=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size=300, hidden_size=128, num_layers=1, batch_first=True)
        self.conv1 = nn.Conv1d(batch_size, 128, kernel_size=3)
        self.pool = nn.MaxPool1d(2, 2)
        self.flattened_tensor = nn.Flatten()
        self.lin = nn.Linear(504, 32)
        self.out = nn.Linear(32, 1)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = x.view(-1, 16)
        x = self.flattened_tensor(x)
        x = torch.transpose(x, 0, 1)
        x = F.relu(self.lin(x))
        prediction = torch.sigmoid(self.out(x))
        return prediction
    

New_Net_LSTM = NewNetLSTM()
print(New_Net_LSTM)

NewNetLSTM(
  (lstm): LSTM(300, 128, batch_first=True)
  (conv1): Conv1d(1, 128, kernel_size=(3,), stride=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flattened_tensor): Flatten(start_dim=1, end_dim=-1)
  (lin): Linear(in_features=504, out_features=32, bias=True)
  (out): Linear(in_features=32, out_features=1, bias=True)
)


In [70]:
def new_train_one_epoch(in_data, targets, batch_size=16):
    global New_Net_LSTM
    New_Net_LSTM = NewNetLSTM(batch_size)
    
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        if len(batch_x) != batch_size:
            continue
        optimizer.zero_grad()
        output = New_Net_LSTM(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

optimizer = optim.SGD(New_Net_LSTM.parameters(), lr=0.01)
criterion = nn.MSELoss()

In [71]:
New_Net_LSTM.train()
for i in range(10):
    new_train_one_epoch(in_data, targets)

100%|██████████| 301/301 [00:00<00:00, 517.17it/s]


tensor(0.2481, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 507.85it/s]


tensor(0.2557, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 512.82it/s]


tensor(0.2620, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 514.08it/s]


tensor(0.2477, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 515.68it/s]


tensor(0.2534, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 523.64it/s]


tensor(0.2523, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 515.04it/s]


tensor(0.2505, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 509.33it/s]


tensor(0.2574, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 517.74it/s]


tensor(0.2557, grad_fn=<MseLossBackward0>)


100%|██████████| 301/301 [00:00<00:00, 531.46it/s]

tensor(0.2570, grad_fn=<MseLossBackward0>)





In [72]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

with torch.no_grad():
    output = New_Net_LSTM(in_data_test).reshape(-1)
    
result = (output.cpu() > 0.5) == targets_test
result.sum().item() / len(result)

RuntimeError: Given groups=1, weight of size [128, 16, 3], expected input[1, 1605, 128] to have 16 channels, but got 1605 channels instead