In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report

### Анализ датасета

In [2]:
df = pd.read_csv('./Materials/Constraint_Train.csv',index_col=0)
df.head(3)

Unnamed: 0_level_0,tweet,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The CDC currently reports 99031 deaths. In gen...,real
2,States reported 1121 deaths a small rise from ...,real
3,Politically Correct Woman (Almost) Uses Pandem...,fake


### создаем embedding для каждого слова

In [6]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm

from gensim.models.word2vec import Word2Vec

In [4]:
from nltk.corpus import stopwords
from string import punctuation

eng_stopwords = stopwords.words('english') + [i for i in punctuation] + ['�','``','’']

In [7]:
result = []

for i in tqdm(df.tweet): 
    result.append( [j for j in word_tokenize(i.lower()) if j not in eng_stopwords ])
    

100%|████████████████████████████████████| 6420/6420 [00:03<00:00, 1850.14it/s]


In [9]:
# Word2Vec
word_to_vec_tweets = Word2Vec(result, workers=4, vector_size=300, min_count=3, window=5, epochs=50)

In [14]:
print( word_to_vec_tweets.wv['virus'][:30] )
word_to_vec_tweets.wv.most_similar('russia')

[-1.5921233  -1.1200765  -1.2448933   0.411763    0.22529747  0.15750995
  0.37977812  0.22201622  0.00494879  0.76706856  0.53205895  0.06524077
  0.25412646 -0.05537247  1.1098365  -0.22657147  0.18047434 -0.8440129
  2.6835637  -0.6430049   2.5529017   0.7119238  -0.6905735   1.9169923
 -0.05899691  0.5002548  -1.7540656   0.8503961   0.50157744  0.06769494]


[('lions', 0.7208154201507568),
 ('putin', 0.7146435379981995),
 ('vladimir', 0.691994845867157),
 ('biotech', 0.661442756652832),
 ('bharat', 0.6567081809043884),
 ('covaxin', 0.590172290802002),
 ('donated', 0.585052490234375),
 ('developed', 0.5800663828849792),
 ('steal', 0.5773712992668152),
 ('company', 0.5667145252227783)]

совпадает отлично эмбединг закончили.

In [243]:
y = np.array( (df.label == 'real').astype(np.int32).to_list() )

result = np.array(result,dtype=object)

### делаем функцию для случайного дропаута слов из предложения

In [252]:
def make_word_dropout(text, probability = .5, fill_value = '#DROP#'):
    '''
        RANDOMLY DROP WORDS FROM GIVEN SENTENCE 
        WITH PROBABILITY "probability"
    '''
    
    mask = np.random.choice(2,size = len(text), p=[1-probability,probability]).reshape(1,-1)

    X = np.choose(mask,[text,np.full(shape = (1,len( text )) ,fill_value = fill_value)])[0]
    return X



In [335]:
print( ' '.join(result[10]) )
make_word_dropout(text=result[10], probability = .5, fill_value = '#DROP#' )


take simple daily precautions help prevent spread respiratory illnesses like covid19 learn protect coronavirus covid-19 https //t.co/uargztrh5l https //t.co/biztxtukyk


array(['#DROP#', 'simple', 'daily', '#DROP#', '#DROP#', '#DROP#',
       'spread', 'respiratory', '#DROP#', '#DROP#', 'covid19', 'learn',
       'protect', '#DROP#', 'covid-19', 'https', '//t.co/uargztrh5l',
       'https', '//t.co/biztxtukyk'], dtype='<U17')

### собираем итератор (+more efficient batching  = батчи будут отсортированы по размеру)

In [336]:
def get_index(length, batch_size=15):
    '''
        RETURN LIST OF DATAFRAME (source) INDEXES WHERE TWEET LENGTH >= "length" 
    '''
    # dict tweet_index:tweet_len
    a = dict(zip(range(len(source)),map(len,source) ))
    #a = kwargs['dict_index_len']
    
    b = [] 
    while len(b) < batch_size:
        for i,j in a.items():
            if j == length:
                b.append(i)
                if len(b) == batch_size:
                    return b
        length +=1
    return b

In [351]:
def get_index(source,length, batch_size=15):
    '''
        RETURN LIST OF DATAFRAME (source) INDEXES WHERE TWEET LENGTH >= "length" 
    '''
    # dict tweet_index:tweet_len
    a = dict(zip(range(len(source)),map(len,source) ))
    
    
    b = [] 
    while len(b) < batch_size:
        for i,j in a.items():
            if j == length:
                b.append(i)
                if len(b) == batch_size:
                    return b
        length +=1
    return b

In [354]:
get_index(result,length=3,batch_size=3)

[141, 508, 571]

In [355]:
def get_batch_indexes(source, BATCH_SIZE = 15, same_length = True):
    '''
        RETURN RANDOM LIST OF DATAFRAME INDEXES WITH SAME TWEET LENGTH (if same_length = True)
             AND RANDOM CHOSEN #BATCH_SIZE# INDEXES
        same_length: True  will return  a batch with the same tweet length //RANDOM CHOSEN TWEET LENGTH
        same_length: False will return  a random choosen batch
    '''
    a = dict(zip(range(len(source)),map(len,source) ))
    
    if same_length:
        length = np.random.choice( list(a.values()) )
        batch_indexes = np.random.choice( get_index(source,length,batch_size = BATCH_SIZE*2 ) ,size = BATCH_SIZE )
    else: 
        batch_indexes = np.random.choice( range(len(source)) ,size = BATCH_SIZE )
        
    return batch_indexes

In [357]:
get_batch_indexes(result,BATCH_SIZE=3)

array([116,  58, 128])

#### Финальная итерирующая функция

In [468]:
# Финальная итерирующая функция

def iterate_minibatches(source,target, batch_size = 15, same_length=True, dropout=1, **kwargs):
    # check for same_length
    if same_length:
        batch_indexes = get_batch_indexes(source,BATCH_SIZE = batch_size)
    else:
        batch_indexes = get_batch_indexes(source,BATCH_SIZE = batch_size, same_length = False)
    
    # make a minibatch 
    X = source[batch_indexes]
    y = target[batch_indexes]
    
    # make dropout for the minibatch
    if dropout !=1:
        X = np.array( [make_word_dropout(text=i , probability = dropout, **kwargs ) for i in X], dtype=object)
    
    # make embeddings for each word in minibatch. 
    # word '#DROP#' will be embedded as np.full(300,1.)
    # paddings '#PAD#' will be embedded as np.zeros(300)
    # paddings requirement are calculated as max_len - len(X[i]) 
    new_X = []
    max_len = max(map(len,X))
    
    for j in X:
        while len(j) < max_len: j = np.append(j,['#PAD#'])
        new_X.append(  
            [word_to_vec_tweets.wv[i] if i in word_to_vec_tweets.wv else np.full(
            (300,), 1.) if '#DROP#' in i else np.zeros(300) for i in j ])
    X = np.array( new_X , dtype=float)
    
    return X,y

In [533]:
def make_whole_source_embedding(X,use_tqdm=True):
    new_X = []
    max_len = max(map(len,X))
    if use_tqdm:
        for j in tqdm(X):
            while len(j) < max_len: j = np.append(j,['#PAD#'])
            new_X.append(  
                [word_to_vec_tweets.wv[i] if i in word_to_vec_tweets.wv else np.full(
                (300,), 1.) if '#DROP#' in i else np.zeros(300) for i in j ])
        X = np.array( new_X , dtype=float)
        
    else:
        for j in X:
            while len(j) < max_len: j = np.append(j,['#PAD#'])
            new_X.append(  
                [word_to_vec_tweets.wv[i] if i in word_to_vec_tweets.wv else np.full(
                (300,), 1.) if '#DROP#' in i else np.zeros(300) for i in j ])
        X = np.array( new_X , dtype=float)
    return X

#### Проверка итерирующей функции

In [358]:
get_batch_indexes(result,BATCH_SIZE = 3)

array([83, 38, 61])

In [469]:
l,n = iterate_minibatches(result,y,batch_size = 3, same_length=False, dropout=1)
n, len(l), list( map(len,l))

(array([0, 1, 0]), 3, [26, 26, 26])

### модель 2directional LSTM + Dense(Linear)

In [212]:
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split

In [330]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(result, y, test_size=0.33, random_state=255)
X_train = np.array( X_train, dtype=object)
X_test = np.array( X_test, dtype=object)

In [591]:
# class 2dir_LSTM + DENSE
class Net(torch.nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        self.lstm = torch.nn.LSTM(input_size=300, hidden_size=128,batch_first = True, bidirectional=True) #dropout=.3,num_layers=2
        self.dense = torch.nn.Linear(in_features=128, out_features = 1, bias = True)
        
    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x)
        longterm = torch.add(longterm[0],longterm[1])/2
        predict = torch.sigmoid(self.dense(longterm))
        return predict
    
net = Net()
print(net)

Net(
  (lstm): LSTM(300, 128, batch_first=True, bidirectional=True)
  (dense): Linear(in_features=128, out_features=1, bias=True)
)


In [None]:
# train
criteria = torch.nn.BCELoss()

EPOCHS = 5
batch_size = 15
one_epoch_size = len(X_train)//batch_size
min_loss = 1e-5

for epoch in range(EPOCHS):
    for iteration in tqdm(range(one_epoch_size)):
        l,n = iterate_minibatches(X_train,y_train,batch_size = batch_size, same_length=True, dropout=1)
        #l = np.array(l, dtype=np.float64)
        l = torch.from_numpy(l).float()

        n = torch.from_numpy(n.reshape(-1,1)).float()
        
        v = net(l)

        loss = criteria(v,n)
        if loss < min_loss: break
        print('loss = ',loss)

        loss.backward()

        optimizer = torch.optim.RMSprop( net.parameters(), lr = 0.01)

        optimizer.step()
        optimizer.zero_grad()
    # at the end of each epoch print the last loss
    print(f'epoch {epoch}. Loss = {loss}')


In [598]:
# train function
def net_train(model = net, EPOCHS = 5, batch_size = 15,min_loss = 1e-5, **kwargs ):
    criteria = torch.nn.BCELoss()
    one_epoch_size = len(X_train)//batch_size

    for epoch in range(EPOCHS):
        for iteration in tqdm(range(one_epoch_size)):
            l,n = iterate_minibatches(X_train,y_train, batch_size = batch_size, same_length=True,  **kwargs )
            #l = np.array(l, dtype=np.float64)
            l = torch.from_numpy(l).float()

            n = torch.from_numpy(n.reshape(-1,1)).float()

            v = model(l)

            loss = criteria(v,n)
            if loss < min_loss: break
            #print('loss = ',loss)

            loss.backward()

            optimizer = torch.optim.RMSprop( net.parameters(), lr = 0.01 )

            optimizer.step()
            optimizer.zero_grad()
        # at the end of each epoch print the last loss
    print(f'epoch {epoch +1}. Loss = {loss}')
    return None

In [593]:
net_train(model = net )

  8%|███▏                                     | 22/286 [00:02<00:32,  8.08it/s]
  2%|█                                         | 7/286 [00:00<00:38,  7.22it/s]
  1%|▌                                         | 4/286 [00:00<00:43,  6.56it/s]
  6%|██▍                                      | 17/286 [00:02<00:34,  7.69it/s]
  1%|▍                                         | 3/286 [00:00<00:30,  9.37it/s]

epoch 5. Loss = 8.110688213491812e-06





#### Проверка на тестовой выборке:
Делаем проверочную функцию для X_test

In [594]:
def net_evaluate(model = net):    
    times = len(X_test) // 100 
    new_v = torch.tensor([])

    for  i in tqdm(range(times+1)):
        try:
            new_X = make_whole_source_embedding(X_test[i*100:i*100+100], use_tqdm=False)
        except:
            new_X = make_whole_source_embedding(X_test[i*100:], use_tqdm=False)

        new_X =  torch.from_numpy(new_X).float()

        with torch.no_grad():
            v = model(new_X)

        new_v = torch.cat([new_v,v])

    new_y = torch.from_numpy(y_test.reshape(-1,1)).float()        
    loss = criteria(new_v,new_y)
    print(f'The test loss is {loss}')

    y_pred = (new_v > 0.5).numpy().astype(int).reshape(1,-1)[0]
    
    assert len(y_pred) == len(X_test)
    accuracy = y_pred == y_test
    
    print('accuracy = ', np.sum(accuracy) / len(accuracy) )
    
    print(classification_report(y_test, y_pred,target_names=['fake','real']))
    return None

net_evaluate(net)

100%|██████████████████████████████████████████| 22/22 [00:09<00:00,  2.41it/s]

The test loss is 8.27254867553711
accuracy =  0.7390278433223219
              precision    recall  f1-score   support

        fake       0.67      0.90      0.77      1012
        real       0.87      0.59      0.70      1107

    accuracy                           0.74      2119
   macro avg       0.77      0.75      0.73      2119
weighted avg       0.77      0.74      0.73      2119






In [685]:
a = {'model':[],'f1_score':[] }


In [686]:
a['model'].append('2directional LSTM + Dense(Linear)')
a['f1_score'].append('0.74')
a

{'model': ['2directional LSTM + Dense(Linear)'], 'f1_score': ['0.74']}

Точность уже 0.74 неплохо. что можно сделать еще.

### модель 2directional LSTM + Dense(Linear) + WORD_Dropout

In [599]:
net_train(model = net,**{'dropout':.5} )

 11%|████▍                                    | 31/286 [00:05<00:41,  6.17it/s]
  0%|                                                  | 0/286 [00:00<?, ?it/s]
 24%|█████████▋                               | 68/286 [00:09<00:29,  7.28it/s]
  3%|█▍                                       | 10/286 [00:01<00:42,  6.45it/s]
  0%|▏                                         | 1/286 [00:00<00:59,  4.76it/s]

epoch 5. Loss = 5.28496173046733e-07





In [600]:
net_evaluate(net)

100%|██████████████████████████████████████████| 22/22 [00:09<00:00,  2.31it/s]

The test loss is 20.221128463745117
accuracy =  0.6281264747522416
              precision    recall  f1-score   support

        fake       0.56      0.99      0.72      1012
        real       0.98      0.30      0.45      1107

    accuracy                           0.63      2119
   macro avg       0.77      0.64      0.59      2119
weighted avg       0.78      0.63      0.58      2119






In [687]:
a['model'].append('2directional LSTM + Dense(Linear) + WORD_Dropout')
a['f1_score'].append('0.63')
a

{'model': ['2directional LSTM + Dense(Linear)',
  '2directional LSTM + Dense(Linear) + WORD_Dropout'],
 'f1_score': ['0.74', '0.63']}

С дропаутом слов оказалось хуже. 
А если с дропаутом скрытых состояний?

### модель 2directional LSTM + Dense(Linear) + LSTM_hidden_state_Dropout

In [602]:
class Net(torch.nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        self.lstm = torch.nn.LSTM(input_size=300, hidden_size=128,batch_first = True, bidirectional=True,dropout=.5,num_layers=2) #dropout=.3,num_layers=2
        self.dense = torch.nn.Linear(in_features=128, out_features = 1, bias = True)
        
    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x)
        longterm = torch.add(longterm[0],longterm[1])/2
        predict = torch.sigmoid(self.dense(longterm))
        return predict
    
net = Net()
print(net)

Net(
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (dense): Linear(in_features=128, out_features=1, bias=True)
)


In [None]:
net_train(model = net )

In [608]:
net_evaluate(net)

100%|██████████████████████████████████████████| 22/22 [00:15<00:00,  1.41it/s]

The test loss is 8.553658485412598
accuracy =  0.8692779613025012
              precision    recall  f1-score   support

        fake       0.84      0.89      0.87      1012
        real       0.90      0.85      0.87      1107

    accuracy                           0.87      2119
   macro avg       0.87      0.87      0.87      2119
weighted avg       0.87      0.87      0.87      2119






In [None]:
Точность выросла 0.87.

In [688]:
a['model'].append('2directional LSTM + Dense(Linear) + LSTM_hidden_state_Dropout')
a['f1_score'].append('0.87')
a

{'model': ['2directional LSTM + Dense(Linear)',
  '2directional LSTM + Dense(Linear) + WORD_Dropout',
  '2directional LSTM + Dense(Linear) + LSTM_hidden_state_Dropout'],
 'f1_score': ['0.74', '0.63', '0.87']}

### модель LSTM + Dense(Linear)  without Dropout

In [671]:
class Net(torch.nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        self.lstm = torch.nn.LSTM(input_size=300, hidden_size=128,batch_first = True, bidirectional=False) #dropout=.3,num_layers=2
        self.dense = torch.nn.Linear(in_features=128, out_features = 1, bias = True)
        
    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x)
        predict = torch.sigmoid(self.dense(longterm))
        return predict.squeeze(0)
    
net = Net()
print(net)

Net(
  (lstm): LSTM(300, 128, batch_first=True)
  (dense): Linear(in_features=128, out_features=1, bias=True)
)


In [627]:
net_train(model = net )

 45%|██████████████████▏                     | 130/286 [00:07<00:08, 18.56it/s]
  6%|██▌                                      | 18/286 [00:01<00:15, 17.82it/s]
  5%|█▊                                       | 13/286 [00:00<00:19, 13.98it/s]
 12%|████▊                                    | 34/286 [00:02<00:15, 16.26it/s]
  0%|▏                                         | 1/286 [00:00<00:17, 16.67it/s]

epoch 5. Loss = 0.0





In [701]:
net_evaluate(model = net)

100%|██████████████████████████████████████████| 22/22 [00:05<00:00,  3.84it/s]

The test loss is 0.6927607655525208
accuracy =  0.5181689476168003
              precision    recall  f1-score   support

        fake       0.33      0.01      0.02      1012
        real       0.52      0.98      0.68      1107

    accuracy                           0.52      2119
   macro avg       0.43      0.50      0.35      2119
weighted avg       0.43      0.52      0.36      2119






Без 2 directional (только LSTM) - результат 0.65

In [689]:
a['model'].append('LSTM + Dense(Linear)')
a['f1_score'].append('0.65')
a

{'model': ['2directional LSTM + Dense(Linear)',
  '2directional LSTM + Dense(Linear) + WORD_Dropout',
  '2directional LSTM + Dense(Linear) + LSTM_hidden_state_Dropout',
  'LSTM + Dense(Linear)'],
 'f1_score': ['0.74', '0.63', '0.87', '0.65']}

### модель LSTM + Dense(Linear)  + LSTM_hidden_state_Dropout

In [661]:
class Net(torch.nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        self.lstm = torch.nn.LSTM(input_size=300, hidden_size=128,batch_first = True, bidirectional=False,dropout=.5,num_layers=2) 
        self.dense = torch.nn.Linear(in_features=128, out_features = 1, bias = True)
        
    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x)
        longterm = (longterm[0]+longterm[1])/2
        predict = torch.sigmoid(self.dense(longterm))
        return predict
    
net = Net()
print(net)

Net(
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True)
  (dense): Linear(in_features=128, out_features=1, bias=True)
)


In [659]:
net_train(model = net)

 56%|██████████████████████▌                 | 161/286 [00:16<00:12,  9.65it/s]
  8%|███▍                                     | 24/286 [00:03<00:34,  7.59it/s]
  6%|██▍                                      | 17/286 [00:01<00:29,  9.09it/s]
 13%|█████▎                                   | 37/286 [00:04<00:27,  8.96it/s]
  4%|█▌                                       | 11/286 [00:01<00:34,  7.91it/s]

epoch 5. Loss = 5.2770928959944285e-06





In [660]:
net_evaluate(model = net)

100%|██████████████████████████████████████████| 22/22 [00:09<00:00,  2.28it/s]

The test loss is 14.961292266845703
accuracy =  0.6965549787635678
              precision    recall  f1-score   support

        fake       0.72      0.60      0.66      1012
        real       0.68      0.78      0.73      1107

    accuracy                           0.70      2119
   macro avg       0.70      0.69      0.69      2119
weighted avg       0.70      0.70      0.69      2119






LSTM + DENSE + Dropout результат 0.70

In [690]:
a['model'].append('LSTM + Dense(Linear) + LSTM_hidden_state_Dropout')
a['f1_score'].append('0.70')
a

{'model': ['2directional LSTM + Dense(Linear)',
  '2directional LSTM + Dense(Linear) + WORD_Dropout',
  '2directional LSTM + Dense(Linear) + LSTM_hidden_state_Dropout',
  'LSTM + Dense(Linear)',
  'LSTM + Dense(Linear) + LSTM_hidden_state_Dropout'],
 'f1_score': ['0.74', '0.63', '0.87', '0.65', '0.70']}

### LSTM + Dense(Linear) + WORD_Dropout

In [670]:
net_train(model = net,**{'dropout':0.5})

 44%|█████████████████▍                      | 125/286 [00:26<00:34,  4.73it/s]
  3%|█▎                                        | 9/286 [00:01<00:44,  6.29it/s]
 11%|████▌                                    | 32/286 [00:05<00:45,  5.63it/s]
  7%|██▊                                      | 20/286 [00:03<00:46,  5.78it/s]
  0%|▏                                         | 1/286 [00:00<00:51,  5.56it/s]

epoch 5. Loss = 5.881017273168254e-07





In [673]:
net_evaluate(model = net)

100%|██████████████████████████████████████████| 22/22 [00:05<00:00,  3.74it/s]

The test loss is 0.6927607655525208
accuracy =  0.5181689476168003
              precision    recall  f1-score   support

        fake       0.33      0.01      0.02      1012
        real       0.52      0.98      0.68      1107

    accuracy                           0.52      2119
   macro avg       0.43      0.50      0.35      2119
weighted avg       0.43      0.52      0.36      2119






In [691]:
a['model'].append('LSTM + Dense(Linear) + WORD_Dropout')
a['f1_score'].append('0.52')
a

{'model': ['2directional LSTM + Dense(Linear)',
  '2directional LSTM + Dense(Linear) + WORD_Dropout',
  '2directional LSTM + Dense(Linear) + LSTM_hidden_state_Dropout',
  'LSTM + Dense(Linear)',
  'LSTM + Dense(Linear) + LSTM_hidden_state_Dropout',
  'LSTM + Dense(Linear) + WORD_Dropout'],
 'f1_score': ['0.74', '0.63', '0.87', '0.65', '0.70', '0.52']}

###  Results:

In [697]:
results = pd.DataFrame(a)
results.sort_values(by='f1_score',ascending=False,inplace=True)
results

Unnamed: 0,model,f1_score
2,2directional LSTM + Dense(Linear) + LSTM_hidde...,0.87
0,2directional LSTM + Dense(Linear),0.74
4,LSTM + Dense(Linear) + LSTM_hidden_state_Dropout,0.7
3,LSTM + Dense(Linear),0.65
1,2directional LSTM + Dense(Linear) + WORD_Dropout,0.63
5,LSTM + Dense(Linear) + WORD_Dropout,0.52


Самый лучший результат показала модель: 
__"2directional LSTM + Dense(Linear) + LSTM_hidden_state_Dropout"__

с резульатом __f1_score = 0.87__