In [115]:
!pip install num2words
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import numpy as np
import math
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split



In [116]:
df = pd.read_csv('covid_tweets.csv')
democrat_tweets_df = df[df.party == 'Democrat']
republican_tweets_df = df[df.party == 'Republican']
democrat_tweets_df = democrat_tweets_df.sample(frac = 0.5)
frames = [democrat_tweets_df, republican_tweets_df]
df = pd.concat(frames)

In [117]:
df.head()

Unnamed: 0,name,twitter,party,tweet,tweet_published
27716,Kamala D. Harris,SenKamalaHarris,Democrat,"Farm workers have always been essential, and i...",2020-04-08
36413,Debbie Mucarsel-Powell,RepDMP,Democrat,COVID-19 is surging. Floridians are frustrated...,2020-06-29
33773,Joseph D. Morelle,RepJoeMorelle,Democrat,I’m live on @warm1013 with my friend @TonyWarm...,2020-04-10
18003,Nydia M. Velázquez,NydiaVelazquez,Democrat,This #OlderAmericansMonth we celebrate the con...,2020-05-26
42265,Elaine G. Luria,RepElaineLuria,Democrat,"Recently, @Apple collaborated with @CDCgov to ...",2020-04-03


In [118]:
import tensorflow as tf
tf.test.is_gpu_available( cuda_only=False, min_cuda_compute_capability=None )
print(tf.__version__)

2.3.0


In [119]:
from nltk.tokenize import word_tokenize
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
covid_stop_words = {"covid", "covid19", "covid 19", "corona", "coronavirus"}
porter = PorterStemmer()
def preprocess_and_tokenize(tweet):
    # Remove non-letters, lowercase everything, remove stop words, and stem
    lower_letters = re.sub(r'[^A-Za-z0-9 ]+', " ", tweet).lower().split()
    important_words = []
    for w in lower_letters:
        if w not in stop_words and w not in covid_stop_words:
            important_words.append(w)
    return important_words

In [120]:
df['tokenize_tweet'] = df['tweet'].apply(lambda x : preprocess_and_tokenize(x))

In [121]:
dems, reps = list(), list()
for l in df.party:
    if l == 'Democrat':
        dems.append(1)
        reps.append(0)
    elif l == 'Republican':
        reps.append(1)
        dems.append(0)
df['dems']= dems
df['reps']= reps

In [122]:
df.head()

Unnamed: 0,name,twitter,party,tweet,tweet_published,tokenize_tweet,dems,reps
27716,Kamala D. Harris,SenKamalaHarris,Democrat,"Farm workers have always been essential, and i...",2020-04-08,"[farm, workers, always, essential, long, past,...",1,0
36413,Debbie Mucarsel-Powell,RepDMP,Democrat,COVID-19 is surging. Floridians are frustrated...,2020-06-29,"[covid, 19, surging, floridians, frustrated, w...",1,0
33773,Joseph D. Morelle,RepJoeMorelle,Democrat,I’m live on @warm1013 with my friend @TonyWarm...,2020-04-10,"[live, warm1013, friend, tonywarm1013, talking...",1,0
18003,Nydia M. Velázquez,NydiaVelazquez,Democrat,This #OlderAmericansMonth we celebrate the con...,2020-05-26,"[olderamericansmonth, celebrate, contributions...",1,0
42265,Elaine G. Luria,RepElaineLuria,Democrat,"Recently, @Apple collaborated with @CDCgov to ...",2020-04-03,"[recently, apple, collaborated, cdcgov, develo...",1,0


In [123]:
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(df, test_size=0.25, random_state=42)

### We build training vocab/max trailing sentence length/total number of words

In [124]:
all_training_words = [word for tokens in data_train["tokenize_tweet"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokenize_tweet"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

727693 words total, with a vocabulary size of 32772
Max sentence length is 92


### Build testing vocabulary 

In [125]:
all_test_words = [word for tokens in data_test["tokenize_tweet"] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test["tokenize_tweet"]]
TEST_VOCAB = sorted(list(set(all_test_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

241968 words total, with a vocabulary size of 19018
Max sentence length is 77


In [43]:
### Load up word2vec
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
word2vec.wv.save_word2vec_format('googlenews.txt')

  """


In [126]:
# Tokenize & Pad the Sequences

In [127]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["tweet"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["tweet"].tolist())
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))
train_rnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 36654 unique tokens.


Now we will get embeddings from Google News Word2Vec model and save them corresponding to the sequence number we assigned to each word. If we could not get embeddings we save a random vector for that word.

In [128]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(36655, 300)


In [129]:
test_sequences = tokenizer.texts_to_sequences(data_test["tweet"].tolist())
test_rnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

### Develop Bi-LSTM

In [130]:
label_names = ['dems', 'reps']

In [131]:
y_train = data_train[label_names].values
x_train = train_rnn_data
y_tr = y_train

In [132]:
import tensorflow as tf
print(tf.test.is_built_with_cuda())
print(tf.config.list_physical_devices('GPU')) 
print(tf.__version__)

False
[]
2.3.0


In [133]:
def recurrent_nn(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    lstm = LSTM(256)(embedded_sequences)
    
    x = Dense(128, activation='relu')(lstm)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [134]:
import tensorflow as tf
# tf.test.is_gpu_available( cuda_only=False, min_cuda_compute_capability=None )
print(tf.__version__)

2.3.0


In [135]:
rnn_model = recurrent_nn(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM,  len(list(label_names)))

Model: "functional_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 50, 300)           10996500  
_________________________________________________________________
lstm_6 (LSTM)                (None, 256)               570368    
_________________________________________________________________
dense_12 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 2)                 258       
Total params: 11,600,022
Trainable params: 603,522
Non-trainable params: 10,996,500
___________________________________

### Train Bi-LSTM

In [136]:
num_epochs = 20
batch_size = 20

In [137]:
hist = rnn_model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.10, shuffle=True, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [140]:
predictions = rnn_model.predict(test_rnn_data, batch_size=512, verbose=1)



In [141]:
labels = [1, 0]
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])
sum(data_test['dems'] == prediction_labels)/len(prediction_labels)

0.7487102523215459

In [163]:
# Creating a CNN
# 1. Embeddings matrix passed to embedding layer
# 2. Five different filter sizese are applied to each tweet
# 3. GlobalMaxPooling1D layers are applied to each layer
# 4. All outputs concatenated
# 5. Dropout -> Dense -> Dropout -> Dense applied
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model

In [164]:
def cnn(embeddings, max_seq_length, num_words, 
        embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                               embedding_dim,
                               weights=[embeddings],
                               input_length=max_seq_length,
                               trainable=False)
    
    sequence_input = Input(shape=(max_seq_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    
    convs = []
    filter_sizes = [2,3,4,5,6]
    
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, 
                        kernel_size=filter_size, 
                        activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)
        
    l_merge = concatenate(convs, axis=1)
    
    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [165]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300
cnn_model = cnn(train_embedding_weights, 
                MAX_SEQUENCE_LENGTH, 
                len(train_word_index)+1, 
                EMBEDDING_DIM, 
                len(list(label_names)))

Model: "functional_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 50, 300)      10996500    input_11[0][0]                   
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 49, 200)      120200      embedding_11[0][0]               
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 48, 200)      180200      embedding_11[0][0]               
______________________________________________________________________________________

In [172]:
num_epochs = 10
batch_size = 20

In [None]:
hist = cnn_model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.10, shuffle=True, batch_size=batch_size)

Epoch 1/10

In [170]:
predictions = cnn_model.predict(test_rnn_data, batch_size=1024, verbose=1)



In [171]:
labels = [1, 0]
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])
sum(data_test['dems'] == prediction_labels)/len(prediction_labels)

0.7736610074101866