In [1]:
import tensorflow as tf
import tensorflow.keras as keras 

import pandas as pd
import numpy as np
pd.set_option('max_colwidth', None)

In [2]:
raw_df = pd.read_csv("congressional_tweet_training_data.csv", encoding = 'utf-8')
raw_df

Unnamed: 0,favorite_count,full_text,hashtags,retweet_count,year,party_id
0,0,"b""RT @KUSINews: One of our longtime viewers was in Congressman @DarrellIssa's office today and sent us these awesome photos, he has a #KUSI c\xe2\x80\xa6""",KUSI,10,2017.0,R
1,258,"b""Today I'm urging the @CDCgov to immediately launch a 24/7 phone hotline to address questions from Americans regarding the #Coronavirus.\n\nI'm also urging the agency to hold regular calls with state &amp; local health officials to provide up-to-date info &amp; provide any resources needed. https://t.co/xRzNim8RHM""",Coronavirus,111,2020.0,R
2,0,"b'Tomorrow, #MO03 seniors graduate from Calvary Lutheran, Ft. Zumwalt East, Ft. Zumwalt West, Ft. Zumwalt North and Ft. Zumwalt South (2/3)'",MO03,2,2014.0,R
3,9,b'Congrats to #TeamUSA and Canton Native @JGreenway12 on winning the #WorldJuniors gold medal last night! https://t.co/p9VBIe9B6B',TeamUSA WorldJuniors,3,2017.0,R
4,3,"b'Pleased to support @amergateways at their June Fiesta, which honored #ImmigrantHeritageMonth. It provides critical legal &amp; advocacy services to Central Texas immigrant &amp; refugee communities. Thanks to Exec Director Rebecca Lightsey, staff, &amp; volunteers for your important work. https://t.co/VLrOR1IbiG'",ImmigrantHeritageMonth,3,2019.0,D
...,...,...,...,...,...,...
592798,3,"b'This time, it focused on careers in #publicservice and #publicsafety. https://t.co/7DuBnwTaZH'",publicservice publicsafety,0,2017.0,R
592799,5,"b'.#StormyDaniels, #MichaelWolfe, #JamesComey Making the FBI Proud Again... https://t.co/n3cJZ2MF4z'",StormyDaniels MichaelWolfe JamesComey,1,2018.0,R
592800,33,"b'@NRDems The American people deserve the truth and Congress deserves the information we request. But this Administration has time and again refused, so we\xe2\x80\x99re going to get the subpoena power to get what we need to combat this #CultureOfCorruption. https://t.co/2zor7yNHCd'",CultureOfCorruption,14,2020.0,D
592801,4,b'Only 2 weeks left to submit your #app to the Congressional App Challenge! #copolitics #CAC16 #HouseOfCode #co06 https://t.co/WqzRbeo0y6',app copolitics CAC16 HouseOfCode co06,3,2016.0,R


In [3]:
import re
from gensim.parsing.preprocessing import remove_stopwords
class data_pipeline:
    data = None
    def __init__(self,df):
        self.data = list(df["full_text"])
    
    def remove_noise(self):
        index = []
        # Remove Emails
        self.data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in self.data]
        # Remove new line characters
        self.data = [re.sub(r'\s+', ' ', sent) for sent in self.data]
        # Remove distracting single quotes
        self.data = [re.sub("\'", "", sent) for sent in self.data]
        # Remove url
        self.data = [re.sub(r"http\S+", "", sent) for sent in self.data]
        # Remove unicode 
        self.data = [re.sub(r'\\\w{3}','',sent) for sent in self.data]
        #Remove punctuation
        self.data = [re.sub('[^\w\s]','',sent)for sent in self.data]
        #Record short tweets index
        index = [i for i in range(len(self.data)) if len(self.data[i])>=2 and self.data[i][0]=='b']
        #Remove leading b if any
        self.data = [sent[1:] for sent in self.data if len(sent)>=2 and sent[0]=='b']
        #Remove extra white space
        self.data = [re.sub('\\s+',' ',sent) for sent in self.data]
        #Remove stop words
        self.data = [remove_stopwords(sent) for sent in self.data]
        #convert the words to lower case 
        self.data =[sent.lower() for sent in self.data ]
        return self.data,index
        

In [4]:
bronze_df = raw_df.loc[:,['full_text','party_id']]
data_pipeline = data_pipeline(bronze_df)
cleaned_data,index = data_pipeline.remove_noise()

In [5]:
target = bronze_df.iloc[index,1].to_numpy()


In [6]:
target_num = []
for i in target:
    if(i=="R"):
        target_num.append(1)
    else:
        target_num.append(0)
        
target_num = np.array(target_num)

In [7]:
len(target_num)

566818

### Basic NLP

In [8]:
from collections import Counter

# Count unique words
def counter_word(text_arr):
    count = Counter()
    for i in text_arr:
        for word in i.split():
            count[word] += 1
    return count

counter = counter_word(cleaned_data)



In [9]:
counter

Counter({'rt': 21981,
         'one': 2871,
         'longtime': 185,
         'viewers': 31,
         'congressman': 1241,
         'office': 10584,
         'today': 63798,
         'sent': 1986,
         'awesome': 571,
         'photos': 954,
         'kusi': 8,
         'c': 295,
         'im': 26406,
         'urging': 1999,
         'immediately': 1356,
         'launch': 817,
         '247': 193,
         'phone': 1137,
         'hotline': 404,
         'address': 6483,
         'questions': 5070,
         'americans': 26207,
         'coronavirusn': 101,
         'agency': 1152,
         'hold': 3775,
         'regular': 308,
         'calls': 1520,
         'state': 13244,
         'amp': 136808,
         'local': 9647,
         'health': 28226,
         'officials': 2679,
         'provide': 7226,
         'uptodate': 268,
         'info': 3739,
         'resources': 6742,
         'needed': 2814,
         'tomorrow': 6727,
         'mo03': 76,
         'seniors': 3586,
    

In [10]:
num_words = len(counter)

# Max number of words in a sequence
max_length = 49

In [11]:
print(num_words)

242982


Train / test split

In [12]:
train = np.array(cleaned_data)
train_size = int(train.shape[0] * 0.9)
val_size = int(train.shape[0]*0.05)

train_sentences = train[:train_size]
train_labels = target_num[:train_size]

val_sentences = train[train_size+1:train_size+val_size]
val_labels = target_num[train_size+1:train_size+val_size]

test_sentences = train[train_size+val_size+1:-1]
test_labels = target_num[train_size+val_size+1:-1]

In [13]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_sentences)

In [14]:
word_index = tokenizer.word_index

In [15]:
word_index

{'amp': 1,
 'i': 2,
 'today': 3,
 'the': 4,
 'we': 5,
 'great': 6,
 'work': 7,
 'act': 8,
 'this': 9,
 'help': 10,
 'health': 11,
 'need': 12,
 'time': 13,
 'im': 14,
 'americans': 15,
 'house': 16,
 'people': 17,
 'support': 18,
 'proud': 19,
 'day': 20,
 'families': 21,
 'congress': 22,
 'new': 23,
 'american': 24,
 'rt': 25,
 'thank': 26,
 'care': 27,
 'w': 28,
 'us': 29,
 'its': 30,
 'women': 31,
 'covid19': 32,
 'years': 33,
 'senate': 34,
 'community': 35,
 'country': 36,
 'president': 37,
 'working': 38,
 'week': 39,
 'join': 40,
 'n': 41,
 'protect': 42,
 'thanks': 43,
 'jobs': 44,
 'vote': 45,
 'continue': 46,
 'fight': 47,
 'trump': 48,
 'my': 49,
 'like': 50,
 'communities': 51,
 'happy': 52,
 'honor': 53,
 'year': 54,
 'national': 55,
 'state': 56,
 'in': 57,
 'students': 58,
 'bipartisan': 59,
 'veterans': 60,
 'live': 61,
 'important': 62,
 'it': 63,
 'tax': 64,
 'passed': 65,
 'economy': 66,
 'as': 67,
 'workers': 68,
 'lives': 69,
 'discuss': 70,
 'right': 71,
 'good': 

In [16]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)

In [17]:
train_sequences[0]

[25, 474, 5220, 15029, 1155, 84, 3, 721, 2295, 1531, 29160, 3655]

In [18]:
from keras_preprocessing.sequence import pad_sequences

train_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding="post", truncating="post"
)

In [19]:
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(
    test_sequences, maxlen=max_length, padding="post", truncating="post"
)

In [20]:
val_sequences = tokenizer.texts_to_sequences(val_sentences)
val_padded = pad_sequences(
    val_sequences, maxlen=max_length, padding="post", truncating="post"
)

Check inverse

In [21]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [22]:
def decode(text):
    return " ".join([reverse_word_index.get(i, "?") for i in text])

In [23]:
decode(train_sequences[0])

'rt one longtime viewers congressman office today sent awesome photos kusi c'

In [24]:
print(f"Shape of train {train_padded.shape}")
print(f"Shape of train {val_padded.shape}")
print(f"Shape of test {test_padded.shape}")

Shape of train (510136, 49)
Shape of train (28339, 49)
Shape of test (28340, 49)


In [25]:
embedding_dict = {}
with open("glove.twitter.27B.200d.txt", "r") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], "float32")
        embedding_dict[word] = vectors
f.close()

In [26]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 200))

for word, i in word_index.items():
    if i < num_words:
        emb_vec = embedding_dict.get(word)
        if emb_vec is not None:
            embedding_matrix[i] = emb_vec

In [27]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.12293   ,  0.20598   ,  0.056996  , ...,  0.45835999,
         0.13722   ,  0.66996998],
       [ 0.056404  ,  0.49535999,  0.18438999, ...,  0.63598001,
        -0.18880001, -0.035558  ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [28]:
print(train_padded.shape)
print(train_labels.shape)

print(test_padded.shape)
print(test_labels.shape)

print(val_padded.shape)
print(val_labels.shape)

(510136, 49)
(510136,)
(28340, 49)
(28340,)
(28339, 49)
(28339,)


In [29]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout,Bidirectional
from keras.initializers import Constant
from keras.optimizers import Adam

model = Sequential([
     Embedding(num_words,
        200,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=max_length),
    Bidirectional(LSTM(256,dropout = 0.5)),
    Dense(256,activation ="relu"),
    Dense(1, activation="sigmoid")
    
    
    
])

# model.add(
#     Embedding(num_words,
#         100,
#         embeddings_initializer=Constant(embedding_matrix),
#         input_length=max_length,
#         trainable=False))
# model.add(LSTM(50, dropout=0.2))
# model.add(Dense(1, activation="sigmoid"))


optimizer = Adam(learning_rate= 0.0007)

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer=optimizer, metrics=["accuracy"])



Metal device set to: Apple M1


2022-07-04 14:59:21.738178: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-04 14:59:21.738264: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [30]:
# from keras.models import Sequential
# from keras.layers import Embedding, LSTM, Dense, Dropout
# from keras.initializers import Constant
# from keras.optimizers import Adam

# model = Sequential()

# model.add(Embedding(num_words, 32, input_length=max_length))
# model.add(LSTM(64, dropout=0.1))
# model.add(Dense(1, activation="sigmoid"))


# optimizer = Adam(learning_rate=0.001)

# model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer=optimizer, metrics=["accuracy"])

In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 49, 200)           45561400  
                                                                 
 bidirectional (Bidirectiona  (None, 512)              935936    
 l)                                                              
                                                                 
 dense (Dense)               (None, 256)               131328    
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 46,628,921
Trainable params: 46,628,921
Non-trainable params: 0
_________________________________________________________________


In [32]:
mod= model.fit(
    train_padded, train_labels, epochs=5, validation_data=(val_padded, val_labels), batch_size= 512
)

Epoch 1/5


2022-07-04 14:59:22.154810: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
  return dispatch_target(*args, **kwargs)
2022-07-04 14:59:23.524510: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-04 14:59:23.766445: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-04 14:59:23.783584: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-04 14:59:24.277419: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-04 14:59:24.292778: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-07-04 15:05:41.975060: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-04 15:05:42.104539: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-04 15:05:42.116547: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [33]:
prediction = model.predict(test_padded)

2022-07-04 15:33:19.260600: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-04 15:33:19.362106: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-04 15:33:19.373165: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




In [37]:
test_labels

array([1, 1, 0, ..., 1, 1, 1])

In [38]:
pred = []
for i in prediction:
    if i >=0.5:
        pred.append(1)
    else:
        pred.append(0)

In [39]:
from sklearn.metrics import accuracy_score
print(f"the accuracy of the model is {accuracy_score(test_labels,pred)}")

the accuracy of the model is 0.8779463655610444
