In [None]:
# Mount Google Drive so the dataset and the saved model under /content/drive
# are reachable from this runtime. The google.colab module means this cell is
# meant to run inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense , Input , LSTM , Embedding
from tensorflow.keras.models import Model, Sequential

In [None]:
# Location of the tweets CSV on the mounted Google Drive (read by load_data).
file_path = "/content/drive/MyDrive/Project/Tweet/Tweets Dataset.csv"

In [None]:
def load_data(data_file):
  """Read the tweets CSV and return texts with binary party labels.

  Args:
    data_file: path to a CSV file that has a 'Tweet' and a 'Party' column.

  Returns:
    A tuple (tweets, labels) of two equally long lists:
      tweets -- tweet texts; a missing tweet becomes "" (empty string).
      labels -- 0 when Party is exactly "Democrat", 1 for any other party value.

  Rows whose Party is missing are dropped. Previously every NaN was replaced
  by "" first, so such rows were silently labelled 1 and polluted the
  training data with a label they never had.
  """
  df = pd.read_csv(data_file)

  # A row without a party has no usable target, so it cannot be a training example.
  df = df.dropna(subset=["Party"])

  # Only the text column gets the empty-string fallback for missing values.
  tweets = df["Tweet"].fillna("").tolist()
  parties = df["Party"].tolist()

  labels = [0 if party == "Democrat" else 1 for party in parties]

  return tweets, labels

In [None]:
# Read the CSV once, then keep NumPy copies of the texts and labels for
# splitting and training.
tweet, label = load_data(file_path)
tweets, labels = np.array(tweet), np.array(label)

In [None]:
# Hold out 20% of the tweets for testing. Stratifying on the labels keeps the
# party ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Hyperparameters shared by the preprocessing and model cells.
top_words = 50_000             # vocabulary cap handed to the Tokenizer
max_comment_length = 300       # every sequence is padded/truncated to this many tokens
embedding_vecor_length = 768   # embedding output dimension (name, typo included, is used by the model cell)

In [None]:
# Word-level Keras tokenizer; num_words limits the vocabulary used when texts
# are converted to integer sequences to the most frequent words.
tokenizer = Tokenizer(num_words=top_words)

In [None]:
# Build the vocabulary from the training texts only, so nothing about the test
# split leaks into preprocessing; then encode both splits as integer sequences.
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Bring every encoded tweet to the same length (max_comment_length) so the
# splits can be fed to the network as rectangular arrays.
X_train_pad, X_test_pad = [
    pad_sequences(seqs, maxlen=max_comment_length)
    for seqs in (X_train_seq, X_test_seq)
]

In [None]:
# Binary classifier: token ids -> learned embeddings -> LSTM -> one sigmoid unit.
# With the label encoding from load_data (Democrat = 0, anything else = 1) the
# output is the probability of the non-Democrat class.
#
# The input shape is declared with an explicit Input layer rather than the
# `input_length` argument of Embedding, which current Keras flags as deprecated.
# Declaring the input up front also builds the model immediately, so a separate
# model.build(...) call is no longer required before model.summary().
model = Sequential([
    Input(shape=(max_comment_length,)),
    Embedding(input_dim=top_words + 1, output_dim=embedding_vecor_length),
    LSTM(100),  # returns only the last hidden state (return_sequences defaults to False)
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy matches the single sigmoid output and the 0/1 labels;
# accuracy is tracked as the reporting metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Show the layer stack with output shapes and parameter counts.
model.summary()

In [None]:
# Train for a fixed 4 epochs in mini-batches of 64.
# NOTE: the test split is also passed as validation data, so the validation
# metrics printed during training are measured on the same data that is later
# used for the final accuracy figure.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=4,
    validation_data=(X_test_pad, y_test),
    verbose=1,
)

In [None]:
# Persist the trained model to Google Drive; the reload cell reads this same path.
# NOTE(review): recent Keras releases appear to treat .h5 as a legacy format -
# confirm whether switching to .keras is wanted (the reload path must change too).
model.save("/content/drive/MyDrive/Project/Tweet/RNN/RNN.h5")

In [None]:
# Reload the saved model from Google Drive.
# load_model is taken from tensorflow.keras, the same namespace the model was
# built and saved with, instead of the standalone `keras` package, which can be
# a different installation/version than the one bundled with TensorFlow.
from tensorflow.keras.models import load_model
model = load_model("/content/drive/MyDrive/Project/Tweet/RNN/RNN.h5")

In [None]:
# Evaluate the reloaded model on the held-out test split.
# The padded test matrix from the preprocessing cell is reused instead of
# overwriting X_test: X_test keeps holding the raw tweet texts, so this cell
# gives the same result however many times it is executed.
prediction = model.predict(X_test_pad)

# Threshold the sigmoid output at 0.5 and flatten the (n, 1) column produced by
# the single-unit output layer so it lines up with the 1-D y_test.
y_pred = (prediction > 0.5).astype(int).ravel()

# accuracy_score takes (y_true, y_pred) in that order.
print("Accuracy of the model : ", accuracy_score(y_test, y_pred))

In [None]:
def predict_party(comment):
    """Classify one tweet text as "Republican" or "Democrat".

    The text is encoded with the fitted tokenizer and padded to
    max_comment_length; the model's sigmoid output for that single
    example is then compared against 0.5.

    Args:
        comment: a single tweet as a string.

    Returns:
        "Republican" when the predicted probability is >= 0.5, otherwise "Democrat".
    """
    padded = pad_sequences(
        tokenizer.texts_to_sequences([comment]),
        maxlen=max_comment_length,
    )
    # predict returns one row per input with one column; take the single scalar.
    republican_prob = model.predict(padded)[0][0]

    if republican_prob >= 0.5:
        return "Republican"
    return "Democrat"

In [None]:
def predict_probability(comments):
    """Return class probabilities for a batch of tweet texts.

    Args:
        comments: a sequence of tweet strings; it is handed straight to the
            tokenizer, so pass a list of texts rather than one bare string.

    Returns:
        A 2-D array with one row per text and two columns ordered as
        [Democrat, Republican]; the Democrat column is 1 minus the model output.
    """
    padded = pad_sequences(
        tokenizer.texts_to_sequences(comments),
        maxlen=max_comment_length,
    )
    republican = model.predict(padded)

    # Put the complement next to the model output: [Democrat, Republican]
    return np.column_stack((1 - republican, republican))

In [None]:
# Smoke test: classify one example tweet and print the predicted party.
# NOTE(review): the text contains what looks like mis-encoded characters
# ("surprâ€¦") - presumably carried over from the source data; confirm.
tweet = "Hurricane Maria left approx $90 billion in damages, yet only $1 billion was allocated for rebuilding grid. No surprâ€¦ https://t.co/2kU8BcKwUh"
print(predict_party(tweet))

In [None]:
# Second example tweet; the bare expression on the last line makes the
# notebook display the predicted party as the cell's output.
tweet = "Check out my op-ed on need for End Executive Overreach Act: The White House is crippling our economy https://t.co/XCmjLB8Qyd via @DCExaminer"
predict_party(tweet)