In [None]:
import os

import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Embedding
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
import torch

In [None]:
def load_data(data_file):
  """Load tweets and their binary party labels from a CSV file.

  The CSV must contain a 'Tweet' column (text) and a 'Party' column.
  Exact duplicate rows are removed, and rows missing either the tweet or
  the party are discarded so that an absent party is never silently
  turned into a label.

  Args:
    data_file: Path (or file-like object) accepted by ``pd.read_csv``.

  Returns:
    A ``(comments, labels)`` tuple of equal-length lists: the tweet texts
    and their integer labels, where 0 means "Democrat" and 1 means any
    other party value.

  Raises:
    KeyError: If the 'Tweet' or 'Party' column is missing.
  """
  df = pd.read_csv(data_file).drop_duplicates()
  # A row without text or without a party carries no usable training signal;
  # keeping it would feed empty tweets / made-up labels to the model.
  df = df.dropna(subset=['Tweet', 'Party'])

  comments = df['Tweet'].tolist()
  labels = [0 if party == "Democrat" else 1 for party in df['Party']]

  return comments, labels

In [None]:
# Load dataset
# Location of the tweets CSV. Set the TWEETS_CSV_PATH environment variable to
# run the notebook on another machine; without it the original local path is used.
file_path = os.environ.get(
    "TWEETS_CSV_PATH",
    "C:/Users/ADMIN PC/Desktop/Tweet/Tweets Dataset.csv",
)

In [None]:
# Read the CSV once; keep the raw Python lists (`tweet`, `label`) and also
# NumPy array copies of them (`tweets`, `labels`).
tweet, label = load_data(file_path)
tweets, labels = np.array(tweet), np.array(label)

In [None]:
# Length (in token ids) that every encoded tweet is padded/truncated to.
MAX_SEQUENCE_LENGTH = 300
# Vocabulary cap passed to the tokenizer; also used as the embedding table size.
MAX_NUM_WORDS = 50_000

In [None]:
# Keras text tokenizer shared by training and inference; num_words limits the
# vocabulary to MAX_NUM_WORDS when texts are converted to sequences.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

In [None]:
# Learn the vocabulary from the tweet list, then encode every tweet as a list
# of integer word indices.
# NOTE(review): the vocabulary is fitted on all tweets before the train/test
# split further down, so test tweets influence it.
tokenizer.fit_on_texts(tweet)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(tweet)
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Pad/truncate every encoded tweet to MAX_SEQUENCE_LENGTH token ids.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# One-hot encode from the untouched integer `label` list rather than from
# `labels` itself, so re-running this cell cannot re-encode already one-hot
# data. num_classes=2 pins the width to the two party classes.
labels = to_categorical(np.array(label), num_classes=2)

In [None]:
# Sanity check: display the shapes of the padded inputs and the encoded labels.
data.shape, labels.shape

In [None]:
# Hold out 20% of the samples for testing; stratify on the labels so both
# splits keep the same class balance, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Trainable embedding table: one EMBEDDING_DIM-sized vector per vocabulary index.
EMBEDDING_DIM = 768
num_words = MAX_NUM_WORDS
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)

In [None]:
# Text-CNN classifier: embedding -> two 1-D convolutions -> max pooling ->
# dense layer -> 2-way class distribution.
# The input length comes from MAX_SEQUENCE_LENGTH so it always matches the
# padding applied to the data.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(64, kernel_size=3, activation='relu')(embedded_sequences)
x = Conv1D(64, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Flatten()(x)
x = Dense(100, activation='relu')(x)
# softmax (not sigmoid): the targets are one-hot and the loss is categorical
# cross-entropy, so the two outputs must form a probability distribution.
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)

In [None]:
# Categorical cross-entropy matches the one-hot labels; 'acc' is the metric
# name that the evaluation cell prints later.
model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train the model with validation data: 10% of the training split is held out
# for validation, and training stops once validation loss stops improving,
# restoring the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
    callbacks=[early_stopping],
)


In [None]:
# Persist the trained model to disk (HDF5 file); the next cell reloads it from
# this same path.
model.save("C:/Users/ADMIN PC/Desktop/Tweet/CNN/CNN.h5")

In [None]:
# Reload the saved model from disk. `load_model` is imported from
# tensorflow.keras in the notebook's import cell, the same API used to build
# and save the model, instead of the standalone `keras` package.
model = load_model('C:/Users/ADMIN PC/Desktop/Tweet/CNN/CNN.h5')

In [None]:
# Evaluate the model on the held-out test split and print the metric at index 1
# as a percentage.
# NOTE(review): index 1 is presumably the 'acc' metric set at compile time
# (index 0 being the loss) — confirm for the reloaded model.
scores = model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

In [None]:
def predict_proba(arr):
    """Return the model's class scores for one or more raw texts.

    Texts are encoded with the notebook's fitted ``tokenizer`` and padded to
    ``MAX_SEQUENCE_LENGTH``, exactly like the training data, before being
    passed to ``model.predict``.

    Args:
        arr: An iterable of strings (list, NumPy array, ...) or a single
            string, which is treated as one text.

    Returns:
        The array produced by ``model.predict``, one row per input text.
    """
    # A bare string would be iterated character by character by the
    # tokenizer, yielding one "text" per character; wrap it instead.
    if isinstance(arr, str):
        arr = [arr]
    token_ids = tokenizer.texts_to_sequences(arr)
    padded = pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH)
    return model.predict(padded, verbose=0)

In [None]:
def predict_party(text):
    """Classify a single text as "Democrat" or "Republican".

    The text is scored with ``predict_proba``; the highest-scoring class
    index decides the label (index 0 -> "Democrat", anything else ->
    "Republican").
    """
    # predict_proba works on batches, so send a one-element batch and take
    # its only row.
    class_scores = predict_proba(np.array([text]))[0]
    top_class = np.argmax(class_scores)

    if top_class == 0:
        return "Democrat"
    return "Republican"

In [None]:
# Demo prediction. A dedicated name is used so the training tweet list bound
# to `tweet` (read by the tokenizer cell) is not overwritten by a string.
sample_tweet = "Hurricane Maria left approx $90 billion in damages, yet only $1 billion was allocated for rebuilding grid. No surprâ€¦ https://t.co/2kU8BcKwUh"
print(predict_party(sample_tweet))

In [None]:
# Demo prediction. A dedicated name is used so the training tweet list bound
# to `tweet` (read by the tokenizer cell) is not overwritten by a string.
sample_tweet = "Check out my op-ed on need for End Executive Overreach Act: The White House is crippling our economy https://t.co/XCmjLB8Qyd via @DCExaminer"
print(predict_party(sample_tweet))