In [3]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import  Counter
import re
from nltk.tokenize import word_tokenize
import gensim
import string
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForSequenceClassification

In [4]:
# Loading training data
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/final_labels.csv')

# Keep only rows that have a tweet body AND whose body is shorter than 200
# characters. Both conditions are combined into one mask over `data`, so the
# null filter is no longer computed and then thrown away by a second filter.
has_body = data['body'].notna()
is_short = data['body'].str.len() < 200
training_data = data[has_body & is_short]

In [None]:
# Encoding target labels as integers (0/1 when `level_1` has two classes).
# Features and labels are both taken from the filtered `training_data`
# (non-null bodies under 200 characters), not the raw CSV frame, so rows the
# tokenizer cannot handle never reach the model.
X = training_data['body']
label = LabelEncoder()
y = label.fit_transform(training_data['level_1'])

In [None]:
# Train / validation / test split of the training data:
# 80% goes to train, the held-out 20% is then halved into validation and test.
X_train, X_dev, y_train, y_dev = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_dev,
    y_dev,
    train_size=0.5,
    random_state=42,
)

In [None]:
# Tokenizer matching the 'bert-base-uncased' checkpoint; text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [None]:
def tensor_inputs(tweets, max_len = 200):
    """
    Converts tweets into tensor inputs for the BERT model.

    Each tweet is encoded with the module-level `tokenizer` and padded or
    truncated to exactly `max_len` token ids, so the per-tweet lists stack
    into rectangular tensors.

    Inputs:
    tweets: iterable of str, one tweet per element (e.g. a pandas Series)
    max_len: (int) number of token ids per tweet, special tokens included

    Outputs:
    input_ids: tensor of token ids, one row per tweet
    attention_masks: tensor with one row per tweet, as returned by the
        tokenizer's 'attention_mask' (marks real tokens vs. padding)
    """
    input_ids = []
    attention_masks = []
    for tweet in tweets:
        encoded_dict = tokenizer.encode_plus(
            tweet,
            add_special_tokens = True,
            max_length = max_len,
            # `pad_to_max_length` is deprecated; 'max_length' padding plus
            # explicit truncation guarantees every row has length `max_len`.
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = tf.convert_to_tensor(input_ids)
    attention_masks = tf.convert_to_tensor(attention_masks)

    return input_ids, attention_masks

In [None]:
# Converting the training and validation splits into tensors.
# `tensor_inputs` is the helper defined in this notebook (there is no `inputs`).
train_inp, train_mask = tensor_inputs(X_train)
val_inp, val_mask = tensor_inputs(X_valid)
train_label = tf.convert_to_tensor(y_train)
val_label = tf.convert_to_tensor(y_valid)

In [None]:
# Pretrained BERT with a 2-class sequence-classification head.
# No explicit device move: this is a TensorFlow/Keras model, which has no
# `.cuda()` method (that is PyTorch API); TensorFlow uses an available GPU
# automatically.
bert = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)

In [None]:
# Output locations: TensorBoard logs and the best-weights checkpoint.
# Named `log_dir` rather than `dir` so the builtin `dir()` is not shadowed.
log_dir = '/content/drive/My Drive/Colab Notebooks/data/CS122'
model_save = '/content/drive/My Drive/Colab Notebooks/data/CS122/cs122bert_test.h5'

# Both callbacks come from tf.keras, the same Keras implementation the model
# is built on. The checkpoint keeps only the weights of the epoch with the
# lowest validation loss.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath=model_save,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
    ),
    tf.keras.callbacks.TensorBoard(log_dir=log_dir),
]

# `summary()` prints the table itself and returns None, so it is called on its
# own line instead of being passed to print().
print('\nBert Model')
bert.summary()

# The model outputs raw logits, hence from_logits=True; labels are integer
# class ids, hence the sparse loss and metric.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00002, epsilon=1e-08)

bert.compile(loss=loss, optimizer=optimizer, metrics=[metric])

In [None]:
# Fine-tune BERT on the training tensors; the validation tensors are scored
# after every epoch and drive the checkpoint callback.
train_features = [train_inp, train_mask]
valid_features = [val_inp, val_mask]

misogynist = bert.fit(
    train_features,
    train_label,
    batch_size=32,
    epochs=4,
    validation_data=(valid_features, val_label),
    callbacks=callbacks,
)

In [None]:
# Predicting on test set.
# The attention mask is passed alongside the token ids, exactly as during
# training, so padded positions are ignored instead of being attended to.
test_input, test_mask = tensor_inputs(X_test)
test_pred = bert.predict([test_input, test_mask])

In [None]:
# Turning predictions into labels: logits -> class probabilities -> index of
# the most probable class for each test tweet.
test_probabilities = tf.nn.softmax(test_pred.logits)
test_prediction = tf.argmax(test_probabilities, axis=1).numpy()
test_prediction

In [None]:
# Wrap the predicted test labels in a DataFrame for display.
predictions = pd.DataFrame(np.transpose(test_prediction))
predictions

In [None]:
# Loading target data: the two CSV exports of tweets to be classified.
actual_data, actua_data_big = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/Colab Notebooks/data/final_data.csv',
        '/content/drive/My Drive/Colab Notebooks/data/final_data_1.csv',
    )
)

In [None]:
# Stack both exports, keep one row per (candidate, tweet) pair, then renumber
# the rows. reset_index() keeps the previous row labels as an 'index' column.
data_actual = (
    pd.concat([actua_data_big, actual_data])
    .drop_duplicates(subset=['candidate_user_name', 'tweet'])
    .reset_index()
)

In [None]:
# Tokenize the target tweets into padded token-id tensors.
target_tweets = data_actual['tweet']
predict_input, _ = tensor_inputs(target_tweets)

In [None]:
# Predict labels for the target tweets. The model is run on `predict_input`
# here; reusing `test_pred` would only repeat the held-out test-set labels.
# The attention mask is rebuilt from the token ids: every position holding the
# tokenizer's pad id is padding (0), everything else is a real token (1).
target_mask = tf.cast(tf.not_equal(predict_input, tokenizer.pad_token_id), tf.int32)
target_pred = bert.predict([predict_input, target_mask])

miso_prediction = tf.nn.softmax(target_pred.logits)
miso_prediction = tf.argmax(miso_prediction, axis=1).numpy()
miso_prediction

In [None]:
# Attach the predicted label of each target tweet as a new column.
# `miso_prediction` is the label array computed for the target tweets
# (`tf_prediction` was never defined in this notebook).
data_actual['Predicted'] = miso_prediction