<a href="https://colab.research.google.com/github/Rhitabrat/MAMI/blob/main/Text%20Embedding/bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Note: Turn on GPU for this notebook.*

## Dependencies and Libraries

In [2]:
# download the required pacakges
!pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 6.9 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.7.3


In [3]:
# import the libraries
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd

## Load the data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load the data
### data_path is the path of the csv file

data_path = "/content/drive/MyDrive/PSU/NLP Lab/MAMI/TRAIN/CSVs/training_original.csv"
df = pd.read_csv(data_path, delimiter="\t")

# sort the data by file_name
def remove_file_extension(file_name):
    return(int(file_name[:-4]))

df["filename"] = df.file_name
df.filename = df.filename.map(remove_file_extension)
df = df.sort_values('filename')
df = df.drop('file_name', 1)

df.head(5)

Unnamed: 0,misogynous,shaming,stereotype,objectification,violence,Text Transcription,filename
0,0,0,0,0,0,Milk Milk.zip,1
2557,0,0,0,0,0,-What are you doing? -you told me to satanize ...,2
3458,0,0,0,0,0,imgflip.com ME 1254 NEW BUGS AFTER CHANGES BUG...,3
4360,0,0,0,0,0,Bedroom Kitchen Bathroom Bron memes storage,4
5311,0,0,0,0,0,WAKEUP EARLY FREELANCERS,5


In [6]:
# caption_data_path = "/content/drive/Shareddrives/team_MAMI/MAMI/TRAIN/CSVs/captions_mscoco_trained.csv"
# df_caption = pd.read_csv(caption_data_path, header=None, names=["caption"])
# # df_caption = pd.read_csv(caption_data_path, header=None, names=["image_name", "caption"])
# # df_caption = df_caption.sort_values('image_name')
# df_caption

In [7]:
# cap = df_caption.caption.to_list()
# df["caption"] = cap

In [8]:
# df_req = df[['Text Transcription', 'caption']].copy()
# df_req

In [9]:
# concatenated = df_req['Text Transcription'] + df_req['caption']

## Data Summary and Preprocessing

In [10]:
# checking if the classes are balanced
df['misogynous'].value_counts()

1    5000
0    5000
Name: misogynous, dtype: int64

The classes are evenly distributed (i.e. balanced class).

In [11]:
# splitting the data into train and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Text Transcription'], df['misogynous'], stratify=df['misogynous'], test_size=0.25)

## BERT Implementation

In [6]:
# downloading the pre-trained BERT model from tfhub
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

Functional model below.

In [41]:
# initializing BERT layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text') # input layer
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# initializing NN layers

### The Dropout layer randomly sets input units to 0 with a frequency of rate at 
### each step during training time, which helps prevent overfitting
### Source: https://keras.io/api/layers/regularization_layers/dropout/
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output']) # 0.1% neuron is dropped out randomly
l = tf.keras.layers.Dense(100, activation='tanh')(l)
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l) # 1 neuron tells if the text is misogynous or not (i.e. 1 or 0)

model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [42]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                    

## Model Training

In [43]:
# # accuracy metrics
# METRICS = [
#       tf.keras.metrics.BinaryAccuracy(name='accuracy'),
#       tf.keras.metrics.Precision(name='precision'),
#       tf.keras.metrics.Recall(name='recall')
# ]

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001, name='Adam'),
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [45]:
model.fit(df['Text Transcription'], df['misogynous'], epochs=20, batch_size=32, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f6479e67890>

In [None]:
model.evaluate(X_test, y_test)

In [25]:
# paths
csv_path_test = '/content/drive/MyDrive/PSU/NLP Lab/MAMI/TEST/CSV/Test.csv'

#load data
test_df = pd.read_csv(csv_path_test, sep='\t')

In [46]:
predictions = model.predict(test_df['Text Transcription'], batch_size=32)
predictions = predictions.reshape(predictions.shape[0])
pred = predictions >= 0.5
pred = list(map(int, pred)) #true/false to 1/0

predictions_db = pd.DataFrame(data=test_df['file_name'])
predictions_db['misogynist'] = pred

In [47]:
predictions_db

Unnamed: 0,file_name,misogynist
0,15236.jpg,1
1,15805.jpg,1
2,16254.jpg,0
3,16191.jpg,1
4,15952.jpg,1
...,...,...
995,15591.jpg,1
996,15049.jpg,1
997,15363.jpg,1
998,15199.jpg,1


In [48]:
predictions_db.to_csv("/content/drive/MyDrive/PSU/NLP Lab/MAMI/answer_1.txt", index=False, sep='\t', header=False)

In [39]:
test_pred = (predictions_db.misogynist * neg)

pred = test_pred >= 0.5
pred = list(map(int, pred)) #true/false to 1/0

predictions_db = pd.DataFrame(data=test_df['file_name'])
predictions_db['misogynist'] = pred
predictions_db

Unnamed: 0,file_name,misogynist
0,15236.jpg,0
1,15805.jpg,1
2,16254.jpg,0
3,16191.jpg,0
4,15952.jpg,0
...,...,...
995,15591.jpg,1
996,15049.jpg,0
997,15363.jpg,0
998,15199.jpg,0


In [None]:
!pip install transformers

In [29]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
 
 
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
sent_model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# sent_model.save_pretrained(MODEL)



Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

In [36]:
text = "Good night 😊"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = sent_model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores

array([0.00760988, 0.1458123 , 0.84657794], dtype=float32)

In [30]:
df_scores = []
for text in test_df['Text Transcription']:
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    output = sent_model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    df_scores.append(scores)

In [31]:
neu=[]
neg=[]
pos=[]

for each in df_scores:
    neg.append(each[0])
    neu.append(each[1])
    pos.append(each[2])