<a href="https://colab.research.google.com/github/TeamMAMI/MAMI/blob/main/Text%20Embedding/bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Note: Turn on GPU for this notebook.*

## Dependencies and Libraries

In [None]:
# download the required packages
!pip install tensorflow_text

In [2]:
# import the libraries
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd

## Load the data

In [3]:
# Mount Google Drive at /content/drive so the CSV paths used below resolve.
from google.colab import drive
# NOTE(review): `_mount` is underscore-prefixed (private by Python convention);
# the documented entry point is `drive.mount` -- confirm the private variant is intentional.
drive._mount('/content/drive')

Mounted at /content/drive


In [14]:
# Read the training annotations.
# `data_path` points at the tab-separated CSV file on the mounted drive.

data_path = "/content/drive/Shareddrives/team_MAMI/MAMI/TRAIN/CSVs/training_original.csv"
df = pd.read_csv(data_path, sep="\t")

# sort the data by file_name
def remove_file_extension(file_name):
    """Return the numeric part of an image file name as an int.

    The extension is split off with ``os.path.splitext`` rather than by
    slicing a fixed four characters, so extensions of any length
    (".jpg", ".jpeg", ...) are handled.

    Args:
        file_name: File name such as "123.jpg".

    Returns:
        The stem of the file name converted to ``int`` (e.g. 123).

    Raises:
        ValueError: If the stem is not a valid integer literal.
    """
    import os  # local import keeps the cell self-contained

    stem, _extension = os.path.splitext(file_name)
    return int(stem)

# Build an integer sort key from the file name, so rows sort numerically
# (1, 2, 3, ...) rather than lexicographically.
df["filename"] = df.file_name.map(remove_file_extension)
df = df.sort_values('filename')
# Drop the original string column. The column is named by keyword because
# passing `axis` positionally (`drop('file_name', 1)`) was deprecated in
# pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns='file_name')

df.head(5)

Unnamed: 0,misogynous,shaming,stereotype,objectification,violence,Text Transcription,filename
0,0,0,0,0,0,Milk Milk.zip,1
2557,0,0,0,0,0,-What are you doing? -you told me to satanize ...,2
3458,0,0,0,0,0,imgflip.com ME 1254 NEW BUGS AFTER CHANGES BUG...,3
4360,0,0,0,0,0,Bedroom Kitchen Bathroom Bron memes storage,4
5311,0,0,0,0,0,WAKEUP EARLY FREELANCERS,5


In [15]:
# Read the generated image captions: a header-less CSV loaded into a single "caption" column.
caption_data_path = "/content/drive/Shareddrives/team_MAMI/MAMI/TRAIN/CSVs/captions_mscoco_trained.csv"
df_caption = pd.read_csv(caption_data_path, header=None, names=["caption"])
# Disabled alternative for a two-column file (image name + caption), sorted by image name:
# df_caption = pd.read_csv(caption_data_path, header=None, names=["image_name", "caption"])
# df_caption = df_caption.sort_values('image_name')
# NOTE(review): with no image name column, the rows are presumably already in the same
# order as the sorted `df` -- confirm before relying on the positional join below.
df_caption

Unnamed: 0,caption
0,box with a baked donuts sitting on a bed
1,a person dressed up as he is sitting on a skat...
2,two men that has its way with a pile of plants...
3,a picture of a picture of a picture with a bla...
4,a man and a chocolate and blue utensils
...,...
9995,this is a brown cat standing alone in the day
9996,a street smiling as he stands next to each other
9997,a young boy sitting on a skateboard
9998,there is a kitchen oven with a cone


In [16]:
# Attach the captions to `df` as a new column. A plain list is assigned, so the
# values are matched to rows by position, not by index label.
cap = df_caption["caption"].tolist()
df["caption"] = cap

In [17]:
# Keep only the two text columns; copy so later edits do not touch `df`.
df_req = df.loc[:, ['Text Transcription', 'caption']].copy()
df_req

Unnamed: 0,Text Transcription,caption
0,Milk Milk.zip,box with a baked donuts sitting on a bed
2557,-What are you doing? -you told me to satanize ...,a person dressed up as he is sitting on a skat...
3458,imgflip.com ME 1254 NEW BUGS AFTER CHANGES BUG...,two men that has its way with a pile of plants...
4360,Bedroom Kitchen Bathroom Bron memes storage,a picture of a picture of a picture with a bla...
5311,WAKEUP EARLY FREELANCERS,a man and a chocolate and blue utensils
...,...,...
9995,WAITING FOR THE END OF THE COVID imgflip.com,this is a brown cat standing alone in the day
9996,SMART WOMEN ARE AROUND imgflip.com,a street smiling as he stands next to each other
9997,GOOD GIRLS ARE BEHIND THE CORNER imgflip.com,a young boy sitting on a skateboard
9998,COOKING FOR MY WIFE imgflip.com,there is a kitchen oven with a cone


In [18]:
# Join the meme transcription and the image caption into one input string.
# The space separator stops the last word of the transcription from fusing
# with the first word of the caption (e.g. "Milk.zipbox with ...").
concatenated = df_req['Text Transcription'] + ' ' + df_req['caption']

## Data Summary and Preprocessing

In [19]:
# count the examples per label to see whether the two classes are balanced
df.misogynous.value_counts()

1    5000
0    5000
Name: misogynous, dtype: int64

The two classes are evenly distributed (5,000 examples each), i.e. the dataset is balanced.

In [20]:
# Split the data into train and test sets (75% / 25%), stratified on the label
# so both sets keep the same class ratio.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    concatenated,
    df['misogynous'],
    stratify=df['misogynous'],
    test_size=0.25,
    random_state=42,  # fixed seed: the same split (and comparable metrics) on every run
)

## BERT Implementation

In [21]:
# Fetch the pre-trained BERT layers from TF Hub: the text preprocessing layer
# and the uncased encoder (L-12, H-768, A-12 per the handle).
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

The model below is built with the Keras Functional API.

In [22]:
# BERT part of the graph: raw strings -> preprocessed inputs -> encoder outputs
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')  # scalar string input
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head on top of the encoder's pooled output.
#
# Dropout randomly sets input units to 0 at the given rate on each training
# step, which helps prevent overfitting. A rate of 0.1 drops 10% of the units.
# Source: https://keras.io/api/layers/regularization_layers/dropout/
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
# A single sigmoid unit: its output says whether the text is misogynous (1) or not (0).
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")

l = output_layer(dropout_layer(outputs['pooled_output']))

model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [23]:
# Print the layer-by-layer structure of the model built above.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

## Model Training

In [24]:
# Metrics tracked during training and evaluation, in this order:
# binary accuracy, precision, recall.
METRICS = [
    metric_cls(name=label)
    for label, metric_cls in (
        ('accuracy', tf.keras.metrics.BinaryAccuracy),
        ('precision', tf.keras.metrics.Precision),
        ('recall', tf.keras.metrics.Recall),
    )
]

# Binary classification set-up: Adam optimizer with binary cross-entropy loss.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)

In [25]:
# Train on the training split for 10 epochs.
model.fit(x=X_train, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3b519fc110>

In [None]:
# Score the held-out test split; the result lists the loss followed by the
# compiled metrics (accuracy, precision, recall).
model.evaluate(x=X_test, y=y_test)



[0.6802049875259399, 0.6000000238418579, 1.0, 0.09090909361839294]