*Note: Turn on GPU for this notebook.*

## Dependencies and Libraries

In [None]:
# download the required pacakges
!pip install tensorflow_text

In [2]:
# import the libraries
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd

## Load the data

In [5]:
# Load the data
### data_path is the path of the csv file
### The file lives on Google Drive, so Drive is mounted right here: on a fresh
### kernel ("Restart and run all") nothing exists under /content/drive until
### the mount has happened, and read_csv would fail.
from google.colab import drive

drive.mount('/content/drive')

data_path = "/content/drive/Shareddrives/team MAMI/MAMI/TRAIN/training.csv"
# The file is tab-separated even though its extension is .csv.
df = pd.read_csv(data_path, delimiter="\t")
# Show the first rows as a quick sanity check of the columns.
df.head(5)

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,Text Transcription
0,1.jpg,0,0,0,0,0,Milk Milk.zip
1,10.jpg,1,0,0,0,1,"ROSES ARE RED, VIOLETS ARE BLUE IF YOU DON'T S..."
2,1000.jpg,0,0,0,0,0,BREAKING NEWS: Russia releases photo of DONALD...
3,10000.jpg,0,0,0,0,0,MAN SEEKING WOMAN Ignad 18 O
4,10006.jpg,0,0,0,0,0,Me explaining the deep lore of. J.R.R. Tolkein...


In [4]:
# Make Google Drive available under /content/drive. The data-loading cell reads
# its CSV from this mount point, so Drive has to be mounted before that cell runs.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


## Data Summary and Preprocessing

In [6]:
# Count how many rows carry each value of the binary `misogynous` label to see
# whether the two classes are balanced.
df.loc[:, 'misogynous'].value_counts()

1    5000
0    5000
Name: misogynous, dtype: int64

The two classes are evenly distributed (5,000 examples each), i.e. the dataset is balanced.

In [7]:
# Split the data into a train set (75%) and a test set (25%).
# Features are the meme text transcriptions; the target is the binary
# `misogynous` label. Stratifying on the label keeps the class ratio the same
# in both splits.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split identical on every run; without it each
# execution trains and evaluates on a different partition of the data.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text Transcription'],
    df['misogynous'],
    stratify=df['misogynous'],
    test_size=0.25,
    random_state=42,
)

## BERT Implementation

In [8]:
# Fetch the pre-trained BERT building blocks from TF Hub as Keras layers.

# Preprocessing layer: turns raw strings into the encoder's input tensors.
bert_preprocess = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)

# Encoder layer: the uncased English BERT model (L-12 / H-768 / A-12 per its handle).
bert_encoder = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
)

The model below is built with the Keras functional API.

In [9]:
# BERT layers: raw text -> BERT preprocessing -> BERT encoder
text_input = tf.keras.layers.Input(
    shape=(),          # one scalar string per example
    dtype=tf.string,
    name='text',
)
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head on top of the encoder's pooled output

### The Dropout layer randomly sets input units to 0 with a frequency of `rate`
### at each step during training time, which helps prevent overfitting
### Source: https://keras.io/api/layers/regularization_layers/dropout/
# rate=0.1 means 10% of the units are dropped at random during training
l = tf.keras.layers.Dropout(rate=0.1, name="dropout")(outputs['pooled_output'])
# a single sigmoid unit: its output says whether the text is misogynous (1) or not (0)
l = tf.keras.layers.Dense(units=1, activation='sigmoid', name="output")(l)

# Tie input and output together into one Keras model
model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [10]:
# Print the model's layers with their output shapes, parameter counts and connections.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

## Model Training

In [11]:
# Metrics tracked in addition to the loss, instantiated in this order:
# binary accuracy, precision, recall.
METRICS = [
    metric_class(name=metric_name)
    for metric_class, metric_name in (
        (tf.keras.metrics.BinaryAccuracy, 'accuracy'),
        (tf.keras.metrics.Precision, 'precision'),
        (tf.keras.metrics.Recall, 'recall'),
    )
]

# Binary cross-entropy pairs with the single sigmoid output unit of the model.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)

In [12]:
# Train on the training split for 50 epochs; the returned History object is
# the cell's output.
model.fit(
    x=X_train,
    y=y_train,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f89f2fa8810>

In [13]:
# Score the trained model on the held-out test split. The result lists the loss
# followed by the compiled metrics: accuracy, precision, recall.
model.evaluate(
    x=X_test,
    y=y_test,
)



[0.5654698014259338,
 0.7188000082969666,
 0.7145097851753235,
 0.7287999987602234]