### Creating a label generator class

Purpose of this Notebook :
*  Reuse the labelling technique for every dataset
*  Easy to test for any extensions/changes/improvements

**Labelling Technique Credits : Harshita**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
# ensure cleaning.py is in the same directory
# from cleaning import clean_text
import nltk
nltk.download()
from nltk import word_tokenize
from nltk.util import ngrams  
from heapq import nlargest
import collections
import re

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all
    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Package abc is already up-to-date!
       | Downloading package alpino to /root/nltk_data...
       |   Package alpino is already up-to-date!
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Package biocreative_ppi is already up-to-date!
       | Downloading package brown to /root/nltk_data...
       |   Package brown is already up-to-date!
       | Downloading package brown_tei to /root/nltk_data...
       |   Package brown_tei is already up-to-date!
       | Downloading package cess_cat to /root/nltk_data...
       |   Package cess_cat is 

### Labelse generator class. One object of it can be used per dataset to generate labels for it

In [0]:
class label_generator:
  def __init__(self,model,dataset,data_col,no_top_words):
    self.data = dataset
    self.col = data_col
    self.model = model
    self.no_top_words = no_top_words

  def preprocess(self):
    self.clean_text()
    self.all_text = ' '.join(str(word) for word in self.data[self.col].values) 
    self.tokens = word_tokenize(self.all_text)
    self.vectorizer = TfidfVectorizer(max_features = 5000, ngram_range=(1,2))
    self.tf = self.vectorizer.fit_transform(self.data[self.col]).toarray()
    self.tf_feature_names = self.vectorizer.get_feature_names()
  
  def remove_URL(self,text):
    self.url_pattern = re.compile(r'https?://\S+|www\.\S+|pic\.twitter\S+')
    return self.url_pattern.sub(r'', text)


  def get_topics(self):
    self.preprocess()
    self.topic_dict = {}
    self.model.fit(self.tf)
    for topic_idx, topic in enumerate(self.model.components_):
      self.topic_dict["Topic %d words" % (topic_idx)] = ['{}'.format(self.tf_feature_names[i]) 
                                                     for i in topic.argsort()[:-self.no_top_words - 1:-1]]

    return pd.DataFrame(self.topic_dict)

  def clean_text(self):
    # print("Cleaning text and adding column 'processed_text'")
    self.data['processed_text'] = self.data[self.col]
    # Converting to lower case
    self.data['processed_text'] = self.data['processed_text'].str.lower()
    # Removing /n characters
    self.data['processed_text'] = self.data['processed_text'].apply(lambda x: x.replace('\n', ' '))
    # Removing urls
    self.data['processed_text'] = self.data['processed_text'].apply(lambda text: self.remove_URL(text))
    self.col = 'processed_text'
    self.data.drop_duplicates(subset=self.col,inplace=True)
    self.data.reset_index(drop=True,inplace=True)

  def get_binary_labeled_dataset(self):
    self.topics = self.get_topics()
    self.dv_topics = self.topics.columns[0]
    self.other_topics = self.topics.columns[1]
    self.data['label'] = ['']*len(self.data)
    t1list = ['husband','wife','daughter','father','relative','mother','sister',
              'uncle','grandfather','neighbour','parent','child','cousin','inlaw','in-law','boyfriend'
              ,'marital','domestic','liquor','gay','ex','maid','housemaid','partner']

    t2list = ['abuse','violence','domestic','sexual','harass','assault','bitch', \
              'torture','rape','beat','dowry','porn','acidattack','lockdown', \
              'molest','metoo','tiktok','fuck','mental','physical','threat', \
              'blackmail','body','opress','force','bruise','slut','scar', \
              'misogyny','toxic','cheat','verbal','helpline','aware','survey', \
              'initiative','pandemic','commission','work','tiktok','bantiktok', \
              'content','tik','tok','action']

    for i in range(len(self.data)):  
      if any(word in self.data[self.col][i] for word in t1list) and self.data.label[i] == '':
        self.data.label[i] = 'DV'
        
      elif any(word in self.data[self.col][i] for word in t2list) and self.data.label[i] == '':
        self.data.label[i] = 'NO_DV'


    return self.data

In [0]:
def get_labelled_df(PATH,col,number_of_topics,no_of_top_words):
  data = pd.read_csv(PATH)
  model = LatentDirichletAllocation(n_components = number_of_topics, random_state = 42)
  labelizer = label_generator(model=model,dataset=data,data_col=col,no_top_words=no_of_top_words)
  
  return labelizer.get_binary_labeled_dataset(),labelizer.get_topics(),labelizer.topics

## To create labels and get the topics used to create labels, call the function 

    get_labelled_df(PATH_TO_FILE,NO_OF_TOPICS,NO_OF_TOP_WORDS)

*  PATH_TO_FILE : To retrieve the dataset
*  NO_OF_TOPICS : Number of classes we need (In this case 2 i.e, DV and NO_DV)
*  NO_OF_TOP_WORDS : Number of words considered to create each label


In [4]:
df_twitter,_,topics = get_labelled_df("/content/drive/My Drive/Omdena/twitter/more-tweets-extracted.csv",'text',2,20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
df_twitter

Unnamed: 0,user_id,screen_name,tweet_id,tweet_url,timestamp,text,hashtags,has_media,img_urls,video_url,user_location,processed_text,label
0,1139435031204548608,zkwnsaari,1262868325803024389,/zkwnsaari/status/1262868325803024389,2020-05-19 22:10:46,"Cornering men with the phrase ""sexual harassme...",[],0.0,[],,"Perak, Malaysia","cornering men with the phrase ""sexual harassme...",DV
1,1262662724628238338,anuglywoman1,1262856464651243525,/anuglywoman1/status/1262856464651243525,2020-05-19 21:23:38,Sexual harassment..is to make someone fucked u...,[],0.0,[],,Pain,sexual harassment..is to make someone fucked u...,DV
2,15199808,Vidyut,1262838343404044296,/Vidyut/status/1262838343404044296,2020-05-19 20:11:38,Fellow asking about abuse women face online/st...,[],0.0,[],,India,fellow asking about abuse women face online/st...,NO_DV
3,257394747,PramodChturvedi,1262824245517672449,/PramodChturvedi/status/1262824245517672449,2020-05-19 19:15:36,Case registered against employee of private co...,"['AndhraPradesh', 'Krishna']",0.0,[],,"Hyderabad, New Delhi, Ballia",case registered against employee of private co...,DV
4,2247560024,FeminismInIndia,1262816952164196355,/FeminismInIndia/status/1262816952164196355,2020-05-19 18:46:38,Institutional Failures & The Increasing Relian...,[],0.0,[],,India,institutional failures & the increasing relian...,DV
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1690,857761954018058241,satpal13414,1212326306739703808,/satpal13414/status/1212326306739703808,2020-01-01 10:54:50,#स्वर्ण_युग\nGolden Time Is Coming\nThe other ...,['स'],1.0,['https://pbs.twimg.com/media/ENMMVmYU0AAim4b....,,"Rewari, India",#स्वर्ण_युग golden time is coming the other wo...,DV
1691,1156956207640244224,Mohit__solanki,1212256040089075712,/Mohit__solanki/status/1212256040089075712,2020-01-01 06:15:37,Golden Time Is Coming\nThe other woman and gir...,['स'],0.0,[],,"Firozpur, India",golden time is coming the other woman and girl...,DV
1692,838337330322751488,BRAJBHANDAS1234,1212234940441493505,/BRAJBHANDAS1234/status/1212234940441493505,2020-01-01 04:51:46,#स्वर्ण_युग#स्वर्ण_युग\nGolden Time Is Coming\...,"['स', 'स']",1.0,['https://pbs.twimg.com/media/ENK5O_DUwAEnmUY....,,"Gandhidham, India",#स्वर्ण_युग#स्वर्ण_युग golden time is coming t...,DV
1693,972168388490379264,NiteshP82110245,1212212166549331969,/NiteshP82110245/status/1212212166549331969,2020-01-01 03:21:16,#HeavenOnEarth_By_SaintRampalJi\n Time Is Comi...,['HeavenOnEarth_By_SaintRampalJi'],1.0,['https://pbs.twimg.com/media/ENKkdxCVAAAxcxF....,,"मध्य प्रदेश, भारत",#heavenonearth_by_saintrampalji time is comin...,DV


In [6]:
topics

Unnamed: 0,Topic 0 words,Topic 1 words
0,pic,the
1,bantiktokinindia,of
2,bantiktok,to
3,abuse,and
4,attack,sexual
5,acid attack,in
6,of,is
7,sexual,for
8,promoting,harassment
9,acid,on


In [7]:
df_reddit,_,topics = get_labelled_df("/content/drive/My Drive/Omdena/reddit_domestic_violence.csv",'title',2,20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
df_reddit
df_reddit['label'].value_counts()

         193
NO_DV    134
DV       115
Name: label, dtype: int64

In [9]:
df_extr_twitter,_,topics = get_labelled_df("/content/drive/My Drive/Omdena/twitter/Extracted-data-twitter__.csv",'text',2,20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
df_extr_twitter['label'].value_counts()

DV       529
          50
NO_DV      6
Name: label, dtype: int64

In [11]:
df_me_too,_,topics = get_labelled_df("/content/drive/My Drive/Omdena/twitter/MeTooIndia.csv",'text',2,20)
df_me_too['label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


DV       242
         176
NO_DV    166
Name: label, dtype: int64

In [12]:
df_sexual_violence,_,topics = get_labelled_df("/content/drive/My Drive/Omdena/twitter/sexual_violence_twitter.csv",'text',2,20)
df_sexual_violence['label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


DV    3092
Name: label, dtype: int64

In [0]:
def dv_nodv_frame(df,col):
  return df.loc[df['label']=='DV',[col,'label']],df.loc[df['label']=='NO_DV',[col,'label']]
  

In [0]:
df_reddit['text'] = df_reddit['title']

In [0]:
dv_twitter,nodv_twitter = dv_nodv_frame(df_twitter,'text')
dv_reddit,nodv_reddit = dv_nodv_frame(df_reddit,'text')
dv_extr_twitter,nodv_extr_twitter = dv_nodv_frame(df_extr_twitter,'text')
dv_metoo,nodv_metoo = dv_nodv_frame(df_me_too,'text')
dv_sexual_violence,nodv_sexual_violence = dv_nodv_frame(df_sexual_violence,'text')

labelled_datasets = [dv_twitter,nodv_twitter,dv_reddit,nodv_reddit,dv_extr_twitter,nodv_extr_twitter,
                     dv_metoo,nodv_metoo,dv_sexual_violence,nodv_sexual_violence]

df = pd.concat(labelled_datasets)

In [16]:
df

Unnamed: 0,text,label
0,"Cornering men with the phrase ""sexual harassme...",DV
1,Sexual harassment..is to make someone fucked u...,DV
3,Case registered against employee of private co...,DV
4,Institutional Failures & The Increasing Relian...,DV
5,Cant believe women are going to companies that...,DV
...,...,...
3087,"omg taccodt, someone just ask my sexual orient...",DV
3088,"kalau dari ""gerbang akademisi""nya tegas ya bi...",DV
3089,Tell me again how hanging rapists is supposed ...,DV
3090,The never ending debates about sexual harassme...,DV


#Classifier

In [17]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
!pip install transformers
import transformers

from tokenizers import BertWordPieceTokenizer

Using TensorFlow backend.
  import pandas.util.testing as tm




In [18]:
try:
    # In case we use TPUs
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [19]:
MAX_LEN = 512
print(MAX_LEN)

512


In [0]:
df['label'] = df['label'].apply(lambda x: 0 if x=='NO_DV' else 1)

In [21]:
df['label']

0       1
1       1
3       1
4       1
5       1
       ..
3087    1
3088    1
3089    1
3090    1
3091    1
Name: label, Length: 5979, dtype: int64

In [0]:
xtrain,xtest,ytrain,ytest = train_test_split(df['text'],df['label'],shuffle=True,test_size=0.3,random_state=42)

In [0]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=MAX_LEN):
    """
    Encoder for encoding the text into sequence of integers for BERT Input
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [0]:
EPOCHS = 3
BATCH_SIZE = 16

In [25]:
# First load the real tokenizer
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
# Save the loaded tokenizer locally
tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False, wordpieces_prefix=##)

In [26]:
AUTO = tf.data.experimental.AUTOTUNE

x_train = fast_encode(xtrain.astype(str), fast_tokenizer, maxlen=MAX_LEN)
x_valid = fast_encode(xtest.astype(str), fast_tokenizer, maxlen=MAX_LEN)

y_train = ytrain
y_valid = ytest


100%|██████████| 17/17 [00:00<00:00, 26.00it/s]
100%|██████████| 8/8 [00:00<00:00, 28.88it/s]


In [27]:
print(x_train)

[[  101 86643 73386 ...     0     0     0]
 [  101 46100   122 ...     0     0     0]
 [  101 45459 11165 ...     0     0     0]
 ...
 [  101   108 22150 ...     0     0     0]
 [  101   152 26134 ...     0     0     0]
 [  101   108 79601 ...     0     0     0]]


In [0]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

In [0]:
def build_model(transformer, max_len=511):
    """
    function for training the BERT model
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [30]:
%%time
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-multilingual-cased')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist ((None, 512, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________
CPU times: user 4.15 s, sys: 581 ms, total: 4.73 s
Wall time: 5.96 s


In [31]:
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [49]:
sample = pd.Series(['Emergency! Need to get sister from a abusive home in hyderabad.'])
sample = fast_encode(sample.astype(str), fast_tokenizer, maxlen=MAX_LEN)

100%|██████████| 1/1 [00:00<00:00, 1071.07it/s]


In [50]:
sample

array([[   101,  56308,    106,  30255,  10114,  15329,  19806,  10188,
           169, 104735,  37413,  11816,  10106,  56888,  12015,  24623,
           119,    102,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0, 

In [51]:
model.predict([sample])

array([[0.9778114]], dtype=float32)

In [62]:
sample = pd.Series(['There is a thin line between talking about fear in public spaces and criminalizing working class, Dalit, and poor communities.'])
sample = fast_encode(sample.astype(str), fast_tokenizer, maxlen=MAX_LEN)
model.predict([sample])

100%|██████████| 1/1 [00:00<00:00, 169.89it/s]


array([[0.10334904]], dtype=float32)

First text is of class DV with a probability of 0.97
Second one is DV related with a probability 0.103 Hence, it's NON_DV