<a href="https://colab.research.google.com/github/Tanisha2626/Harasment_level_detection/blob/main/Harasment_level_Detection_Hindi_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/huggingface/transformers
!pip install /content/transformers/
!pip install sentencepiece

In [None]:
import numpy as np
np.random.seed(42)
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

### TPU set-up

In [None]:
try:
    # TPU detection. TPUClusterResolver() needs no arguments when the runtime
    # exposes the TPU address itself; it raises ValueError when no TPU is found.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Prefer the stable tf.distribute.TPUStrategy; fall back to the deprecated
    # experimental alias only on TensorFlow versions that lack the stable class.
    tpu_strategy_cls = (
        getattr(tf.distribute, 'TPUStrategy', None)
        or tf.distribute.experimental.TPUStrategy
    )
    strategy = tpu_strategy_cls(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

### Downloading Datasets

In [None]:
# Fetch the repository that holds the trac2_hin_{train,dev,test}.csv files read below.
!git clone https://github.com/Tanisha2626/Harasment_level_detection

In [None]:
# Read the training CSV and tag every row with the split it came from,
# then report the columns, the label distribution of both sub-tasks and the size.
raw_train_df = pd.read_csv("Harasment_level_detection/trac2_hin_train.csv").assign(split='train')
print(raw_train_df.columns)
for task_column in ('Sub-task A', 'Sub-task B'):
    print(raw_train_df[task_column].value_counts())
print(f"Size of 'train' split: {len(raw_train_df)}")

In [None]:
# Read the dev CSV and tag every row with the split it came from,
# then report the columns, the label distribution of both sub-tasks and the size.
raw_dev_df = pd.read_csv("Harasment_level_detection/trac2_hin_dev.csv").assign(split='dev')
print(raw_dev_df.columns)
for task_column in ('Sub-task A', 'Sub-task B'):
    print(raw_dev_df[task_column].value_counts())
print(f"Size of 'dev' split: {len(raw_dev_df)}")

In [None]:
# Read the test CSV; the bare expression on the last line displays the frame.
test_df =  pd.read_csv("Harasment_level_detection/trac2_hin_test.csv")
test_df

In [None]:
# Concatenate the dev and train frames (dev rows first) into one frame and
# renumber the index from 0; the 'split' column still records each row's origin.
data_df = pd.concat([raw_dev_df, raw_train_df], ignore_index= True)
data_df

### Number of samples per label

In [None]:
# Overall size of the combined frame, followed by the label distribution of each sub-task.
print(f'Total dev + train size = {len(data_df)}\n')
for task_column in ('Sub-task A', 'Sub-task B'):
    print(data_df[task_column].value_counts(), '\n')

### Label encoder for Sub-task A

In [None]:
# Integer class ids for the Sub-task A labels; the id doubles as the index into
# the model's softmax output.
# NOTE(review): presumably NAG/OAG/CAG = non-/overtly/covertly aggressive (TRAC-2 naming) — confirm.
task_a_label_dict = {'NAG':0, 'OAG':1, 'CAG':2}
print(task_a_label_dict)

In [None]:
# Build the Sub-task A frame: keep the id, text, label and split columns, give
# text/label generic names, and encode the string labels as integer class ids.
# Columns are renamed with DataFrame.rename rather than by writing into
# `columns.values`, which mutates the Index buffer in place and can leave
# pandas' label lookups out of sync with the displayed names.
data_df_task_a = (
    data_df[['ID', 'Text', 'Sub-task A', 'split']]
    .rename(columns={'Text': 'text', 'Sub-task A': 'label'})
    .assign(label=lambda frame: frame['label'].map(task_a_label_dict))
)
data_df_task_a

In [None]:
# Each entry pairs a heading with the column whose value counts are printed under it.
task_a_reports = [
    ("Num samples per class", data_df_task_a.label),
    ("\nNum samples per split", data_df_task_a.split),
    ("\nLabel counts in dev split", data_df_task_a[data_df_task_a.split=='dev'].label),
    ("\nLabel counts in train split", data_df_task_a[data_df_task_a.split=='train'].label),
]
for heading, column in task_a_reports:
    print(heading)
    print(column.value_counts())

### Label encoder for Sub-task B

In [None]:
# Integer class ids for the Sub-task B labels: NGEN (not gender targeted) -> 0,
# GEN (gender targeted) -> 1. The id doubles as the index into the softmax output.
task_b_label_dict = {'NGEN':0, 'GEN':1}
print(task_b_label_dict)

In [None]:

# Build the Sub-task B frame: keep the id, text, label and split columns, give
# text/label generic names, and encode the string labels as integer class ids.
# Columns are renamed with DataFrame.rename rather than by writing into
# `columns.values`, which mutates the Index buffer in place and can leave
# pandas' label lookups out of sync with the displayed names.
data_df_task_b = (
    data_df[['ID', 'Text', 'Sub-task B', 'split']]
    .rename(columns={'Text': 'text', 'Sub-task B': 'label'})
    .assign(label=lambda frame: frame['label'].map(task_b_label_dict))
)
data_df_task_b

In [None]:
# Each entry pairs a heading with the column whose value counts are printed under it.
task_b_reports = [
    ("Num samples per class", data_df_task_b.label),
    ("\nNum samples per split", data_df_task_b.split),
    ("\nLabel counts in dev split", data_df_task_b[data_df_task_b.split=='dev'].label),
    ("\nLabel counts in train split", data_df_task_b[data_df_task_b.split=='train'].label),
]
for heading, column in task_b_reports:
    print(heading)
    print(column.value_counts())

# Sub-task B

In [None]:
# Partition the Sub-task B frame by its 'split' column. groupby sorts the group
# keys, so grps[0] holds the 'dev' rows and grps[1] the 'train' rows; the cells
# below rely on these positions.
gb = data_df_task_b.groupby('split')
grps = [gb.get_group(x) for x in gb.groups]

In [None]:
grps[1]

Unnamed: 0,ID,text,label,split
997,C4.131,Bollywood film dekhne ke samay logic ghar mein...,0,train
998,C4.638,Chutiya movie...,0,train
999,C38.598,Us jaat bnde ka khene ka matlab tha mar daluga...,0,train
1000,C4.2101.1,@Feminism Is CANCER *un feminist yeh sahi hai ...,0,train
1001,C29.14.2,Amrit Anand अब तो जुड़े ही है उनको बोलो जुड़ने,0,train
...,...,...,...,...
4976,C38.455,Asexual h.. bisexual... homosexual... bhai ase...,0,train
4977,C4.203,Video pura dekne ke pahile hi mai bhai ke vide...,0,train
4978,C45.709,konsa place hai bhai ...nam bolo,0,train
4979,C4.420.1,Kuch zada hi likh diya 🙄,0,train


### Addressing class imbalance in the gender-targeted classification dataset

In [None]:
# Build a more balanced Sub-task B training set: every label-1 (GEN) training
# row plus a random sample of up to 600 label-0 (NGEN) training rows.
train_data = grps[1]
gb = train_data.groupby('label')
# Group keys are sorted, so train_grps[0] is label 0 and train_grps[1] is label 1.
train_grps = [gb.get_group(x) for x in gb.groups]

# Sample class 0 from the *training* rows. The previous code sampled from
# grps[0], i.e. the whole dev split, which leaked validation rows (of both
# labels) into training and did not balance the classes.
# random_state makes re-running this cell reproduce the same sample.
class_0_sample = train_grps[0].sample(n=min(600, len(train_grps[0])), random_state=42)

frames = train_grps[1:2] + [class_0_sample]
result = pd.concat(frames)

In [None]:
result

Unnamed: 0,ID,text,label,split
1005,C4.974,Lit sir🔥bhenco desh bhakti mai bhi item song p...,1,train
1006,C36.1070,"आर्मी में हमे मर्द चाहिए,हिजड़े नही ।",1,train
1024,C33.485,Ye chutiya ladki kon h be ... ... kuch v baak ...,1,train
1036,C4.694.1,@KHAN SHAHAB abey jhaatu overseas ki baat mat ...,1,train
1041,C7.1756,Abe bc ye kis angle se movie review that Bhai ...,1,train
...,...,...,...,...
780,C4.175,Gyaan mt chod chl,1,dev
167,C4.1663,Kabir Singh se sirf or sirf 1 seekh milti h \...,0,dev
112,C4.1996.2,@Fact Park bhai 50 minutes se jyada tak kuch n...,0,dev
877,C19.169.2,Review ke liye sirf Mubarak,0,dev


### Model for sub-task b

In [None]:
def build_model_task_b(transformer, num_classes, max_len=512):
    """Build and compile the Sub-task B classifier on top of a transformer.

    The model feeds token ids through ``transformer``, takes the vector at
    sequence position 0 of its first output (used here as the CLS/summary
    token) and applies a softmax ``Dense`` layer over ``num_classes``.

    Args:
        transformer: Callable Keras-compatible transformer whose first output
            is indexed as ``[batch, position, hidden]``.
        num_classes: Number of output classes of the softmax layer.
        max_len: Fixed length of the ``input_word_ids`` input.

    Returns:
        A compiled ``Model`` using Adam (learning rate 1e-5), sparse
        categorical cross-entropy loss and the accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(num_classes, activation='softmax')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases no longer accept.
    model.compile(
        Adam(learning_rate=1e-5),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
    )

    return model

### Hyperparameters

In [None]:
# Number of epochs used by the Sub-task B CPU/GPU training cell.
EPOCHS = 2
# Batch size for the tf.data pipelines; scaling by the replica count is left disabled.
BATCH_SIZE = 16 #* strategy.num_replicas_in_sync
# Input length passed to the model builders as max_len.
MAX_LEN = 128
# Hugging Face model id loaded with TFAutoModel.from_pretrained.
MODEL = 'jplu/tf-xlm-roberta-base'

### Tokenizer

In [None]:
from transformers import XLMRobertaTokenizer

# Load the 'xlm-roberta-base' tokenizer used for every encoding call below.
# NOTE(review): do_lower_case / add_special_tokens / max_length / pad_to_max_length
# look like encode-time options rather than from_pretrained options — confirm they
# have any effect here; the encoding cells pass their own padding/length settings.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', do_lower_case=True, add_special_tokens=True,max_length=512, pad_to_max_length=True)

In [None]:
# Encode the balanced Sub-task B training texts into fixed-length id rows.
# padding='max_length' replaces the deprecated pad_to_max_length flag, and
# truncation=True makes the cut to MAX_LEN explicit, so every row has exactly
# MAX_LEN ids (the input length the model is built with).
x_train = np.asarray(
    tokenizer.batch_encode_plus(
        result['text'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_attention_mask=False,
    )['input_ids']
)

In [None]:
x_train.shape

In [None]:
# Sub-task B training labels, taken from the same frame (and row order) as the texts in x_train.
y_train = result.label

In [None]:
# Convert the label Series to a float32 NumPy array for the tf.data pipeline.
y_train = np.asarray(y_train, dtype=np.float32)

In [None]:
# Encode the Sub-task B dev texts (grps[0]) into fixed-length id rows.
# padding='max_length' replaces the deprecated pad_to_max_length flag, and
# truncation=True makes the cut to MAX_LEN explicit, so every row has exactly
# MAX_LEN ids (the input length the model is built with).
x_val = np.asarray(
    tokenizer.batch_encode_plus(
        grps[0]['text'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_attention_mask=False,
    )['input_ids']
)

In [None]:
x_val.shape

(997, 128)

In [None]:
# Sub-task B validation labels: the 'label' column of the dev group, row-aligned with x_val.
y_val = grps[0].label

In [None]:
# Convert the label Series to a float32 NumPy array for the tf.data pipeline and the metrics.
y_val = np.asarray(y_val, dtype=np.float32)

In [None]:
# Encode the test texts into fixed-length id rows.
# padding='max_length' replaces the deprecated pad_to_max_length flag, and
# truncation=True makes the cut to MAX_LEN explicit, so every row has exactly
# MAX_LEN ids (the input length the model is built with).
x_test = np.asarray(
    tokenizer.batch_encode_plus(
        test_df['Text'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_attention_mask=False,
    )['input_ids']
)

In [None]:
# Sentinel passed to .prefetch() below so tf.data picks the prefetch buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
# Training pipeline: repeats indefinitely (so fit() needs steps_per_epoch),
# shuffles with a 2048-element buffer, then batches and prefetches.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: one ordered pass, batched and cached after the first iteration.
val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_val, y_val))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

## Training

In [None]:
# Train on CPU/GPU

# Load the pretrained transformer and wrap it in the Sub-task B classifier.
transformer_layer = TFAutoModel.from_pretrained(MODEL)
model_task_b = build_model_task_b(transformer_layer,num_classes=2, max_len=MAX_LEN)
# The model is bound to `model_task_b`; the previous code called summary() and
# fit() on an undefined name `model`, which raised NameError on a fresh kernel.
model_task_b.summary()

# train_dataset repeats indefinitely, so the number of batches per epoch is fixed here.
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model_task_b.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=val_dataset,
    epochs=EPOCHS
)

In [None]:
# Train on TPU

# Create the transformer and classifier inside the distribution strategy's scope.
# This rebinds model_task_b, replacing any model built by the CPU/GPU cell.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model_task_b = build_model_task_b(transformer_layer,num_classes=2, max_len=MAX_LEN)
model_task_b.summary()

# train_dataset repeats indefinitely, so the number of batches per epoch is fixed here.
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model_task_b.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=val_dataset,
    # NOTE(review): hard-coded 100 epochs, whereas the CPU/GPU cell uses EPOCHS — confirm this is intended.
    epochs=100
)

In [None]:
# Predicted class id per validation row: index of the largest softmax probability.
y_pred = np.argmax(model_task_b.predict(x_val),axis=1)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# scikit-learn metrics take (y_true, y_pred) in that order. The previous calls
# passed the predictions first, which swapped precision with recall and
# transposed the confusion matrix.
print("Accuracy score =",accuracy_score(y_val,y_pred))
print("Precision score =",precision_score(y_val,y_pred))
print("Recall score =",recall_score(y_val,y_pred))
print("F1 score =",f1_score(y_val,y_pred))
# Rows are true labels, columns are predicted labels.
print(confusion_matrix(y_val,y_pred))

## Testing function for sub-task b

In [None]:
import warnings
warnings.filterwarnings("ignore")
def gender_bias(sentence):
  """Classify one sentence with the Sub-task B (gender-targeting) model.

  Args:
    sentence: Text to classify.

  Returns:
    A ``(label, class_probs)`` tuple: ``label`` is the key of
    ``task_b_label_dict`` whose id has the highest predicted probability, and
    ``class_probs`` is the model's probability vector for the sentence,
    indexed by class id.
  """
  # padding='max_length' replaces the deprecated pad_to_max_length flag and
  # truncation=True keeps long inputs at the MAX_LEN ids the model expects.
  encoded = tokenizer.batch_encode_plus(
      [sentence],
      padding='max_length',
      truncation=True,
      max_length=MAX_LEN,
      return_attention_mask=False,
  )
  x_input = np.asarray(encoded['input_ids'])
  class_probs = model_task_b.predict(x_input)
  pred_class = int(np.argmax(class_probs, axis=1)[0])
  # Look the label up by its id instead of relying on dict key order.
  id_to_label = {class_id: label for label, class_id in task_b_label_dict.items()}
  return id_to_label[pred_class], class_probs[0]

## GUI input for sub-task b

In [None]:
#@title Is this comment gender targeted?
# Colab form cell: edit `sentence`, then run to classify it with the Sub-task B model.
sentence = "Woh bohot bada gadha hai" #@param {type:"string"}
gender, class_probs = gender_bias(sentence)
# class_probs is indexed by the ids in task_b_label_dict: 0 = NGEN, 1 = GEN.
print("Is it Gender Targeted ? = ",gender)
print("Not Gender Targeted probability =",class_probs[0])
print("Gender Targeted probability =",class_probs[1])

# Sub-Task-A

In [None]:
# Partition the Sub-task A frame by its 'split' column. groupby sorts the group
# keys, so grps[0] holds the 'dev' rows and grps[1] the 'train' rows.
# This rebinds gb and grps, which previously held the Sub-task B groups.
gb = data_df_task_a.groupby('split')
grps = [gb.get_group(x) for x in gb.groups]

In [None]:
# Encode the Sub-task A training texts (grps[1]) into fixed-length id rows.
# padding='max_length' replaces the deprecated pad_to_max_length flag, and
# truncation=True makes the cut to MAX_LEN explicit (the original call only
# truncated implicitly and emitted a warning).
x_train = np.asarray(
    tokenizer.batch_encode_plus(
        grps[1]['text'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_attention_mask=False,
    )['input_ids']
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Sub-task A training labels: the 'label' column of the train group, row-aligned with x_train.
y_train = grps[1].label

In [None]:
# Convert the label Series to a float32 NumPy array for the tf.data pipeline.
y_train = np.asarray(y_train, dtype=np.float32)

In [None]:
# Encode the Sub-task A dev texts (grps[0]) into fixed-length id rows.
# padding='max_length' replaces the deprecated pad_to_max_length flag, and
# truncation=True makes the cut to MAX_LEN explicit, so every row has exactly
# MAX_LEN ids (the input length the model is built with).
x_val = np.asarray(
    tokenizer.batch_encode_plus(
        grps[0]['text'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_attention_mask=False,
    )['input_ids']
)

In [None]:
# Sub-task A validation labels: the 'label' column of the dev group, row-aligned with x_val.
y_val = grps[0].label

In [None]:
# Convert the label Series to a float32 NumPy array for the tf.data pipeline and the metrics.
y_val = np.asarray(y_val, dtype=np.float32)

In [None]:
# Sub-task A training pipeline (rebinds train_dataset): repeats indefinitely
# (so fit() needs steps_per_epoch), shuffles with a 2048-element buffer, then
# batches and prefetches.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Sub-task A validation pipeline (rebinds val_dataset): one ordered pass,
# batched and cached after the first iteration.
val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_val, y_val))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

## Model for sub-task A

In [None]:
def build_model_task_a(transformer, num_classes, max_len=512):
    """Build and compile the Sub-task A classifier on top of a transformer.

    The transformer's first output (one vector per input position) is passed
    through a bidirectional LSTM, max-pooled over the sequence axis, and then
    through a ReLU ``Dense`` layer, dropout and a softmax ``Dense`` layer over
    ``num_classes``.

    Args:
        transformer: Callable Keras-compatible transformer whose first output
            is a per-position sequence of hidden vectors.
        num_classes: Number of output classes of the softmax layer.
        max_len: Fixed length of the ``input_word_ids`` input.

    Returns:
        A compiled ``Model`` using Adam (learning rate 1e-5), sparse
        categorical cross-entropy loss and the accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]

    recurrent = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1)
    )(sequence_output)
    pooled = tf.keras.layers.GlobalMaxPool1D()(recurrent)
    hidden = tf.keras.layers.Dense(50, activation='relu')(pooled)
    hidden = tf.keras.layers.Dropout(0.2)(hidden)
    out = tf.keras.layers.Dense(num_classes, activation='softmax')(hidden)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases no longer accept.
    model.compile(
        Adam(learning_rate=1e-5),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
    )

    return model


## Training

In [None]:
# Train on CPU/GPU

# Load the pretrained transformer and wrap it in the three-class Sub-task A classifier.
transformer_layer = TFAutoModel.from_pretrained(MODEL)
model_task_a = build_model_task_a(transformer_layer,num_classes=3, max_len=MAX_LEN)
model_task_a.summary()

# train_dataset repeats indefinitely, so the number of batches per epoch is fixed here.
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model_task_a.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=val_dataset,
    # NOTE(review): hard-coded 20 epochs rather than the EPOCHS constant — confirm this is intended.
    epochs=20
)

In [None]:
# Train on TPU

# Create the transformer and classifier inside the distribution strategy's scope.
# This rebinds model_task_a, replacing any model built by the CPU/GPU cell.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model_task_a = build_model_task_a(transformer_layer, num_classes=3, max_len=MAX_LEN)
model_task_a.summary()

# train_dataset repeats indefinitely, so the number of batches per epoch is fixed here.
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model_task_a.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=val_dataset,
    # NOTE(review): literal 2 matches EPOCHS today but is not tied to it — confirm.
    epochs=2
)

In [None]:
# Predicted class id per validation row: index of the largest softmax probability.
y_pred = np.argmax(model_task_a.predict(x_val),axis=1)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# scikit-learn metrics take (y_true, y_pred) in that order. The previous calls
# passed the predictions first, which swapped macro precision with macro
# recall and transposed the confusion matrix.
print("Accuracy score =",accuracy_score(y_val,y_pred))
print("Precision score =",precision_score(y_val,y_pred,average='macro'))
print("Recall score =",recall_score(y_val,y_pred,average='macro'))
print("F1 score =",f1_score(y_val,y_pred,average='macro'))
# Rows are true labels, columns are predicted labels.
print(confusion_matrix(y_val,y_pred))

## Testing function for sub-task A

In [None]:
import warnings
warnings.filterwarnings("ignore")
def agression(sentence):
  """Classify one sentence with the Sub-task A (aggression level) model.

  Args:
    sentence: Text to classify.

  Returns:
    A ``(label, class_probs)`` tuple: ``label`` is the key of
    ``task_a_label_dict`` whose id has the highest predicted probability, and
    ``class_probs`` is the model's probability vector for the sentence,
    indexed by class id.
  """
  # padding='max_length' replaces the deprecated pad_to_max_length flag and
  # truncation=True keeps long inputs at the MAX_LEN ids the model expects.
  encoded = tokenizer.batch_encode_plus(
      [sentence],
      padding='max_length',
      truncation=True,
      max_length=MAX_LEN,
      return_attention_mask=False,
  )
  x_input = np.asarray(encoded['input_ids'])
  class_probs = model_task_a.predict(x_input)
  pred_class = int(np.argmax(class_probs, axis=1)[0])
  # Look the label up by its id instead of relying on dict key order; the local
  # is no longer named after the function itself.
  id_to_label = {class_id: label for label, class_id in task_a_label_dict.items()}
  return id_to_label[pred_class], class_probs[0]

## GUI input for sub-task A

In [None]:
#@title Agression Level
# Colab form cell: edit `sentence`, then run to classify it with the Sub-task A model.
sentence = "Yaha aaoge to sar yehi rakh ke jaana padega" #@param {type:"string"}
agr, class_probs = agression(sentence)
# class_probs is indexed by the ids in task_a_label_dict: 0 = NAG, 1 = OAG, 2 = CAG.
print("Agression level = ",agr)
print("NAG probability =",class_probs[0])
print("OAG probability =",class_probs[1])
print("CAG probability =",class_probs[2])

Agression level =  OAG
NAG probability = 0.30922744
OAG probability = 0.35746512
CAG probability = 0.33330742
