## **Load the Drive helper and mount**

In [None]:
# Colab-only helper for attaching Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/drive (the recorded cell output confirms this mount point).
drive.mount('/content/drive')

Mounted at /content/drive


## **Importing packages**

In [None]:
import json
import pandas as pd
from tqdm import tqdm
import pickle

## **Installing required package for data augmentation**
Reference: https://github.com/sagorbrur/bnaug

In [None]:
!pip install bnaug

Collecting bnaug
  Downloading bnaug-1.1.1-py3-none-any.whl (4.8 kB)
Collecting bnlp-toolkit (from bnaug)
  Downloading bnlp_toolkit-3.3.2-py3-none-any.whl (23 kB)
Collecting transformers==4.24.0 (from bnaug)
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0 (from transformers==4.24.0->bnaug)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.24.0->bnaug)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from bnlp-toolkit->bnaug)
 

## **Reading train data**

In [None]:
# Load the training set; later cells read its 'label' and 'text' columns.
# NOTE(review): the path is relative to the kernel's working directory, and no
# cell shown here changes directory after mounting Drive — confirm it resolves.
train_data = pd.read_csv('Data/train.csv')

## **Given Label to ID mapping**

In [None]:
# Label name -> integer id, as given with the dataset. Kept as a real mapping
# (instead of a bare string literal) so code can look ids up by name.
LABEL_TO_ID = {
    'Direct Violence': 2,
    'Passive Violence': 1,
    'Non-Violence': 0,
}
# Last expression of the cell: displays the mapping as the cell output.
LABEL_TO_ID

## **Checking the class distribution in train data**

In [None]:
# Count training rows per label id to see how imbalanced the classes are.
train_data['label'].value_counts()

0    1389
1     922
2     389
Name: label, dtype: int64

In [None]:
# Texts of every training row labelled 0, collected into a plain Python list.
sent0 = train_data.loc[train_data['label'].eq(0), 'text'].tolist()
print(len(sent0))

1389


## **Checking data augmentation on a sample sentence**

In [None]:
# Imported here rather than in the top import cell because bnaug is only
# installed by the pip cell above.
from bnaug.sentence import TextGeneration

# Shared generator instance; create_augmented_data() below reads this global.
tg = TextGeneration()
# Sample Bengali sentence used as a smoke test for the paraphraser.
text = "বিমানটি যখন মাটিতে নামার জন্য এয়ারপোর্টের কাছাকাছি আসছে, তখন ল্যান্ডিং গিয়ারের খোপের ঢাকনাটি খুলে যায়।"
# 'parapharse_generation' (sic) is the method name as called here; in the
# recorded run it printed a list containing a single paraphrased string.
output = tg.parapharse_generation(text)
print(output)

Downloading (…)lve/main/config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]



[' প্লেনটা এয়ারপোর্টের কাছে অবতরণ করার সময়, ল্যান্ডিং গিয়ার প্যানেলের ঢাকনাটা খুলে দেওয়া']


In [None]:
def create_augmented_data(sent, count):
    """Paraphrase the first `count` entries of `sent` with the global `tg`.

    Args:
        sent: Sequence of source sentences (must support len() and slicing).
        count: Maximum number of leading sentences to paraphrase. Values
            larger than len(sent) are clamped to len(sent); values <= 0
            produce an empty result.

    Returns:
        A list with one entry per processed sentence, in input order. Each
        entry is whatever `tg.parapharse_generation` returns for that
        sentence (the sample run in this notebook printed a one-element list).
    """
    # Clamp instead of indexing blindly: the original `sent[ii]` over
    # `range(count)` raised IndexError when count exceeded len(sent),
    # discarding every paraphrase generated up to that point.
    limit = max(0, min(count, len(sent)))

    augmented_sent = []
    for source_text in tqdm(sent[:limit]):
        augmented_sent.append(tg.parapharse_generation(source_text))

    return augmented_sent

## **Creating augmented data for each class label**

In [None]:
# sent1 and sent2 are not defined by any earlier cell in this notebook, so a
# fresh Restart & Run All would stop here with NameError. Build them the same
# way sent0 was built, one list of texts per label id.
sent1 = train_data[train_data['label'] == 1]['text'].tolist()
sent2 = train_data[train_data['label'] == 2]['text'].tolist()

# Number of source sentences paraphrased per class. The label counts shown
# earlier list 389 rows for label 2, so 389 covers that whole class.
augmented_sent0 = create_augmented_data(sent0, 500)
augmented_sent1 = create_augmented_data(sent1, 500)
augmented_sent2 = create_augmented_data(sent2, 389)

## **Saving augmented data for each class into pickle files**

In [None]:
def _save_and_verify_pickle(obj, path):
    """Pickle `obj` to `path`, read it back, and print whether it round-tripped.

    Args:
        obj: Object to serialise.
        path: Destination file path; its parent directory is created if missing.

    Returns:
        The object loaded back from `path`.
    """
    import os

    # open(..., 'wb') does not create missing directories, so make sure the
    # output folder exists before writing.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)

    with open(path, 'wb') as f:
        pickle.dump(obj, f)

    # Reading back a file this cell has just written; do not reuse this
    # helper to load pickles from untrusted sources.
    with open(path, 'rb') as f:
        restored = pickle.load(f)

    print(obj == restored)
    return restored


# One file per class label; b, c and d hold the reloaded copies.
b = _save_and_verify_pickle(augmented_sent0, 'Data/Augmented/category0_augmented.pickle')
c = _save_and_verify_pickle(augmented_sent1, 'Data/Augmented/category1_augmented.pickle')
d = _save_and_verify_pickle(augmented_sent2, 'Data/Augmented/category2_augmented.pickle')