# model_view & head_view

In [None]:
import sys
!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']
!pip install regex
!pip install transformers

from bertviz import model_view
from bertviz import head_view
from transformers import BertTokenizer, BertModel

In [2]:
def call_html():
  """Emit an HTML snippet into the current cell output.

  The snippet loads require.js from the notebook's static files and registers
  requirejs paths for `base`, `d3` and `jquery`.
  """
  from IPython.display import HTML, display

  requirejs_bootstrap = '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/5.7.0/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''
  display(HTML(requirejs_bootstrap))

In [3]:
# Load Bangla BERT, run one sentence pair through it, and keep the attention
# weights plus the token strings for the bertviz visualisations.
model_version = 'sagorsarker/bangla-bert-base'
do_lower_case = True
# output_attentions=True asks the model to return its attention weights.
model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)
sentence_a = "আমি বাংলায় গান গাই।"
sentence_b = "সে বাজারে যায়।"
# Call the tokenizer directly: encode_plus() is deprecated in favour of __call__.
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True)
token_type_ids = inputs['token_type_ids']
input_ids = inputs['input_ids']
# The attentions are the last element of the model output.
attention = model(input_ids, token_type_ids=token_type_ids)[-1]
input_id_list = input_ids[0].tolist() # Batch index 0
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
call_html()

Downloading:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/660M [00:00<?, ?B/s]

Some weights of the model checkpoint at sagorsarker/bangla-bert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

In [4]:
# Render bertviz's model view from the attention weights and tokens computed in the previous cell.
model_view(attention, tokens)

<IPython.core.display.Javascript object>

In [8]:
# Render bertviz's head view for the same attention weights and tokens.
head_view(attention, tokens)

<IPython.core.display.Javascript object>

# Word filling

In [None]:
!pip install transformers

In [10]:
from transformers import BertForMaskedLM, BertTokenizer, pipeline

# Build a fill-mask pipeline from the Bangla BERT masked-LM weights and its tokenizer.
mlm_checkpoint = "sagorsarker/bangla-bert-base"
tokenizer = BertTokenizer.from_pretrained(mlm_checkpoint)
model = BertForMaskedLM.from_pretrained(mlm_checkpoint)
nlp = pipeline(
    'fill-mask',
    tokenizer=tokenizer,
    model=model,
)

Some weights of the model checkpoint at sagorsarker/bangla-bert-base were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Mask one word of the sentence and print every candidate the pipeline returns.
masked_sentence = f"আমি বাংলায় {nlp.tokenizer.mask_token} গাই।"
candidates = nlp(masked_sentence)
for candidate in candidates:
  print(candidate)

{'sequence': 'আমি বাংলায গান গাই ।', 'score': 0.13404710590839386, 'token': 2552, 'token_str': 'গ া ন'}
{'sequence': 'আমি বাংলাযও গাই ।', 'score': 0.061385899782180786, 'token': 2058, 'token_str': '# # ও'}
{'sequence': 'আমি বাংলায সাহিত্য গাই ।', 'score': 0.04723009467124939, 'token': 4122, 'token_str': 'স া হ ি ত ্ য'}
{'sequence': 'আমি বাংলায কবিতা গাই ।', 'score': 0.04314074665307999, 'token': 4459, 'token_str': 'ক ব ি ত া'}
{'sequence': 'আমি বাংলায শব্দ গাই ।', 'score': 0.013253462500870228, 'token': 3264, 'token_str': 'শ ব ্ দ'}


## Bangla BERT Tokenization

In [12]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer published with the Bangla BERT checkpoint.
bnbert_tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")
text = "আমি বাংলায় গান গাই।"
# Last expression of the cell: displays the list of tokens produced for `text`.
bnbert_tokenizer.tokenize(text)

['আমি', 'বাংলা', '##য', 'গান', 'গাই', '।']

# Sentiment Analysis


## Dependency

In [None]:
!pip install simpletransformers

## Data loading and train/evaluation split

In [28]:
# data prepare
import pandas as pd
# NOTE(review): absolute path, presumably the Colab working directory - adjust when running elsewhere.
input_path = '/content/emotional_data.csv'
# Fixed seed so the shuffled row order is the same on every run.
SHUFFLE_SEED = 42
df = pd.read_csv(input_path)
df = df.drop(df.columns[[0]], axis=1)  # delete the 1st column for my dataset
# frac=1 samples every row, i.e. shuffles the whole frame.
df = df.sample(frac=1, random_state=SHUFFLE_SEED)
df.head()

Unnamed: 0,text,labels
841,বাস্তুল দূর্গ নিয়ে ভিডিও হলে ভালো হইতো,0
950,গ্রুপে চার ম্যাচ খেলে ৬ পয়েন্ট নিয়ে শীর্ষেই ...,0
201,অর্থাৎ জিততে পারলে সে তিনগুন পাবে,4
322,একে এক নতুন আত্মসমর্পণ বলা যায়,1
139,"ভাল করে খেয়ে এসো কিন্তু,,,",5


In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state pins the split so reruns
# train and evaluate on the same rows.
# NOTE(review): `eval` shadows the Python builtin; the name is kept because later cells use it.
train, eval = train_test_split(df, test_size=0.2, random_state=42)

In [30]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,text,labels
695,নতুন রূপে নোকিয়া ৩৩১০,4
989,আপনার ভিডিও দেখে সাত কেজি ওজন কমেছে ১ মাসে,0
257,আল্লাহ রাব্বুল আলামীন আমাদের রহম করুন আমীন,0
666,আপনার চুল গুলো সুন্নত তরিকায় হলে ভালো হতো,1
699,একেই বোধহয় বলে রাজকপাল!,0


In [31]:
# Preview the first rows of the evaluation split.
eval.head()

Unnamed: 0,text,labels
898,"সেলার ভালো,প্রোডাক্ট ভালো।",0
441,রাবিতে বঙ্গবন্ধুর জন্মবার্ষিকী ও জাতীয় শিশুদিব...,5
856,দ্রুত পরিক্ষা নেয়া হোক পারলে দুই-তিন বার নেয়া ...,0
880,"স্যার, সিপিবি এর তাদের ইশতেহারটি চার পষ্ঠার না...",0
122,বয়োম এর চেয়ে এটা কিনলে সাশ্রয়ী এবং চামচ ও ফ্রি...,0


## Bangla BERT

In [None]:
import pandas as pd

from simpletransformers.classification import ClassificationModel


# Create a ClassificationModel
# The option key is "num_train_epochs"; the previous spelling "num_train_epoch"
# did not match that option, so the requested epoch count was never applied.
model = ClassificationModel(
    "bert",
    "sagorsarker/bangla-bert-base",
    num_labels=6,
    args={"num_train_epochs": 1, "reprocess_input_data": False, "overwrite_output_dir": True},
)

# Train the model
model.train_model(train)

# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(eval)


In [None]:
# Show the first value returned by model.eval_model in the training cell.
print(result)

## Model prediction

In [None]:
# predict() takes a list of texts, so the single sentence is wrapped in a list
# instead of being passed as a bare string.
model.predict(["এতো সুন্দর প্রতিবেদন মনে হয় খালি দেখেই যাই"])