**Importing Libraries**
---


In [8]:
# !unzip 'Data.zip'
# !pip install hazm

In [9]:
import os
import json
import pandas as pd
import numpy as np

from transformers import glue_convert_examples_to_features, TFBertForSequenceClassification
import tensorflow as tf
from transformers import BertTokenizer, BertForSequenceClassification, AutoConfig, AutoTokenizer, TFAutoModel, BertConfig

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm
tqdm.pandas()

import re
from hazm import *

**Make a CSV dataset fram folders**
---

In [3]:
list_of_text = []

for auth in os.listdir('Data'):
    if auth == '.DS_Store':
        continue
    for file in os.listdir('Data/'+auth):
        if file == '.DS_Store':
            continue
        with open(os.path.join('Data/'+auth, file)) as f:
            text = f.read()
        list_of_text.append((text, auth))

df = pd.DataFrame(list_of_text, columns = ['Rev', 'Auth'])
df.to_csv('rev.csv', index_label=False)

In [4]:
df.size

600

**Preprocess functions**
---

In [5]:
def rm_link(text):
    """
    Removes hyperlinks from the text.
    """
    return re.sub(r'http\S+', '', text)

def rm_punct2(text):
    """
    Removes punctuation marks from the text.
    """
    return re.sub(r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]', ' ', text)

def rm_html(text):
    """
    Removes HTML tags from the text.
    """
    text = re.sub(r'<.*?>', '', text)
    return re.sub(r'<br />', '', text)

def space_bt_punct(text):
    """
    Adds spaces between punctuation marks and words.
    """
    pattern = r'([.,!?-])'
    s = re.sub(pattern, r' \1 ', text)
    s = re.sub(r'\s{2,}', ' ', s)
    return s

def rm_number(text):
    """
    Removes numbers from the text.
    """
    return re.sub(r'\d+', '', text)

def rm_whitespaces(text):
    """
    Removes extra whitespaces from the text.
    """
    return re.sub(r'\s+', ' ', text)

def rm_nonascii(text):
    """
    Removes non-ASCII characters from the text
    """
    return re.sub(r'[a-zA-Z!,*)@#%(&$_?.]', r'', text)

def spell_correction(text):
    """
    Used for checking the correct spelling given a text
    """
    return text
    spell = SpellChecker()
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            candidate = spell.correction(word)
            if candidate is not None:
                corrected_text.append(candidate)
            else:
                corrected_text.append(word)
        else:
            corrected_text.append(word)
    return ' '.join(corrected_text)

def clean_pipeline(text):
    """
    Combines all the above preprocessing steps into a single pipeline and produces the final processed result given an input text.
    """
    text = text.lower()
    no_link = rm_link(text)
    no_html = rm_html(no_link)
    space_punct = space_bt_punct(no_html)
    no_punct = rm_punct2(space_punct)
    no_number = rm_number(no_punct)
    no_whitespaces = rm_whitespaces(no_number)
    no_nonasci = rm_nonascii(no_whitespaces)
    spell_corrected = spell_correction(no_nonasci)
    return spell_corrected

**Head of CSV file**
---

In [6]:
data = pd.read_csv('rev.csv')
data.head()

Unnamed: 0,Rev,Auth
0,\nحالا سه هفته است که پخش سریال هفت از شبکه‌ی...,Farid Matin
1,در درون پوسته... نیز با پلان‌هایی بلند مواجه‌ا...,Farid Matin
2,می‌توان به‌جرئت گفت که کاراکتری ساخته و پرداخت...,Farid Matin
3,پدرِ روزبه درگیر آلزایمر است و نه‌تنها درست نم...,Farid Matin
4,دوربین دیگر فقط یک شی‌ء گران‌قیمتِ معمولی نیس...,Farid Matin


**Aplly preprocess function on texts**
---

In [7]:
data['Rev'] = data['Rev'].progress_apply(clean_pipeline)

100%|██████████| 300/300 [00:00<00:00, 364.54it/s]


**Normalizer function in hazm**
---

In [8]:
normalizer = Normalizer()

**Make stopwords**
---

In [10]:
with open('stopwords.txt', encoding="utf8") as stopwords_file:
    stopwords = stopwords_file.readlines()
stopwords = [line.replace('\n', '') for line in stopwords]

In [11]:
stopwords[20:25]

['آخر', 'آخرها', 'آخه', 'آدمهاست', 'آرام']

In [12]:
len(stopwords)

1316

**Stemmer on texts**
---

In [13]:
stemmer = Stemmer()
stemmer.stem('کتاب‌ها')

'کتاب'

**Second preprocess function which do normalization, stemming and tokenizing**
---

In [14]:
def preprocess2(text):
    text_token = word_tokenize(text)
    text_f = [w for w in text_token if not w in stopwords]
    text_stemmed = [stemmer.stem(w) for w in text_f]

    return ' '.join(text_stemmed).replace('\r', '')

In [15]:
def remove_point(text):
    text = normalizer.normalize(text)
    text = text.replace('\u200c', '')
    return text.replace('•', '')

**Apply other preprocessing function**
---

In [16]:
data['Rev'] = data['Rev'].apply(lambda x:remove_point(x))
data['Rev'] = data['Rev'].apply(lambda x:preprocess2(x))

In [17]:
data.head()

Unnamed: 0,Rev,Auth
0,سه هفته پخ سریال شبکه نمایش تماشاخونه آغاز شده...,Farid Matin
1,پوسته پلان بلند مواج فا تیین عجل کاتزدن بهنوع ...,Farid Matin
2,میتو بهجرئ کاراکتر پرداخته نمیشود تماشا سه قسم...,Farid Matin
3,پدر روزبه درگیر آلزایمر نهتن نمیشنود بهیاد نمی...,Farid Matin
4,دوربین شیء گرانقیم وسیل میتواند هو فریز آینده ...,Farid Matin


**Replacing label with ID in CSV file**
---

In [18]:
classes = {'Ashtari':1, 'Behzad Bahramijoo':2, 'Dehghan':3, 'Elham Hesaraki':4,
       'Farid Matin':5, 'Mohammad Dehghani':6, 'Reza Hajmohammadi':7,
       'Saber Rastikerdar':8, 'Sheikhi':9, 'Zahedi':10}

In [19]:
data['Auth'] = data['Auth'].replace(classes)

In [20]:
data['Auth'].value_counts()

5     30
8     30
7     30
4     30
10    30
3     30
9     30
6     30
2     30
1     30
Name: Auth, dtype: int64

**Loading pre-trained model**
---
Here we are using HooshvareLab/bert-base-parsbert-uncased model.

In [44]:
model_name = 'HooshvareLab/bert-base-parsbert-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

**Creating tokenizer and config for the BERT model**
---

In [24]:
id2label = {v:k for k, v in classes.items()}
label2id = classes

In [25]:
tokenizer = BertTokenizer.from_pretrained(model_name)
config = BertConfig.from_pretrained(
    model_name, **{
        'label2id': label2id,
        'id2label': id2label,
    })

print(config.to_json_string())

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "1": "Ashtari",
    "2": "Behzad Bahramijoo",
    "3": "Dehghan",
    "4": "Elham Hesaraki",
    "5": "Farid Matin",
    "6": "Mohammad Dehghani",
    "7": "Reza Hajmohammadi",
    "8": "Saber Rastikerdar",
    "9": "Sheikhi",
    "10": "Zahedi"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Ashtari": 1,
    "Behzad Bahramijoo": 2,
    "Dehghan": 3,
    "Elham Hesaraki": 4,
    "Farid Matin": 5,
    "Mohammad Dehghani": 6,
    "Reza Hajmohammadi": 7,
    "Saber Rastikerdar": 8,
    "Sheikhi": 9,
    "Zahedi": 10
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_versio

**Make train and valid dataset using for KFold**
---

In [27]:
class InputExample:
    """ A single example for simple sequence classification. """

    def __init__(self, guid, text_a, text_b=None, label=None):
        """ Constructs a InputExample. """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

In [28]:
def make_examples(tokenizer, x, y=None, maxlen=128, output_mode="classification", is_tf_dataset=True):
    examples = []
    y = y if isinstance(y, list) or isinstance(y, np.ndarray) else [None] * len(x)

    for i, (_x, _y) in tqdm(enumerate(zip(x, y)), position=0, total=len(x)):
        guid = "%s" % i
        label = int(_y)

        if isinstance(_x, str):
            text_a = _x
            text_b = None
        else:
            assert len(_x) == 2
            text_a = _x[0]
            text_b = _x[1]

        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

    features = glue_convert_examples_to_features(
        examples,
        tokenizer,
        maxlen,
        output_mode=output_mode,
        label_list=list(np.unique(y)))

    all_input_ids = []
    all_attention_masks = []
    all_token_type_ids = []
    all_labels = []

    for f in tqdm(features, position=0, total=len(examples)):
        if is_tf_dataset:
            all_input_ids.append(tf.constant(f.input_ids))
            all_attention_masks.append(tf.constant(f.attention_mask))
            all_token_type_ids.append(tf.constant(f.token_type_ids))
            all_labels.append(tf.constant(f.label))
        else:
            all_input_ids.append(f.input_ids)
            all_attention_masks.append(f.attention_mask)
            all_token_type_ids.append(f.token_type_ids)
            all_labels.append(f.label)

    if is_tf_dataset:
        dataset = tf.data.Dataset.from_tensor_slices(({
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'token_type_ids': all_token_type_ids
        }, all_labels))

        return dataset, features

    xdata = [np.array(all_input_ids), np.array(all_attention_masks), np.array(all_token_type_ids)]
    ydata = all_labels

    return [xdata, ydata], features

In [32]:
def get_training_dataset(dataset, batch_size):
    dataset = dataset.repeat()
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(batch_size)

    return dataset

def get_validation_dataset(dataset, batch_size):
    dataset = dataset.batch(batch_size)

    return dataset

**Training phase CONFIG**
---

In [52]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4

EPOCHS = 5
LEARNING_RATE = 2e-5

**Function for building model and optimizer**
---

In [48]:
def build_model(model_name, config, learning_rate=3e-5):
    model = TFBertForSequenceClassification.from_pretrained(model_name, config=config)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    return model

In [49]:
model = build_model(model_name, config, learning_rate=LEARNING_RATE)

tf_model.h5:   0%|          | 0.00/963M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
labels = list(sorted(df['Auth'].unique()))
labels

['Ashtari',
 'Behzad Bahramijoo',
 'Dehghan',
 'Elham Hesaraki',
 'Farid Matin',
 'Mohammad Dehghani',
 'Reza Hajmohammadi',
 'Saber Rastikerdar',
 'Sheikhi',
 'Zahedi']

**Cross Fold calidation**
---
Here we are using Cross Fold validation with 5 splits.
The functions takes two inputs, first a dataset which is a list of tuples, an author_index, in this case the author's name, and the text document. In addition it get the number of epochs for each fold which is equal to 3.
Next, we devide data into X and y which represent the text documnts and their author_indexes respectively.
Then the folding begins. For each Fold, a new BERT model is created. Then the model is used to predict the results for that fold and finally, it gets validated.
In every turn the accuracy is saved in the accuracies list.

In [71]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
X, y = data['Rev'].values.tolist(), data['Auth'].values.tolist()
for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_train_fold, X_val_fold = [X[i] for i in train_index], [X[i] for i in val_index]
    y_train_fold, y_val_fold = [y[i] for i in train_index], [y[i] for i in val_index]

    train_dataset_base, train_examples = make_examples(tokenizer, X_train_fold, y_train_fold, maxlen=128)
    valid_dataset_base, valid_examples = make_examples(tokenizer, X_val_fold, y_val_fold, maxlen=128)

    train_dataset_fold = get_training_dataset(train_dataset_base, TRAIN_BATCH_SIZE)
    valid_dataset_fold = get_training_dataset(valid_dataset_base, VALID_BATCH_SIZE)

    train_steps = len(train_examples) // TRAIN_BATCH_SIZE
    valid_steps = len(valid_examples) // VALID_BATCH_SIZE

    model = build_model(model_name, config, learning_rate=LEARNING_RATE)

    r = model.fit(
      train_dataset_fold,
      validation_data=valid_dataset_fold,
      steps_per_epoch=train_steps,
      validation_steps=valid_steps,
      epochs=EPOCHS,
      verbose=1)

    [xtest, ytest], test_examples = make_examples(tokenizer, X_val_fold, y_val_fold, maxlen=128, is_tf_dataset=False)
    predictions = model.predict(xtest)
    ypred = predictions[0].argmax(axis=-1).tolist()

    print()
    print("============Fold"+str(i+1)+"============")
    print(classification_report(ytest, ypred, target_names=labels))
    print()
    print(f'F1: {f1_score(ytest, ypred, average="weighted")}')

100%|██████████| 240/240 [00:00<00:00, 37941.76it/s]
100%|██████████| 240/240 [00:00<00:00, 6580.33it/s]
100%|██████████| 60/60 [00:00<00:00, 83746.50it/s]
100%|██████████| 60/60 [00:00<00:00, 5271.65it/s]
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


100%|██████████| 60/60 [00:00<00:00, 73433.98it/s]
100%|██████████| 60/60 [00:00<00:00, 335097.52it/s]



                   precision    recall  f1-score   support

          Ashtari       1.00      1.00      1.00         6
Behzad Bahramijoo       1.00      0.67      0.80         3
          Dehghan       1.00      1.00      1.00         5
   Elham Hesaraki       1.00      0.67      0.80         9
      Farid Matin       1.00      0.67      0.80         6
Mohammad Dehghani       0.85      0.92      0.88        12
Reza Hajmohammadi       0.70      1.00      0.82         7
Saber Rastikerdar       0.83      1.00      0.91         5
          Sheikhi       0.80      0.80      0.80         5
           Zahedi       0.67      1.00      0.80         2

         accuracy                           0.87        60
        macro avg       0.88      0.87      0.86        60
     weighted avg       0.89      0.87      0.86        60


F1: 0.8645026737967914


100%|██████████| 240/240 [00:00<00:00, 187734.61it/s]
100%|██████████| 240/240 [00:00<00:00, 9340.74it/s]
100%|██████████| 60/60 [00:00<00:00, 238538.62it/s]
100%|██████████| 60/60 [00:00<00:00, 5315.30it/s]
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


100%|██████████| 60/60 [00:00<00:00, 37288.23it/s]
100%|██████████| 60/60 [00:00<00:00, 338705.57it/s]



                   precision    recall  f1-score   support

          Ashtari       1.00      1.00      1.00         4
Behzad Bahramijoo       1.00      1.00      1.00         5
          Dehghan       1.00      1.00      1.00         4
   Elham Hesaraki       1.00      0.83      0.91         6
      Farid Matin       0.70      1.00      0.82         7
Mohammad Dehghani       0.83      1.00      0.91         5
Reza Hajmohammadi       1.00      0.67      0.80         9
Saber Rastikerdar       1.00      1.00      1.00         5
          Sheikhi       1.00      0.88      0.93         8
           Zahedi       0.88      1.00      0.93         7

         accuracy                           0.92        60
        macro avg       0.94      0.94      0.93        60
     weighted avg       0.94      0.92      0.92        60


F1: 0.916078431372549


100%|██████████| 240/240 [00:00<00:00, 305410.49it/s]
100%|██████████| 240/240 [00:00<00:00, 9007.01it/s]
100%|██████████| 60/60 [00:00<00:00, 85278.97it/s]
100%|██████████| 60/60 [00:00<00:00, 10830.07it/s]
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


100%|██████████| 60/60 [00:00<00:00, 42245.80it/s]
100%|██████████| 60/60 [00:00<00:00, 350498.94it/s]



                   precision    recall  f1-score   support

          Ashtari       0.83      0.83      0.83         6
Behzad Bahramijoo       1.00      0.86      0.92         7
          Dehghan       0.90      0.90      0.90        10
   Elham Hesaraki       1.00      1.00      1.00         5
      Farid Matin       0.83      1.00      0.91         5
Mohammad Dehghani       1.00      1.00      1.00         4
Reza Hajmohammadi       1.00      0.80      0.89         5
Saber Rastikerdar       1.00      1.00      1.00         5
          Sheikhi       1.00      1.00      1.00         4
           Zahedi       0.90      1.00      0.95         9

         accuracy                           0.93        60
        macro avg       0.95      0.94      0.94        60
     weighted avg       0.94      0.93      0.93        60


F1: 0.9329625540151856


100%|██████████| 240/240 [00:00<00:00, 195273.13it/s]
100%|██████████| 240/240 [00:00<00:00, 9085.71it/s]
100%|██████████| 60/60 [00:00<00:00, 237862.23it/s]
100%|██████████| 60/60 [00:00<00:00, 6332.62it/s]
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


100%|██████████| 60/60 [00:00<00:00, 178228.22it/s]
100%|██████████| 60/60 [00:00<00:00, 19703.90it/s]



                   precision    recall  f1-score   support

          Ashtari       1.00      0.71      0.83         7
Behzad Bahramijoo       1.00      0.90      0.95        10
          Dehghan       0.71      1.00      0.83         5
   Elham Hesaraki       0.80      1.00      0.89         4
      Farid Matin       1.00      1.00      1.00         7
Mohammad Dehghani       0.80      0.80      0.80         5
Reza Hajmohammadi       1.00      1.00      1.00         5
Saber Rastikerdar       1.00      1.00      1.00         6
          Sheikhi       0.83      0.83      0.83         6
           Zahedi       1.00      1.00      1.00         5

         accuracy                           0.92        60
        macro avg       0.91      0.92      0.91        60
     weighted avg       0.93      0.92      0.92        60


F1: 0.9171539961013645


100%|██████████| 240/240 [00:00<00:00, 46250.08it/s]
100%|██████████| 240/240 [00:00<00:00, 9452.93it/s]
100%|██████████| 60/60 [00:00<00:00, 252668.92it/s]
100%|██████████| 60/60 [00:00<00:00, 6934.07it/s]
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


100%|██████████| 60/60 [00:00<00:00, 193137.56it/s]
100%|██████████| 60/60 [00:00<00:00, 21011.79it/s]



                   precision    recall  f1-score   support

          Ashtari       0.70      1.00      0.82         7
Behzad Bahramijoo       1.00      1.00      1.00         5
          Dehghan       1.00      0.50      0.67         6
   Elham Hesaraki       1.00      1.00      1.00         6
      Farid Matin       1.00      1.00      1.00         5
Mohammad Dehghani       1.00      1.00      1.00         4
Reza Hajmohammadi       1.00      1.00      1.00         4
Saber Rastikerdar       1.00      1.00      1.00         9
          Sheikhi       1.00      1.00      1.00         7
           Zahedi       1.00      1.00      1.00         7

         accuracy                           0.95        60
        macro avg       0.97      0.95      0.95        60
     weighted avg       0.96      0.95      0.95        60


F1: 0.946078431372549
