In [64]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '20'

In [65]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras import layers
from keras.layers import Dense, GlobalAveragePooling1D, Embedding, Dropout
from keras.models import Sequential
from keras.losses import CategoricalCrossentropy, CategoricalFocalCrossentropy
from keras.metrics import Precision, Recall, CategoricalAccuracy, F1Score, TruePositives, TrueNegatives, FalseNegatives, FalsePositives
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences

In [66]:
url = "https://github.com/Alireza-Akhavan/text-classification/raw/main/6-persian-topics.zip"
dataset = tf.keras.utils.get_file("6-persian-topics.zip", url,
                                    extract=True, cache_dir='.',
                                    cache_subdir='')
dataset_dir = os.path.join(os.path.dirname(dataset), '6-persian-topics')


In [67]:
ry = []
ryazi = os.path.join(dataset_dir, 'ریاضیات/')
for r in os.listdir(ryazi):
    ry.append(os.path.join(dataset_dir, 'ریاضیات/', r))
beh = []
behdasht = os.path.join(dataset_dir, 'بهداشت و سلامت/')
for r in os.listdir(behdasht):
    beh.append(os.path.join(dataset_dir, 'بهداشت و سلامت/', r))
jog = []
joghrafia = os.path.join(dataset_dir, 'جغرافیا و مکانها/')
for r in os.listdir(joghrafia):
    jog.append(os.path.join(dataset_dir, 'جغرافیا و مکانها/', r))
fan = []
fanavari = os.path.join(dataset_dir, 'فناوری و علوم کاربردی و تکنولوژی/')
for r in os.listdir(fanavari):
    fan.append(os.path.join(dataset_dir, 'فناوری و علوم کاربردی و تکنولوژی/', r))
var = []
varzesh = os.path.join(dataset_dir, 'ورزش/')
for r in os.listdir(varzesh):
    var.append(os.path.join(dataset_dir, 'ورزش/', r))
di = []
din = os.path.join(dataset_dir, 'دین و اعتقاد/')
for r in os.listdir(din):
    di.append(os.path.join(dataset_dir, 'دین و اعتقاد/', r))
ry.extend(beh)
ry.extend(jog)
ry.extend(fan)
ry.extend(var)
ry.extend(di)

for i in ry:
    file = open(i, 'r')
    content = file.read()
    if len(content.split(' ')) <= 5:
        os.remove(i)
    file.close()

In [68]:
classes = os.listdir(dataset_dir)
dataset = []
labels = []
for c in classes:
    files = os.listdir(os.path.join(dataset_dir, c))
    for f in files:
        file_path = os.path.join(os.path.join(dataset_dir, c), f)
        dataset.append(file_path)
        labels.append(classes.index(c))


train_data, valid_data, train_target, valid_target = train_test_split(dataset, labels, test_size=0.2)



In [69]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    stripped_ye = tf.strings.regex_replace(stripped_html, 'ي', 'ی')
    stripped_camma = tf.strings.regex_replace(stripped_ye, '،', ' ')
    stripped_colon = tf.strings.regex_replace(stripped_camma, ':', ' ')
    stripped_he = tf.strings.regex_replace(stripped_colon, 'هٔ', 'ه')
    stripped_ke = tf.strings.regex_replace(stripped_he, 'ك', 'ک')
    stripped_alef = tf.strings.regex_replace(stripped_ke, 'آ', 'ا')
    stripped_english = tf.strings.regex_replace(stripped_alef, '[a-zA-Z]', ' ')
    return tf.strings.regex_replace(stripped_english, '[%s]' % re.escape(string.punctuation), '')


In [88]:
max_feature = 20000
vectorize_layer = layers.TextVectorization(
    max_tokens=max_feature,
    standardize=custom_standardization,
    output_sequence_length=250)

In [89]:
import re
import string

AUTOTUNE = tf.data.experimental.AUTOTUNE
BUFFER_SIZE = 10000
BATCH_SIZE = 64

def get_text_with_target(path, target):
    target = tf.one_hot(target, depth=6, dtype='float32')
    content = tf.io.read_file(path)
    return content, target

def vectorize_text(text, target):
    text = vectorize_layer(text)
    return text, target

train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_target))
validation_dataset = tf.data.Dataset.from_tensor_slices((valid_data, valid_target))

train_dataset = train_dataset.map(get_text_with_target, num_parallel_calls=AUTOTUNE)
vectorize_layer.adapt(train_dataset.map(lambda t, l: t, num_parallel_calls=AUTOTUNE))
train_dataset = train_dataset.map(vectorize_text, num_parallel_calls=AUTOTUNE)

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, num_parallel_calls=AUTOTUNE, drop_remainder=True).prefetch(AUTOTUNE)
validation_dataset = validation_dataset.map(get_text_with_target, num_parallel_calls=AUTOTUNE).map(vectorize_text, num_parallel_calls=AUTOTUNE).cache().batch(BATCH_SIZE, num_parallel_calls=AUTOTUNE).prefetch(AUTOTUNE)

In [90]:
model = tf.keras.Sequential([
    layers.Embedding(input_dim=max_feature,
                     output_dim=250),
    layers.Bidirectional(layers.LSTM(128)),
    layers.Dense(128, activation='relu'),
    layers.Dense(len(classes), activation='softmax')
])

In [91]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 250)         5000000   
                                                                 
 bidirectional_3 (Bidirecti  (None, 256)               388096    
 onal)                                                           
                                                                 
 dense_8 (Dense)             (None, 128)               32896     
                                                                 
 dense_9 (Dense)             (None, 6)                 774       
                                                                 
Total params: 5421766 (20.68 MB)
Trainable params: 5421766 (20.68 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [92]:
model.compile(loss=CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
              metrics=[
                  tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
                  tf.keras.metrics.Precision(name='precision'),
                  tf.keras.metrics.Recall(name='recall'),
                  tf.keras.metrics.F1Score(name='f1-score')
              ])

In [93]:
train_iterator = train_dataset.as_numpy_iterator()

class_counts = {}
class_indices = {}

class_index = 0
for data, labels in train_iterator:
    for label in labels:
        label_tuple = tuple(label)
        if label_tuple not in class_counts:
            class_counts[label_tuple] = 1
            class_indices[label_tuple] = class_index
            class_index += 1
        else:
            class_counts[label_tuple] += 1

total_samples = sum(class_counts.values())

class_weights = {
    class_indices[label]: total_samples / count
    for label, count in class_counts.items()
}

for label, count in class_counts.items():
    print(f"Class {label}: {count} samples")

for class_index, weight in class_weights.items():
    print(f"Class {class_index} weight: {weight}")

Class (0.0, 0.0, 0.0, 1.0, 0.0, 0.0): 4993 samples
Class (0.0, 1.0, 0.0, 0.0, 0.0, 0.0): 17337 samples
Class (0.0, 0.0, 0.0, 0.0, 1.0, 0.0): 1527 samples
Class (0.0, 0.0, 1.0, 0.0, 0.0, 0.0): 5189 samples
Class (1.0, 0.0, 0.0, 0.0, 0.0, 0.0): 304 samples
Class (0.0, 0.0, 0.0, 0.0, 0.0, 1.0): 410 samples
Class 0 weight: 5.960344482275185
Class 1 weight: 1.7165599584703235
Class 2 weight: 19.48919449901768
Class 3 weight: 5.735209096164964
Class 4 weight: 97.89473684210526
Class 5 weight: 72.58536585365853


In [94]:
history = model.fit(train_dataset, epochs=20, validation_data=validation_dataset, class_weight=class_weights)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [95]:
predictions = model.predict(validation_dataset) 
predicted_classes = tf.math.argmax(predictions, axis=-1)
from sklearn.metrics import classification_report 
report = classification_report(valid_target, predicted_classes) 

print(report)

              precision    recall  f1-score   support

           0       0.93      0.56      0.70        72
           1       0.99      0.96      0.98      4382
           2       0.91      0.95      0.93      1243
           3       0.97      0.97      0.97      1269
           4       0.90      0.94      0.92       367
           5       0.48      0.85      0.61       108

    accuracy                           0.96      7441
   macro avg       0.86      0.87      0.85      7441
weighted avg       0.96      0.96      0.96      7441

