<a href="https://colab.research.google.com/github/ParsaHejabi/ComputationalIntelligence-ComputerAssignments/blob/main/FinalProject/CI_FinalProject_BiLSTM_MoreLayers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import dataset from Google Drive to Colab

In [None]:
# Remove stale local copies first. `-f` keeps this cell quiet on a fresh
# runtime, where the files do not exist yet and a plain `rm` would print
# "No such file or directory" errors.
!rm -f cleaned_train.csv
!rm -f cleaned_test.csv

# Copy the cleaned CSVs from Google Drive into the working directory.
# NOTE(review): assumes Drive is already mounted at ./drive — TODO confirm.
!cp drive/MyDrive/CI_FinalProject/cleaned_train.csv ./
!cp drive/MyDrive/CI_FinalProject/cleaned_test.csv ./

# Import required libraries

In [None]:
import pandas as pd
import numpy as np

import tensorflow as tf

import matplotlib.pyplot as plt

# Preprocessing

## Load cleaned train data

In [None]:
# Read only the article text and its category label from the cleaned training CSV.
train_data = pd.read_csv(
    'cleaned_train.csv',
    usecols=['clean_text', 'Category'],
)
# Integer-encode the category names; codes are assigned in order of first
# appearance, which matches the `label2id` mapping built further down.
train_data = train_data.assign(
    category_id=pd.factorize(train_data['Category'])[0]
)
train_data.head()

Unnamed: 0,Category,clean_text,category_id
0,Science and Culture,خبرنامه دانشگاه علم و صنعت ایران شماره یازدهم ...,0
1,Sport,تا پایان سال ۱۳۷۸ دهها زمین فوتبال و سالن ورزش...,1
2,Economy,انجمن تولیدکنندگان تجهیزات صنعت نفت تشکیل شد ن...,2
3,Miscellaneous.World News,کرتین برای سومین بار نخست وزیر کانادا شد ژان ک...,3
4,Sport,خداحافظ رفقا نمایندگان اروپای شرقی در جام ۲۰۰۲...,1


In [None]:
# The test set carries no labels: only the submission Id and the article text.
test_columns = ['Id', 'clean_text']
test_data = pd.read_csv('cleaned_test.csv', usecols=test_columns)
test_data.head()

Unnamed: 0,Id,clean_text
0,0,هفت اقلیم آلودگی هوا پکن را تهدید میکند باافزا...
1,1,گل و گیاه زعفران زینتی نام علمی: crocus banati...
2,2,یادداشت قانون بودجه و صنایع کوچک در شماره گذشت...
3,3,در سالروز میلاد حضرت مهدی همایش ادبی دانش آموز...
4,4,از ira تا فارک بوگوتا، پایتخت پرهرج ومرج کلمبی...


In [None]:
# Category names in order of first appearance in the training data.
unique_labels = train_data['Category'].unique()

# Forward (name -> integer id) and reverse (integer id -> name) lookups.
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'Science and Culture': 0, 'Sport': 1, 'Economy': 2, 'Miscellaneous.World News': 3, 'Miscellaneous.Urban': 4, 'Social.Women': 5, 'Social': 6, 'Literature and Art': 7, 'Politics': 8, 'Miscellaneous': 9, 'Economy.Bank and Bourse': 10, 'Politics.Iran Politics': 11, 'Tourism': 12, 'Social.Religion': 13, 'Miscellaneous.Picture': 14, 'Miscellaneous.Happenings': 15, 'Science and Culture.Science.Book': 16, 'Literature and Art.Art': 17, 'Miscellaneous.Islamic Councils': 18, 'Literature and Art.Art.Cinema': 19, 'Science and Culture.Science.Information and Communication Technology': 20, 'Economy.Oil': 21, 'Economy.Commerce': 22, 'Natural Environment': 23, 'Science and Culture.Science': 24, 'Economy.Industry': 25, 'Economy.Agriculture': 26, 'Sport.World Cup': 27, 'Miscellaneous.Picture.Caricature': 28, 'Literature and Art.Art.Music': 29, 'Literature and Art.Art.Theater': 30, 'Economy.Dwelling and Construction': 31, 'Science and Culture.Science.Medicine and Remedy': 32, 'Literature and Ar

In [None]:
# Select the model inputs (text) and targets (integer category ids).
# Plain column selection is used instead of `DataFrame.pop` so that
# `train_data` is left intact and this cell can be re-run without a KeyError.
train_data_text = train_data['clean_text']
train_data_label = train_data['category_id']

In [None]:
# Wrap the raw arrays in tf.data pipelines: (text, label) pairs for training,
# text only for the unlabeled test set.
train_features = train_data_text.to_numpy()
train_labels = train_data_label.to_numpy()
test_features = test_data['clean_text'].to_numpy()

train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices(test_features)

# Show the per-element structure of the training pipeline.
train_dataset.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [None]:
# Input-pipeline settings:
#   BUFFER_SIZE - size of the shuffle buffer applied to the training dataset
#   BATCH_SIZE  - number of examples per batch for both train and test datasets
BUFFER_SIZE, BATCH_SIZE = 10_000, 64

In [None]:
# Shuffle and batch the training data; the test data is only batched so that
# prediction order matches the row order of `test_data`.
# NOTE: both names are reassigned from themselves, so running this cell twice
# batches already-batched data; re-create the datasets before re-running it.
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    test_dataset
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Sanity check: show the first three texts and labels of one training batch.
for text_batch, label_batch in train_dataset.take(1):
    first_texts = text_batch.numpy()[:3]
    first_labels = label_batch.numpy()[:3]
    print('texts: ', first_texts)
    print()
    print('labels: ', first_labels)

texts:  [b'\xd8\xac\xd8\xa7\xdb\x8c\xd8\xb2\xd9\x87 \xd8\xac\xd9\x87\xd8\xa7\xd9\x86\xdb\x8c \xd9\x85\xd8\xad\xdb\x8c\xd8\xb7 \xd8\xb2\xdb\x8c\xd8\xb3\xd8\xaa \xd8\xa8\xd8\xb1\xd8\xa7\xdb\x8c \xdb\x8c\xda\xa9 \xd8\xb9\xda\xa9\xd8\xa7\xd8\xb3 \xd8\xa7\xdb\x8c\xd8\xb1\xd8\xa7\xd9\x86\xdb\x8c \xd9\x86\xd8\xb4\xd8\xa7\xd9\x86 \xd8\xa7\xd9\x81\xd8\xaa\xd8\xae\xd8\xa7\xd8\xb1 \xd9\x85\xd8\xb3\xd8\xa7\xd8\xa8\xd9\x82\xd9\x87 \xd8\xac\xd9\x87\xd8\xa7\xd9\x86\xdb\x8c \xd9\x85\xd8\xad\xdb\x8c\xd8\xb7 \xd8\xb2\xdb\x8c\xd8\xb3\xd8\xaa \xda\x98\xd8\xa7\xd9\xbe\xd9\x86 \xd8\xa8\xd9\x87 \xd8\xb9\xd9\x84\xdb\x8c \xd8\xa2\xd9\x84 \xd8\xad\xd8\xb3\xdb\x8c\xd9\x86\xdb\x8c \xd8\xb9\xda\xa9\xd8\xa7\xd8\xb3 \xd8\xa7\xdb\x8c\xd8\xb1\xd8\xa7\xd9\x86\xdb\x8c \xd8\xaa\xd8\xb9\xd9\x84\xd9\x82 \xda\xaf\xd8\xb1\xd9\x81\xd8\xaa. \xd8\xac\xd9\x88\xd8\xa7\xdb\x8c\xd8\xb2 \xd8\xa8\xd8\xb1\xda\xaf\xd8\xb2\xdb\x8c\xd8\xaf\xda\xaf\xd8\xa7\xd9\x86 \xd8\xa8\xd8\xae\xd8\xb4\xd9\x87\xd8\xa7\xdb\x8c \xd9\x85\xd8\xae\xd8\xaa\x

In [None]:
# Maximum vocabulary size kept by the text vectorizer (includes the padding
# and out-of-vocabulary tokens).
VOCAB_SIZE = 1000

# `TextVectorization` was promoted out of the experimental namespace; the
# `tf.keras.layers.experimental.preprocessing` path is deprecated. Prefer the
# stable location and fall back to the old one on older TensorFlow releases.
try:
    TextVectorization = tf.keras.layers.TextVectorization
except AttributeError:
    TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization

encoder = TextVectorization(max_tokens=VOCAB_SIZE)

# Learn the vocabulary from the training texts only (labels are dropped).
encoder.adapt(train_dataset.map(lambda text, label: text))

In [None]:
# Inspect the 20 leading entries of the learned vocabulary.
learned_tokens = encoder.get_vocabulary()
vocab = np.asarray(learned_tokens)
vocab[:20]

array(['', '[UNK]', 'و', 'در', 'به', 'از', 'که', 'این', 'را', 'است', 'با',
       'برای', 'آن', 'یک', 'خود', 'شده', 'کرد', 'شد', 'کشور', 'تا'],
      dtype='<U9')

In [None]:
# Size the output layer from the label mapping instead of a hard-coded 34 so
# the model stays consistent with whatever categories the training data holds.
NUM_CLASSES = len(label2id)

# Text classifier: raw string -> token ids -> embeddings -> two stacked
# bidirectional LSTMs -> dense head with one softmax output per category.
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Index 0 is the padding token, so mask it out in downstream layers.
        mask_zero=True),
    # The first recurrent layer must return the full sequence so that it can
    # feed the second recurrent layer.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')
])

In [None]:
# Predict on a short sample sentence batched together with a much longer
# filler text (one word repeated 2000 times), so the short sample is
# presumably padded up to the longer length by the vectorizer.
# The model has not been trained yet at this point, so the printed class
# probabilities are expected to be close to uniform.

sample_text = 'آمریکا خاک عراق را بمباران کرد.'
padding = "ورزش " * 2000

sample_batch = np.array([sample_text, padding])
predictions = model.predict(sample_batch)
print(predictions[0])

[0.02952371 0.02932904 0.0292026  0.02926029 0.02941853 0.0296001
 0.02922832 0.02943034 0.0292951  0.02943096 0.029283   0.02935884
 0.02947133 0.02931112 0.02919441 0.02941479 0.02931316 0.02946259
 0.029453   0.02938823 0.02947907 0.02959768 0.02937441 0.02961947
 0.0292345  0.02961018 0.02952806 0.02958603 0.02936085 0.02936303
 0.02947244 0.02944868 0.02951333 0.0294428 ]


In [None]:
# Labels are integer class ids (not one-hot), hence the sparse cross-entropy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the full training set; no validation split is used here.
EPOCHS = 10
history = model.fit(train_dataset, epochs=EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Predict class probabilities for every test article. Use the batched,
# prefetched `test_dataset` built earlier instead of re-materialising the text
# column as a NumPy array; the dataset is not shuffled, so the rows of
# `predictions` stay aligned with the rows of `test_data`.
predictions = model.predict(test_dataset)

In [None]:
# Class-probability vector predicted for the first test article.
first_prediction = predictions[0]
first_prediction

In [None]:
# Most probable category id for each test article.
predicted_ids = tf.argmax(predictions, axis=1).numpy()
print(predicted_ids)

[4 9 2 ... 3 9 2]


In [None]:
# Translate the predicted id of the first test article back to its category name.
first_category_id = tf.argmax(predictions, axis=1)[0].numpy()
id2label[first_category_id]

'Miscellaneous.Urban'

In [None]:
# Map every predicted category id to its human-readable category name.
predicted_ids = tf.argmax(predictions, axis=1).numpy()
y_pred_name = [id2label[cat_id] for cat_id in predicted_ids]

In [None]:
# Assemble the submission table: one predicted category name per test Id.
submission_columns = {"Id": test_data["Id"], "Category": y_pred_name}
submission = pd.DataFrame(submission_columns)

In [None]:
# Preview the first five rows of the submission.
submission.head(5)

Unnamed: 0,Id,Category
0,0,Miscellaneous.Urban
1,1,Miscellaneous
2,2,Economy
3,3,Science and Culture
4,4,Miscellaneous.World News


In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)