# Model

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split

<h3 style= "color:blue;"> Importing Data Set </h3>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# load your dataframe
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Research/Method 2 - Titles/preprocessed_title.csv",index_col=[0])

In [4]:
# change both types into string
df['Title'] = df['Title'].astype('str')

In [10]:
texts = df['Title'].values
labels = df['classes'].values

# Split the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.3, random_state=42)

In [11]:
# Initialize the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad sequences
max_length = 300  # maximum sequence length
train_data = pad_sequences(train_sequences, maxlen=max_length)
test_data = pad_sequences(test_sequences, maxlen=max_length)

In [12]:
vocab_size = len(tokenizer.word_index) + 1  # vocabulary size

model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(12, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
epochs = 10
batch_size = 64

model.fit(train_data, train_labels, epochs=epochs, batch_size=batch_size, validation_data=(test_data, test_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd7177d6aa0>

In [14]:
loss, accuracy = model.evaluate(test_data, test_labels)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy*100:.2f}%")

Test Loss: 2.1062
Test Accuracy: 27.69%


In [15]:
from sklearn.metrics import classification_report

# Assuming you have already trained your model and obtained predictions
predictions = model.predict(test_data)
predicted_labels = np.argmax(predictions, axis=1)

# Generate classification report
report = classification_report(test_labels, predicted_labels)

print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.00      0.00      0.00        12
           2       0.00      0.00      0.00         7
           3       0.18      0.07      0.11        27
           4       0.40      0.85      0.54        62
           5       0.17      0.07      0.10        29
           6       0.00      0.00      0.00        24
           7       0.00      0.00      0.00        13
           8       0.00      0.00      0.00         8
           9       0.00      0.00      0.00        16
          10       0.12      0.40      0.19        25
          11       0.00      0.00      0.00         9

    accuracy                           0.28       242
   macro avg       0.07      0.12      0.08       242
weighted avg       0.15      0.28      0.18       242



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
