# Model

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_datasets as tfds
from keras.utils import to_categorical
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.metrics import classification_report

<h3 style= "color:blue;"> Importing Data Set </h3>

In [4]:
# Read the preprocessed corpus; the first CSV column is used as the row index
df = pd.read_csv(
    "Algorithms/Preprocessed.csv",
    index_col=[0],
)

In [5]:
# Cast the text column and the label column to strings in one call
df = df.astype({'content': 'str', 'fileclass': 'str'})

In [6]:
# Keep only the first row for every distinct `content` value
df = df[~df.duplicated(subset=['content'], keep="first")]

<b>Before moving ahead, let's convert the fileclass labels into integer format, as it will make our work easier.</b>

In [9]:
# Convert fileclass labels into integer class ids
classes={'ChildProtection':0,'Cybersecurity':1,'DataPrivacy':2,'DataSystemsDevelopment':3,
         'DigitalFinance':4,'DigitalInclusion':5,'DigitalInformatioServices':6,
         'DigitalInfrastructure':7,'DigitalLiteracy':8,'DigitalServices':9,'Egovernment':10,'Upskilling':11
         }

# Map the fileclass values. `Series.map` returns NaN for any label that is not
# a key of `classes`, which would silently turn the column into floats and
# push NaN labels into training, so fail loudly instead.
mapped_classes = df['fileclass'].map(classes)
unmapped_labels = df.loc[mapped_classes.isna(), 'fileclass'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"fileclass values missing from `classes`: {sorted(map(str, unmapped_labels))}"
    )

# Store the ids as integers and drop the original fileclass column.
# Building a new frame (instead of inplace=True) avoids mutating a frame
# that was derived from an earlier filtering step.
df = df.assign(classes=mapped_classes.astype('int64')).drop(columns=['fileclass'])

In [10]:
# Split the `content` texts and their `classes` labels (taken from the
# DataFrame `df`) into training and testing sets: 70% train / 30% test
train_data, test_data, train_labels, test_labels = train_test_split(
    df['content'].tolist(),
    df['classes'].tolist(),
    test_size=0.3,
    random_state=42,
)

In [11]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the training data, then the testing data, with the
# same settings (truncate / pad, at most 300 tokens, TensorFlow tensors)
train_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=300, return_tensors='tf')
    for texts in (train_data, test_data)
)

In [13]:
# Split the `content` texts and `classes` labels from the DataFrame `df`
# into training and testing sets (80% train / 20% test).
# Note: this re-creates train_data / test_data / train_labels / test_labels,
# replacing the values produced by any earlier split of `df`.
train_data, test_data, train_labels, test_labels = train_test_split(
    df['content'].tolist(),
    df['classes'].tolist(),
    test_size=0.2,
    random_state=42,
)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the training data
train_encodings = tokenizer(train_data, truncation=True, padding=True, max_length=100, return_tensors='tf')

# Tokenize and encode the testing data
test_encodings = tokenizer(test_data, truncation=True, padding=True, max_length=100, return_tensors='tf')

# Convert the labels to TensorFlow tensors
train_labels = tf.convert_to_tensor(train_labels)
test_labels = tf.convert_to_tensor(test_labels)

# Seed Python, NumPy and TensorFlow before the model is built. The
# classification head is newly initialized (not loaded from the checkpoint)
# and the model contains dropout, so without a seed every run trains a
# different model and the reported metrics are not reproducible.
tf.keras.utils.set_random_seed(42)

# Load the pre-trained BERT model with a 12-way classification head
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=12)

# Compile the model; the loss works on raw logits and integer class ids
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])




All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Print the model's layers and their parameter counts
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  9228      
                                                                 
Total params: 109491468 (417.68 MB)
Trainable params: 109491468 (417.68 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Fine-tune the model on the training data.
# Pass the whole tokenizer output as a dict rather than only `input_ids`:
# the texts are padded to a common length, and without the attention mask
# BERT would attend to the padding tokens as if they were real input.
model.fit(dict(train_encodings), train_labels, epochs=10, batch_size=64)

# Make predictions on the test data (same inputs as in training: ids + mask)
# and take the highest-scoring class per example
predicted_labels = model.predict(dict(test_encodings)).logits.argmax(axis=1)

# Generate classification report; convert the labels to a NumPy array so
# scikit-learn receives a plain array regardless of how they are stored
report = classification_report(np.asarray(test_labels), predicted_labels)

print(report)