<a href="https://colab.research.google.com/drive/1Lze6KKZxVwH_NLaU1opYwwEHsEks5EvG#scrollTo=i9AvFl1FmLER" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**BERT BASE MULTILINGUAL CASED**

In our text classification project, we used BERT (Bidirectional Encoder Representations from Transformers), specifically its 'bert-base-multilingual-cased' version, which is renowned for its advanced natural language processing capabilities. The model turns text into rich, contextualized embeddings that capture the intricate nuances of language. On top of BERT we added a small classification head: a dense layer for additional capacity, a dropout layer to reduce overfitting, and a final softmax layer that classifies texts into different difficulty levels.

This setup, built with TensorFlow, combines BERT's powerful language understanding with additional neural network layers tailored to our classification needs. The model was optimized with the Adam optimizer and trained with a small batch size to keep the computational demands of processing complex linguistic data manageable.

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
# Load the three project CSV files straight from the course repository.
# Rows containing missing values are dropped from every frame.
training_csv_url = "https://raw.githubusercontent.com/Oglo/Project-DSML/main/Data/training_data.csv"
unlabelled_csv_url = "https://raw.githubusercontent.com/Oglo/Project-DSML/main/Data/unlabelled_test_data.csv"
submission_csv_url = "https://raw.githubusercontent.com/Oglo/Project-DSML/main/Data/sample_submission.csv"

df_train = pd.read_csv(training_csv_url).dropna()
df_test = pd.read_csv(unlabelled_csv_url).dropna()
df_final = pd.read_csv(submission_csv_url).dropna()

In [None]:
!pip install transformers
import tensorflow as tf
print(tf.__version__)
import pandas as pd
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import numpy as np

# The tokenizer and the encoder are loaded from the same pretrained checkpoint
# so that the token ids produced by one match the vocabulary of the other.
checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
bert_model = TFBertModel.from_pretrained(checkpoint)

def encode_sentences(sentences, tokenizer, max_length=128):
    """Tokenize sentences into fixed-length id and attention-mask arrays.

    Every sentence is padded or truncated to exactly ``max_length`` tokens,
    so the result is always a rectangular 2-D array -- also when a single
    sentence (or none at all) is passed.

    Args:
        sentences: Iterable of strings (for example a list or a pandas
            Series).
        tokenizer: Callable tokenizer following the Hugging Face
            ``tokenizer(texts, ...)`` convention; it must return a mapping
            with ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: Number of tokens in every encoded sentence.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of ``int32`` NumPy arrays,
        each of shape ``(number_of_sentences, max_length)``.
    """
    texts = list(sentences)
    if not texts:
        # Nothing to tokenize: keep the 2-D contract instead of calling the
        # tokenizer with an empty batch.
        empty = np.zeros((0, max_length), dtype=np.int32)
        return empty, empty.copy()

    # One batched call replaces the per-sentence loop. ``truncation=True``
    # guarantees that over-long sentences cannot produce ragged rows, and
    # ``padding='max_length'`` is the current spelling of the deprecated
    # ``pad_to_max_length=True``.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='np',
    )

    # Cast explicitly so the arrays match the int32 Keras inputs they feed.
    input_ids = np.asarray(encoded['input_ids'], dtype=np.int32)
    attention_masks = np.asarray(encoded['attention_mask'], dtype=np.int32)
    return input_ids, attention_masks

# Refer to the loaded frames under task-oriented names.
training_data, unlabelled_test_data = df_train, df_test

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Hold out 20% of the labelled sentences for evaluation; the fixed seed keeps
# the split reproducible between runs.
sentences = training_data['sentence']
difficulties = training_data['difficulty']
X_train, X_test, y_train, y_test = train_test_split(
    sentences, difficulties, test_size=0.2, random_state=42
)

# Map the difficulty labels to integer class ids. The encoder is fitted on the
# training labels only and then reused for the held-out labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Tokenize both splits (training split first, then the held-out split).
(X_train_ids, X_train_masks), (X_test_ids, X_test_masks) = (
    encode_sentences(split, tokenizer) for split in (X_train, X_test)
)

# Build the classifier: BERT encoder followed by a small dense head.
# The input length (128) must match the max_length that encode_sentences
# pads/truncates to (its default is 128).
input_ids = Input(shape=(128,), dtype=tf.int32)
input_mask = Input(shape=(128,), dtype=tf.int32)

# Take the first element of the BERT output and keep only position 0 of every
# sequence as the sentence representation.
# NOTE(review): element 0 is presumably last_hidden_state and position 0 the
# [CLS] token -- confirm against the TFBertModel documentation.
embeddings = bert_model(input_ids, attention_mask=input_mask)[0][:, 0, :]
x = Dense(64, activation='relu')(embeddings)
x = Dropout(0.1)(x)

# One output unit per class known to the label encoder, so the softmax width
# always agrees with the integer labels used for training.
num_classes = len(label_encoder.classes_)
output = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=[input_ids, input_mask], outputs=output)
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
model.compile(
    optimizer=Adam(learning_rate=2e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fine-tune on the training split.
train_inputs = [X_train_ids, X_train_masks]
model.fit(train_inputs, y_train_encoded, epochs=3, batch_size=16)

# Measure loss and accuracy on the held-out split.
holdout_inputs = [X_test_ids, X_test_masks]
model.evaluate(holdout_inputs, y_test_encoded)

# Predict class probabilities for the unlabelled sentences, pick the most
# probable class per row and map the class ids back to the original labels.
unlabelled_ids, unlabelled_masks = encode_sentences(unlabelled_test_data['sentence'], tokenizer)
predictions = model.predict([unlabelled_ids, unlabelled_masks])
predicted_class_ids = np.argmax(predictions, axis=1)
predicted_levels = label_encoder.inverse_transform(predicted_class_ids)

# Store the predicted labels next to the sentences they belong to.
unlabelled_test_data['difficulty'] = predicted_levels