In [1]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import logging
import pickle
import numpy as np



In [2]:
class CustomModel(nn.Module):
    """Sequence classifier: a pretrained encoder followed by dropout and a linear head.

    The final hidden state of the first token is used as the sequence representation.

    Args:
        base_model: Encoder module. It must expose ``config.hidden_size`` and, when called
            with ``input_ids`` and ``attention_mask``, return an object that has a
            ``last_hidden_state`` tensor indexed as (batch, token, hidden).
        num_labels: Number of output classes.
        dropout_prob: Dropout probability applied before the linear head. Defaults to 0.3.
    """

    def __init__(self, base_model, num_labels, dropout_prob=0.3):
        super().__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(base_model.config.hidden_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute class logits and, when ``labels`` is given, the cross-entropy loss.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional target class indices.

        Returns:
            dict with keys ``"loss"`` (``None`` when ``labels`` is ``None``) and ``"logits"``.
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Keep only the first token's vector for every sequence in the batch.
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}


In [3]:
# Load data
# Read the provided train/test workbooks, pool them, keep ten categories, then re-split 80/20.
# NOTE(review): the provided train/test partition is discarded by the pooling below and the
# names train_df/test_df are reused for the new split — confirm this is intended.
train_path = "/kaggle/input/maqa-dataset/Unbalanced/Unbalanced/MAQA_Train.xlsx"
test_path = "/kaggle/input/maqa-dataset/Unbalanced/Unbalanced/MAQA_Test.xlsx"
train_df = pd.read_excel(train_path)
test_df = pd.read_excel(test_path)
all_data = pd.concat([train_df, test_df], ignore_index=True)
# Only these two columns are used downstream (presumably question text and its category).
all_data = all_data[['q_body', 'category']]

# Categories kept for classification. The list order matters: a later cell enumerates this
# list to assign integer label ids.
# NOTE(review): these literals look mis-encoded in this export (likely UTF-8 text decoded
# with a legacy code page) — confirm they match the values stored in the 'category' column.
valid_categories = [
    "ÿßŸÖÿ±ÿßÿ∂ ŸÜÿ≥ÿßÿ¶Ÿäÿ©",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿπÿ∂ŸÑÿßÿ™ ŸàÿßŸÑÿπÿ∏ÿßŸÖ Ÿà ÿßŸÑŸÖŸÅÿßÿµŸÑ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨Ÿáÿßÿ≤ ÿßŸÑŸáÿ∂ŸÖŸä",
    "ÿßŸÑÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨ŸÜÿ≥Ÿäÿ©",
    "ÿ∑ÿ® ÿßŸÑÿßÿ≥ŸÜÿßŸÜ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑŸÇŸÑÿ® Ÿà ÿßŸÑÿ¥ÿ±ÿßŸäŸäŸÜ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿπŸäŸàŸÜ",
    "ÿßŸÜŸÅ ÿßÿ∞ŸÜ Ÿàÿ≠ŸÜÿ¨ÿ±ÿ©",
    "ÿ¨ÿ±ÿßÿ≠ÿ© ÿ™ÿ¨ŸÖŸäŸÑ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿØŸÖ"
]

# Drop every row whose category is not in the list above.
all_data = all_data[all_data["category"].isin(valid_categories)]
# Seeded shuffle followed by a seeded, category-stratified 80/20 split.
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)
train_df, test_df = train_test_split(all_data, test_size=0.2, random_state=42, stratify=all_data['category'])

In [4]:
# Integer label ids follow the order of ``valid_categories``.
category_mapping = {cat: i for i, cat in enumerate(valid_categories)}

# Derive the model columns on new frames with ``assign`` rather than writing columns onto the
# split results in place: this avoids pandas' SettingWithCopyWarning on frames produced by
# indexing and keeps the cell idempotent when re-run.
train_df = train_df.assign(text=train_df["q_body"], label=train_df["category"].map(category_mapping))
test_df = test_df.assign(text=test_df["q_body"], label=test_df["category"].map(category_mapping))

# A category missing from the mapping would surface as NaN here; fail early instead of
# handing a broken label to the trainer.
assert train_df["label"].notna().all() and test_df["label"].notna().all(), "unmapped category"

model_name = "bert-base-multilingual-cased"
MAX_SEQ_LENGTH = 128  # every example is truncated/padded to this many tokens

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch's ``text`` field, truncating and padding to ``MAX_SEQ_LENGTH`` tokens."""
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=MAX_SEQ_LENGTH)

train_dataset = Dataset.from_dict({"text": train_df["text"].tolist(), "label": train_df["label"].tolist()})
test_dataset = Dataset.from_dict({"text": test_df["text"].tolist(), "label": test_df["label"].tolist()})

# Batched mapping adds the tokenizer's output columns to each dataset.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/159143 [00:00<?, ? examples/s]

Map:   0%|          | 0/39786 [00:00<?, ? examples/s]

In [6]:
# Pretrained encoder wrapped with the classification head defined in CustomModel.
base_model = AutoModel.from_pretrained(model_name)
# Size the head from the label mapping instead of a literal so the class count cannot
# drift from the labels actually produced for the datasets.
model = CustomModel(base_model, len(category_mapping))

# Evaluate and checkpoint once per epoch, keep a single checkpoint on disk, and reload the
# checkpoint with the best "accuracy" when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=96,
    per_device_eval_batch_size=96,
    num_train_epochs=10,
    weight_decay=0.01,
    learning_rate=3e-5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"  # key returned by compute_metrics
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the arg-max over the
            last axis of ``logits``.

    Returns:
        dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # Arg-max directly with NumPy (as the predictions cell does) instead of copying the
    # logits into a torch tensor first.
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 keeps the score at 0 for a class that is never predicted, without
    # emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Wire the model, arguments, datasets and metric function into a Trainer.
# NOTE(review): eval_dataset is the test split, and training_args reloads the best
# checkpoint by "accuracy" on it, so the test split also acts as the model-selection set.
# Consider a separate validation split if an unbiased test score is needed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

  trainer = Trainer(


In [8]:
# Train
# Runs the fine-tuning loop configured on `trainer`; the value returned by train() is the
# cell's displayed result.
print("\nStarting training...")
trainer.train()



Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3136,0.305495,0.908234,0.908522,0.908234,0.907673
2,0.2568,0.271403,0.916101,0.915336,0.916101,0.915218
3,0.2174,0.262756,0.920047,0.919482,0.920047,0.919249
4,0.1822,0.260527,0.921304,0.921504,0.921304,0.921325
5,0.1494,0.271456,0.923968,0.923638,0.923968,0.923421
6,0.1008,0.293499,0.925476,0.924774,0.925476,0.924706
7,0.0776,0.31536,0.925652,0.925248,0.925652,0.925315
8,0.0769,0.336829,0.925401,0.925098,0.925401,0.925149
9,0.0429,0.365252,0.926783,0.926325,0.926783,0.926421
10,0.0245,0.377646,0.926431,0.925893,0.926431,0.926096


TrainOutput(global_step=16580, training_loss=0.15933053550210446, metrics={'train_runtime': 19441.6522, 'train_samples_per_second': 81.857, 'train_steps_per_second': 0.853, 'total_flos': 0.0, 'train_loss': 0.15933053550210446, 'epoch': 10.0})

In [9]:
# Evaluate on the training split to see how closely the model fits data it was trained on.
train_metrics = trainer.evaluate(train_dataset)
# "\T" is not a valid escape sequence; a leading newline was intended, matching the
# "\nTest Metrics:" print in the following cell.
print("\nTrain Metrics:", train_metrics)

\Train Metrics: {'eval_loss': 0.016327695921063423, 'eval_accuracy': 0.9961041327611017, 'eval_precision': 0.9961060475960296, 'eval_recall': 0.9961041327611017, 'eval_f1': 0.9961030414344039, 'eval_runtime': 587.4023, 'eval_samples_per_second': 270.927, 'eval_steps_per_second': 2.823, 'epoch': 10.0}


In [10]:
# Evaluate on the held-out split (the same dataset passed to the Trainer as eval_dataset).
test_metrics = trainer.evaluate(test_dataset)

print("\nTest Metrics:", test_metrics)


Test Metrics: {'eval_loss': 0.3652516007423401, 'eval_accuracy': 0.9267832906047353, 'eval_precision': 0.9263253547977085, 'eval_recall': 0.9267832906047353, 'eval_f1': 0.9264208214631784, 'eval_runtime': 147.0501, 'eval_samples_per_second': 270.561, 'eval_steps_per_second': 2.822, 'epoch': 10.0}


In [11]:
# Run inference over the held-out split, then reduce each row of scores to a class index.
predictions = trainer.predict(test_dataset)
labels = predictions.label_ids
preds = np.argmax(predictions.predictions, axis=1)

# Confusion matrix as built from (true labels, predicted labels).
print("\nConfusion Matrix:", confusion_matrix(labels, preds), sep="\n")

# Per-class precision / recall / F1 summary.
print("\nClassification Report:", classification_report(labels, preds), sep="\n")


Confusion Matrix:
[[13494    54    72   257    28    36     9    32    34    16]
 [   60  3434    33    21    14    87    12    30    13     8]
 [   98    38  2810    16    22    83     9    69    12    20]
 [  353    30    29  1770     5     9     5    10     4     4]
 [   13    12    14     3  2127     2     2    26     2     0]
 [   28    66    70    12     1  2834     6    43     5   124]
 [    6     5    10     1     1    12  3567    29    18    11]
 [   18    17    48     5    28    35    27  3695    30     9]
 [   28    16    16     7     9    19    38    39  1775    22]
 [   23    15    29    19     2   160    23    39    38  1367]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     14032
           1       0.93      0.93      0.93      3712
           2       0.90      0.88      0.89      3177
           3       0.84      0.80      0.82      2219
           4       0.95      0.97      0.96      2201
 

In [5]:
import os

import wandb

# Never commit an API key in a notebook. Read it from the environment (for example a
# WANDB_API_KEY secret exposed to the kernel); with no key set, wandb falls back to its own
# credential lookup.
# SECURITY: the key that used to be hard-coded in this cell has been exposed and should be
# revoked and regenerated in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrokaia-emad[0m ([33mrokaia-emad-modern-sciences-and-arts-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [12]:
def save_complete_model(model, tokenizer, category_mapping, save_path):
    """Write everything needed to rebuild the classifier into ``save_path``.

    Saved artefacts:
      * the encoder (``model.base_model``) and the tokenizer, via their ``save_pretrained``;
      * ``classifier_state.pt`` with the linear head's state dict and its output size;
      * ``category_mapping.pkl`` with the pickled ``category_mapping``.
    """
    # Encoder first, tokenizer second; both write into the same directory.
    for component in (model.base_model, tokenizer):
        component.save_pretrained(save_path)

    head = model.classifier
    torch.save(
        {'classifier_state': head.state_dict(), 'num_labels': head.out_features},
        f"{save_path}/classifier_state.pt",
    )

    mapping_file = f"{save_path}/category_mapping.pkl"
    with open(mapping_file, "wb") as handle:
        pickle.dump(category_mapping, handle)

# Save the model
# Persists the Trainer's current model, the tokenizer and the label mapping into ./mBert.
save_complete_model(trainer.model, tokenizer, category_mapping, "mBert")

In [13]:
import torch
from transformers import AutoModel, AutoTokenizer

# Define the function to load the complete model
def load_complete_model(model_path):
    """Rebuild a ``CustomModel`` from a directory written by ``save_complete_model``.

    Args:
        model_path: Directory holding the saved encoder and ``classifier_state.pt``.

    Returns:
        The reconstructed model with the saved classifier weights loaded, in evaluation mode.
    """
    base_model = AutoModel.from_pretrained(model_path)

    # The checkpoint holds only tensors and an int, so restrict unpickling to plain
    # weights/containers: torch.load otherwise executes arbitrary pickled code.
    classifier_state = torch.load(
        f"{model_path}/classifier_state.pt",
        map_location=torch.device('cpu'),
        weights_only=True,
    )

    # Recreate the wrapper with a head of the saved output size.
    model = CustomModel(base_model, classifier_state['num_labels'])

    # Restore the trained head; the encoder weights were already loaded by from_pretrained.
    model.classifier.load_state_dict(classifier_state['classifier_state'])

    # Evaluation mode disables dropout for inference.
    model.eval()
    return model

# Load tokenizer
# Rebinds `tokenizer` to the copy saved under ./mBert.
tokenizer = AutoTokenizer.from_pretrained("mBert")

# Load the model
# Kept under a separate name (model2) so the in-memory training model is not overwritten.
model2 = load_complete_model("mBert")

print("Model loaded successfully!")


Model loaded successfully!


  classifier_state = torch.load(f"{model_path}/classifier_state.pt", map_location=torch.device('cpu'))


In [14]:
import torch
import numpy as np

# Function to make predictions
def predict_category(text, model, tokenizer, category_mapping):
    """Predict the category name for a single text.

    Args:
        text: Input text to classify.
        model: Callable taking ``input_ids`` and ``attention_mask`` and returning a mapping
            with a ``"logits"`` entry. It is not switched to evaluation mode here; the
            caller is expected to have done so.
        tokenizer: Callable that turns ``text`` into ``input_ids`` / ``attention_mask`` tensors.
        category_mapping: Mapping from category name to integer label id.

    Returns:
        The category name whose label id has the highest logit.
    """
    # Tokenize the input text
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Move the inputs to the device holding the model's weights; without this a model
    # living on GPU fails with a device mismatch. Parameter-less callables are left as-is.
    first_param = next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
        attention_mask = attention_mask.to(first_param.device)

    with torch.no_grad():  # No need for gradients during inference
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Get the predicted class
    logits = outputs["logits"]
    predicted_label = torch.argmax(logits, dim=-1).item()

    # Reverse mapping from index to category
    category_mapping_reverse = {v: k for k, v in category_mapping.items()}
    predicted_category = category_mapping_reverse[predicted_label]

    return predicted_category


In [15]:
import pickle

# Load the category mapping
# Rebinds `category_mapping` from the pickle written by save_complete_model.
# NOTE: pickle.load can execute arbitrary code; only open mapping files you produced yourself.
with open("mBert/category_mapping.pkl", "rb") as f:
    category_mapping = pickle.load(f)


In [19]:
# Example text
# NOTE(review): this literal looks mis-encoded in this export — confirm against the original notebook.
text = "ŸÜŸÇÿµ ÿ≠ÿØŸäÿØ"

# Predict category
# Uses the reloaded model (model2) together with the reloaded tokenizer and mapping.
predicted_category = predict_category(text, model2, tokenizer, category_mapping)

# Print result
print(f"Predicted Category: {predicted_category}")


Predicted Category: ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿØŸÖ


In [20]:
import shutil

# Define the folder and output zip file
folder_path = "mBert"
zip_file_name = "mBert.zip"

# Create a zip archive
# make_archive receives the name with ".zip" stripped and the 'zip' format, producing mBert.zip.
shutil.make_archive(zip_file_name.replace(".zip", ""), 'zip', folder_path)

print(f"‚úÖ Folder {folder_path} compressed successfully as {zip_file_name}!")


‚úÖ Folder mBert compressed successfully as mBert.zip!


In [26]:
from google.colab import files

# Download the zip file
# NOTE(review): this uses the google.colab download helper although the notebook's data paths
# are under /kaggle, and the same call is repeated in a later cell — keep one download route.
files.download(zip_file_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
import shutil

# Zip the folder
# NOTE(review): this repeats the archive step of the earlier zip cell with the same source
# folder and archive name — one of the two cells can be removed.
shutil.make_archive("mBert", 'zip', "mBert")

# Print the file path
print("Download your file from: /kaggle/working/mBert.zip")


Download your file from: /kaggle/working/mBert.zip


In [28]:
from google.colab import files

# Download the zip file
# NOTE(review): identical to the earlier google.colab download cell; it relies on
# `zip_file_name` set in the zip cell — duplicate, candidate for removal.
files.download(zip_file_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
from IPython.display import FileLink
# Display a link to the archive as this cell's output (path is relative to the working directory).
FileLink(r'mBert.zip')
