In [33]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import logging
import pickle
import numpy as np



In [34]:
import wandb
wandb.login(key="c26df6b59bfb128917e73bbb00a79ca7e9324a11")




True

In [35]:
class CustomModel(nn.Module):
    def __init__(self, base_model, num_labels):  # FIX: Correct __init__ method
        super(CustomModel, self).__init__()  # FIX: Correct super() call
        self.base_model = base_model 
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(base_model.config.hidden_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}


In [36]:
# Load data
train_path = "/kaggle/input/maqa-dataset/Unbalanced/Unbalanced/MAQA_Train.xlsx"
test_path = "/kaggle/input/maqa-dataset/Unbalanced/Unbalanced/MAQA_Test.xlsx"
train_df = pd.read_excel(train_path)
test_df = pd.read_excel(test_path)
all_data = pd.concat([train_df, test_df], ignore_index=True)
all_data = all_data[['q_body', 'category']]

valid_categories = [
    "ÿßŸÖÿ±ÿßÿ∂ ŸÜÿ≥ÿßÿ¶Ÿäÿ©",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿπÿ∂ŸÑÿßÿ™ ŸàÿßŸÑÿπÿ∏ÿßŸÖ Ÿà ÿßŸÑŸÖŸÅÿßÿµŸÑ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨Ÿáÿßÿ≤ ÿßŸÑŸáÿ∂ŸÖŸä",
    "ÿßŸÑÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨ŸÜÿ≥Ÿäÿ©",
    "ÿ∑ÿ® ÿßŸÑÿßÿ≥ŸÜÿßŸÜ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑŸÇŸÑÿ® Ÿà ÿßŸÑÿ¥ÿ±ÿßŸäŸäŸÜ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿπŸäŸàŸÜ",
    "ÿßŸÜŸÅ ÿßÿ∞ŸÜ Ÿàÿ≠ŸÜÿ¨ÿ±ÿ©",
    "ÿ¨ÿ±ÿßÿ≠ÿ© ÿ™ÿ¨ŸÖŸäŸÑ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿØŸÖ"
]

all_data = all_data[all_data["category"].isin(valid_categories)]
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)
train_df, test_df = train_test_split(all_data, test_size=0.2, random_state=42, stratify=all_data['category'])

In [37]:
train_df["text"] = train_df["q_body"]
test_df["text"] = test_df["q_body"]

category_mapping = {cat: i for i, cat in enumerate(valid_categories)}
train_df['label'] = train_df['category'].map(category_mapping)
test_df['label'] = test_df['category'].map(category_mapping)

model_name = "distilbert/distilbert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

train_dataset = Dataset.from_dict({"text": train_df["text"].tolist(), "label": train_df["label"].tolist()})
test_dataset = Dataset.from_dict({"text": test_df["text"].tolist(), "label": test_df["label"].tolist()})

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/159143 [00:00<?, ? examples/s]

Map:   0%|          | 0/39786 [00:00<?, ? examples/s]

In [38]:
base_model = AutoModel.from_pretrained(model_name)
model = CustomModel(base_model, 10)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=96,
    per_device_eval_batch_size=96,
    num_train_epochs=10,
    weight_decay=0.01,
    learning_rate=3e-5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [39]:
# Train
print("\nStarting training...")
trainer.train()



Starting training...




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3689,0.320089,0.900543,0.900174,0.900543,0.899722
2,0.3045,0.277979,0.911677,0.91187,0.911677,0.91159
3,0.2173,0.264536,0.917835,0.917085,0.917835,0.917142
4,0.2092,0.26193,0.920374,0.919984,0.920374,0.919767
5,0.149,0.263887,0.921882,0.921668,0.921882,0.921579
6,0.1277,0.270877,0.92329,0.923006,0.92329,0.922936
7,0.0981,0.277409,0.924722,0.923996,0.924722,0.924132
8,0.0961,0.285048,0.92422,0.923873,0.92422,0.923984
9,0.0905,0.289697,0.925074,0.924505,0.925074,0.924674
10,0.0743,0.292377,0.925225,0.924529,0.925225,0.924773




TrainOutput(global_step=8290, training_loss=0.19336676957811463, metrics={'train_runtime': 9658.7492, 'train_samples_per_second': 164.766, 'train_steps_per_second': 0.858, 'total_flos': 0.0, 'train_loss': 0.19336676957811463, 'epoch': 10.0})

In [40]:
# Evaluate
train_metrics = trainer.evaluate(train_dataset)
print("\Train Metrics:", train_metrics)



\Train Metrics: {'eval_loss': 0.03658333793282509, 'eval_accuracy': 0.9905619474309268, 'eval_precision': 0.9905316358052546, 'eval_recall': 0.9905619474309268, 'eval_f1': 0.9905201535510255, 'eval_runtime': 328.2409, 'eval_samples_per_second': 484.836, 'eval_steps_per_second': 2.526, 'epoch': 10.0}


In [41]:
test_metrics = trainer.evaluate(test_dataset)

print("\nTest Metrics:", test_metrics)




Test Metrics: {'eval_loss': 0.29237672686576843, 'eval_accuracy': 0.9252249535012316, 'eval_precision': 0.9245294639008788, 'eval_recall': 0.9252249535012316, 'eval_f1': 0.924772673176017, 'eval_runtime': 82.5095, 'eval_samples_per_second': 482.199, 'eval_steps_per_second': 2.521, 'epoch': 10.0}


In [42]:
# Get predictions
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

print("\nConfusion Matrix:")
print(confusion_matrix(labels, preds))

print("\nClassification Report:")
print(classification_report(labels, preds))




Confusion Matrix:
[[13529    56    67   240    33    27     5    22    37    16]
 [   66  3453    29    24    16    73    11    18    13     9]
 [  101    45  2793    17    28    76    12    60    18    27]
 [  386    24    33  1744     2     6     5     5     8     6]
 [   19    10     4     2  2133     1     0    27     4     1]
 [   41    81    62    13     3  2786    10    51     8   134]
 [    6     5    11     2     7    12  3557    27    17    16]
 [   25    26    50     8    32    30    31  3651    37    22]
 [   31    12    17    11    11    15    36    38  1765    33]
 [   24    15    33    18     6   119    24    39    37  1400]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96     14032
           1       0.93      0.93      0.93      3712
           2       0.90      0.88      0.89      3177
           3       0.84      0.79      0.81      2219
           4       0.94      0.97      0.95      2201
 

In [43]:
def save_complete_model(model, tokenizer, category_mapping, save_path):
    model.base_model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    
    classifier_state = {
        'classifier_state': model.classifier.state_dict(),
        'num_labels': model.classifier.out_features
    }
    torch.save(classifier_state, f"{save_path}/classifier_state.pt")
    
    with open(f"{save_path}/category_mapping.pkl", "wb") as f:
        pickle.dump(category_mapping, f)

# Save the model
save_complete_model(trainer.model, tokenizer, category_mapping, "arabic_text_classifier_final")

In [44]:
import torch
from transformers import AutoModel, AutoTokenizer

# Define the function to load the complete model
def load_complete_model(model_path):
    base_model = AutoModel.from_pretrained(model_path)

    # Load classifier weights
    classifier_state = torch.load(f"{model_path}/classifier_state.pt", map_location=torch.device('cpu'))

    # Recreate the model
    model = CustomModel(base_model, classifier_state['num_labels'])

    # Load classifier weights into the model
    model.classifier.load_state_dict(classifier_state['classifier_state'])

    # Set to evaluation mode
    model.eval()
    return model

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("arabic_text_classifier_final")

# Load the model
model2 = load_complete_model("arabic_text_classifier_final")

print("Model loaded successfully!")


Model loaded successfully!


  classifier_state = torch.load(f"{model_path}/classifier_state.pt", map_location=torch.device('cpu'))


In [45]:
import torch
import numpy as np

# Function to make predictions
def predict_category(text, model, tokenizer, category_mapping):
    # Tokenize the input text
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    # Move inputs to model
    with torch.no_grad():  # No need for gradients during inference
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
    
    # Get the predicted class
    logits = outputs["logits"]
    predicted_label = torch.argmax(logits, dim=-1).item()

    # Reverse mapping from index to category
    category_mapping_reverse = {v: k for k, v in category_mapping.items()}
    predicted_category = category_mapping_reverse[predicted_label]

    return predicted_category


In [46]:
import pickle

# Load the category mapping
with open("arabic_text_classifier_final/category_mapping.pkl", "rb") as f:
    category_mapping = pickle.load(f)


In [53]:
# Example text
text = "ÿ®ÿ±ÿØ ÿ¥ÿØŸäÿØ ŸÅŸä ÿßŸÑŸÖÿπÿØŸá"

# Predict category
predicted_category = predict_category(text, model2, tokenizer, category_mapping)

# Print result
print(f"Predicted Category: {predicted_category}")


Predicted Category: ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨Ÿáÿßÿ≤ ÿßŸÑŸáÿ∂ŸÖŸä


In [48]:
import shutil

# Define the folder and output zip file
folder_path = "arabic_text_classifier_final"
zip_file_name = "arabic_text_classifier_final.zip"

# Create a zip archive
shutil.make_archive(zip_file_name.replace(".zip", ""), 'zip', folder_path)

print(f"‚úÖ Folder {folder_path} compressed successfully as {zip_file_name}!")


‚úÖ Folder arabic_text_classifier_final compressed successfully as arabic_text_classifier_final.zip!


In [49]:
from google.colab import files

# Download the zip file
files.download(zip_file_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [50]:
import shutil

# Zip the folder
shutil.make_archive("arabic_text_classifier_final", 'zip', "arabic_text_classifier_final")

# Print the file path
print("Download your file from: /kaggle/working/arabic_text_classifier_final.zip")


Download your file from: /kaggle/working/arabic_text_classifier_final.zip


In [51]:
from google.colab import files

# Download the zip file
files.download(zip_file_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [52]:
from IPython.display import FileLink
FileLink(r'arabic_text_classifier_final.zip')
