<a href="https://colab.research.google.com/github/Tabook22/AI/blob/main/WSD12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install torch transformers pandas scikit-learn arabert

Collecting arabert
  Downloading arabert-1.0.1-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.

In [None]:
# Import required modules
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import json
import pandas as pd
from sklearn.metrics import accuracy_score
from transformers import BertTokenizerFast, BertForSequenceClassification
from arabert.preprocess import ArabertPreprocessor

In [None]:
# Function to load JSON data
def load_data(file_path):
    """Read the UTF-8 encoded JSON file at ``file_path`` and return the parsed content."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)


In [None]:
def create_gloss_dictionary(wsd_data):
    """Build a ``{gloss_id: gloss}`` lookup from an iterable of sense records.

    Every record must provide the keys ``'gloss_id'`` and ``'gloss'``. When a
    gloss ID occurs more than once, the last record seen wins.
    """
    gloss_dict = {}
    for entry in wsd_data:
        sense_id = entry['gloss_id']
        gloss_dict[sense_id] = entry['gloss']
    return gloss_dict


In [None]:
# Load data: the training split, the test split and the sense inventory
train_data, test_data, wsd_data = map(
    load_data, ('train.json', 'test.json', 'WSD_dict.json')
)

# Map every gloss ID of the sense inventory to its gloss text
gloss_dict = create_gloss_dictionary(wsd_data)

In [None]:
# Convert the gloss dictionary (already built when the data was loaded) to a
# Pandas DataFrame indexed by gloss ID, with the text in a single 'Gloss' column
gloss_df = pd.DataFrame.from_dict(gloss_dict, orient='index', columns=['Gloss'])

# Print the first 10 rows of the DataFrame
print(gloss_df.head(10))

                                                       Gloss
gloss.200                        ما يواجَه من عقبات أو أخطار
gloss.201  مباراة رياضيّة تتيح للفائز الاحتفاظ بشيء إلى أ...
gloss.202  نظريّة في فلسفة التاريخ مؤدّاها: أنّ الحضارة ت...
gloss.203                    مشكلة أو أُحجيّة محفِّزة للعقل.
gloss.204                       امتحان لقدرات المرء وطاقاتِه
gloss.205  كُلُّ وسيلة لتبادل المشاعر والأفكار كالإشارات ...
gloss.206            أصواتٌ يعبِّر بها كُلُّ قومٍ عن أغراضهم
gloss.207  اللُّغة المتداولة بين النَّاس، وهي بخلاف اللُّ...
gloss.208  لغة خاصّة بأهل حرفة أو طبقة لا يفهمها غير أفرا...
gloss.209  من يتكلّم لغتين على مستوى واحد سواء أكان فردًا...


In [None]:
# Get the unique labels from the dataset: the numeric suffix of every
# 'gloss_id' found in the train and test items (a set drops duplicates)
unique_labels = {
    int(record['gloss_id'].split('.')[-1])
    for split in (train_data, test_data)
    for record in split
    if 'gloss_id' in record
}

In [None]:
# Tabular view of the gloss dictionary with the ID as an ordinary column.
# gloss_dict was already built when the data was loaded, so it is reused here
# instead of being rebuilt from wsd_data.
df = pd.DataFrame(list(gloss_dict.items()), columns=['Gloss ID', 'Gloss'])

# Display the first 10 items in tabular format
print(df.head(10))

    Gloss ID                                              Gloss
0  gloss.200                        ما يواجَه من عقبات أو أخطار
1  gloss.201  مباراة رياضيّة تتيح للفائز الاحتفاظ بشيء إلى أ...
2  gloss.202  نظريّة في فلسفة التاريخ مؤدّاها: أنّ الحضارة ت...
3  gloss.203                    مشكلة أو أُحجيّة محفِّزة للعقل.
4  gloss.204                       امتحان لقدرات المرء وطاقاتِه
5  gloss.205  كُلُّ وسيلة لتبادل المشاعر والأفكار كالإشارات ...
6  gloss.206            أصواتٌ يعبِّر بها كُلُّ قومٍ عن أغراضهم
7  gloss.207  اللُّغة المتداولة بين النَّاس، وهي بخلاف اللُّ...
8  gloss.208  لغة خاصّة بأهل حرفة أو طبقة لا يفهمها غير أفرا...
9  gloss.209  من يتكلّم لغتين على مستوى واحد سواء أكان فردًا...


In [None]:
# Create a label mapping dictionary: each raw label is assigned its rank in
# ascending order, giving contiguous class indices starting at 0
label_mapping = dict(zip(sorted(unique_labels), range(len(unique_labels))))

In [None]:
# Define a PyTorch dataset for WSD
class WSDataset(Dataset):
    """Dataset that turns WSD items into tokenized tensors with class labels.

    Args:
        data: Indexable collection of items. Every item provides a
            ``'context'`` string and optionally a ``'gloss_id'`` string whose
            part after the last ``'.'`` is an integer label.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'``.
        gloss_dict: Mapping from gloss ID to gloss text.
        max_len: Length every encoded sequence is padded / truncated to.
        label_mapping: Optional mapping from raw integer label to class index.
            When omitted, the notebook-level ``label_mapping`` variable is
            looked up each time an item is accessed.
    """

    def __init__(self, data, tokenizer, gloss_dict, max_len=128, label_mapping=None):
        self.data = data
        self.tokenizer = tokenizer
        self.gloss_dict = gloss_dict
        self.max_len = max_len
        # Kept on the instance so the dataset does not have to rely on
        # whatever the notebook namespace happens to contain.
        self.label_mapping = label_mapping

    def __len__(self):
        """Return the number of items in the dataset."""
        return len(self.data)

    def _class_index(self, gloss_id):
        """Map a gloss ID to its class index, or -1 when the label is unknown."""
        mapping = self.label_mapping
        if mapping is None:
            mapping = label_mapping  # fall back to the notebook-level mapping
        raw_label = int(gloss_id.split('.')[-1])
        return mapping.get(raw_label, -1)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``.

        ``labels`` is -1 when the item has no ``'gloss_id'`` or its label is
        missing from the label mapping.
        """
        item = self.data[idx]
        context = item['context']

        if 'gloss_id' in item:
            gloss_id = item['gloss_id']
            # A gloss ID without an entry in gloss_dict contributes an empty gloss.
            gloss = self.gloss_dict.get(gloss_id, '')
            # NOTE(review): the appended gloss belongs to the target label
            # itself, so the input carries information about the label -
            # confirm this is the intended task formulation.
            input_text = context + " [SEP] " + gloss
            label = self._class_index(gloss_id)
        else:
            input_text = context
            label = -1  # sentinel for items without 'gloss_id'

        encoded = self.tokenizer.encode_plus(
            input_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True
        )
        return {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Get the number of unique labels.
# unique_labels was already collected from train_data and test_data in an
# earlier cell, so it is reused here instead of re-scanning both datasets.
num_labels = len(unique_labels)
print(f"Number of unique labels: {num_labels}")

Number of unique labels: 5460


In [None]:
# Check the maximum label value (0 when no item carries a 'gloss_id')
label_values = [
    int(record['gloss_id'].split('.')[-1])
    for split in (train_data, test_data)
    for record in split
    if 'gloss_id' in record
]
max_label = max([0] + label_values)

print(f"Maximum label value: {max_label}")

Maximum label value: 16070


In [None]:
# Initialize the tokenizer and model
import os

from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "aubmindlab/bert-large-arabertv02"

# Access tokens must never be committed inside a notebook. The token that was
# previously hard-coded here has to be treated as leaked and revoked.
# If authentication is needed, export HF_TOKEN before starting the kernel;
# None falls back to anonymous access.
huggingface_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_name, token=huggingface_token)

# The classification head must have exactly one output per class index that
# the dataset produces, i.e. one per entry of label_mapping.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_mapping),
    token=huggingface_token,
)



tokenizer_config.json:   0%|          | 0.00/382 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/825k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-large-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Create data loaders: wrap each split in a WSDataset first ...
train_dataset = WSDataset(data=train_data, tokenizer=tokenizer, gloss_dict=gloss_dict)
test_dataset = WSDataset(data=test_data, tokenizer=tokenizer, gloss_dict=gloss_dict)

# ... then batch them; only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [None]:
# Define the optimizer.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are passed explicitly because the PyTorch defaults (1e-8 and
# 1e-2) differ from the values the transformers implementation used by default.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



In [None]:
# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # use the GPU when one is available
model.to(device)  # move the model parameters to the chosen device

num_epochs = 3  # number of full passes over the training set
for epoch in range(num_epochs):
    model.train()  # switch the model to training mode
    total_loss = 0.0      # sum of the per-batch losses of this epoch
    trained_batches = 0   # batches that actually contributed a loss
    skipped_samples = 0   # samples dropped because their label is -1
    for batch in train_loader:
        labels = batch['labels']

        # Drop samples whose label is the -1 sentinel (no usable class index)
        keep = labels != -1
        skipped_samples += int((~keep).sum())
        if not keep.any():
            continue

        input_ids = batch['input_ids'][keep].to(device)
        attention_mask = batch['attention_mask'][keep].to(device)
        labels = labels[keep].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        trained_batches += 1

    # Average over the batches that were trained on. Dividing by
    # (number of batches - number of skipped samples) would mix units and
    # could become zero or negative.
    average_loss = total_loss / trained_batches if trained_batches else float('nan')
    print(f'Epoch {epoch+1}, Loss: {average_loss}, Skipped Samples: {skipped_samples}')

Epoch 1, Loss: 8.608509268954684, Skipped Samples: 0
Epoch 2, Loss: 8.091054814275378, Skipped Samples: 0
Epoch 3, Loss: 8.044861339484683, Skipped Samples: 0


In [None]:
# Testing loop
model.eval()  # switch the model to evaluation mode
predictions, true_labels = [], []
with torch.no_grad():  # no gradients are needed for inference
    for batch in test_loader:
        labels = batch['labels']

        # Items labelled with the -1 sentinel have no gold class; keeping them
        # would count every one of them as a wrong prediction.
        keep = labels != -1
        if not keep.any():
            continue

        input_ids = batch['input_ids'][keep].to(device)
        attention_mask = batch['attention_mask'][keep].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=-1).tolist())
        true_labels.extend(labels[keep].tolist())

In [None]:
# Evaluate model performance: share of test predictions equal to the gold label
accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.0057
