<a href="https://colab.research.google.com/github/Tabook22/AI/blob/main/WSD_V6_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install numpy tqdm scikit-learn transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
import json
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModel
import torch
import pickle

In [3]:
# Load the CAMeLBERT (MSA) tokenizer and encoder; both come from the same
# pretrained checkpoint, so the identifier is spelled out only once.
CAMELBERT_CHECKPOINT = "CAMeL-Lab/bert-base-arabic-camelbert-msa"
tokenizer = AutoTokenizer.from_pretrained(CAMELBERT_CHECKPOINT)
model = AutoModel.from_pretrained(CAMELBERT_CHECKPOINT)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/468 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/305k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [4]:
# Function to load JSON data
def load_json(filepath):
    """Parse the UTF-8 encoded JSON file at ``filepath`` and return its content."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

In [5]:
# Prepare labeled data for training and evaluation
def prepare_labeled_data(data, wsd_dict):
    """Expand every annotated context into one labeled pair per candidate sense.

    Args:
        data: iterable of dicts providing 'word', 'lemma_id', 'context_id' and
            'context'; 'gloss_id' (the gold sense) is optional and read with
            ``.get`` so entries without it simply produce only 0 labels.
        wsd_dict: mapping from lemma_id to a list of dicts with the keys
            'gloss_id' and 'gloss'.

    Returns:
        A list of dicts, one per (entry, sense) combination, carrying the
        context fields, the sense fields and a 'label' that is 1 when the
        sense's gloss_id equals the entry's gloss_id and 0 otherwise.
        Entries whose lemma_id is not in ``wsd_dict`` contribute no pairs.
    """
    labeled_data = []
    for entry in data:
        word = entry['word']
        lemma_id = entry['lemma_id']
        correct_gloss_id = entry.get('gloss_id')
        # Use .get() instead of indexing: when wsd_dict is a defaultdict,
        # wsd_dict[lemma_id] would insert an empty list for every unknown
        # lemma and thereby mutate the caller's sense inventory.
        for sense in wsd_dict.get(lemma_id, ()):
            labeled_data.append({
                'context_id': entry['context_id'],
                'context': entry['context'],
                'word': word,
                'lemma_id': lemma_id,
                'gloss_id': sense['gloss_id'],
                'gloss': sense['gloss'],
                'label': 1 if sense['gloss_id'] == correct_gloss_id else 0
            })
    return labeled_data

In [6]:
# In-memory cache of embeddings, keyed by the exact input text.
# Because of the default factory, looking up a text that was never stored
# yields None (and records that None under the key) instead of raising KeyError.
embedding_cache = defaultdict(lambda: None)

In [7]:
# Updated encode_text function with truncation
def encode_text(text):
    """Tokenize ``text`` with the module-level tokenizer and return PyTorch tensors.

    Padding is enabled and the input is truncated to at most 512 tokens.
    """
    return tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

def extract_embeddings(text):
    """Return a mean-pooled embedding of ``text`` as a NumPy array.

    The text is tokenized with ``encode_text``, passed through the
    module-level ``model`` without gradient tracking, and the last hidden
    state is averaged over dim 1 before singleton dimensions are squeezed out.

    The encoded inputs are moved to the device that holds the model's
    parameters and the pooled tensor is copied back to the CPU before the
    NumPy conversion, so the function also works when the model has been
    moved to an accelerator. On a CPU model both steps are no-ops.
    """
    tokens = encode_text(text)
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in tokens.items()}
    with torch.no_grad():
        output = model(**inputs)
    pooled = output.last_hidden_state.mean(dim=1).squeeze()
    # Tensor.numpy() only works on CPU tensors.
    return pooled.cpu().numpy()

def extract_embeddings_with_cache(text):
    """Return the embedding of ``text``, computing it only on the first request.

    Results are stored in the module-level ``embedding_cache``; a cached value
    of None is treated as "not computed yet".
    """
    cached = embedding_cache[text]
    if cached is not None:
        return cached
    embedding_cache[text] = extract_embeddings(text)
    return embedding_cache[text]

In [8]:
# Prepare features for classifier input
def prepare_features(data):
    """Turn labeled pairs into ``(context_embedding, gloss_embedding, label)`` tuples.

    Each item of ``data`` must provide the keys 'context', 'gloss' and
    'label'. Embeddings are obtained through the cached extractor and a
    progress bar is shown while iterating.
    """
    progress = tqdm(data, desc="Extracting Features", unit="pair")
    return [
        (
            extract_embeddings_with_cache(pair['context']),
            extract_embeddings_with_cache(pair['gloss']),
            pair['label'],
        )
        for pair in progress
    ]

In [9]:
# Concatenate the context embedding and the gloss embedding of each pair into a single feature vector for the classifier (logistic regression)
def concatenate_features(features):
    """Build the classifier inputs from ``(context_emb, gloss_emb, label)`` tuples.

    Returns a pair ``(X, y)`` of NumPy arrays: each row of ``X`` is the context
    embedding followed by the gloss embedding, and ``y`` holds the labels in
    the same order.
    """
    triples = list(features)
    X = [np.concatenate((ctx_vec, gloss_vec)) for ctx_vec, gloss_vec, _ in triples]
    y = [target for _, _, target in triples]
    return np.array(X), np.array(y)

In [10]:
# Read the training split, the development split and the raw sense
# inventory (in that order) from the working directory.
train_data, dev_data, wsd_dict_raw = (
    load_json(filename)
    for filename in ('train.json', 'dev.json', 'WSD_dict.json')
)

In [11]:
# Group the raw sense inventory by lemma: lemma_id -> list of
# {'gloss_id', 'gloss'} dicts, in the order the senses appear in the raw data.
wsd_dict = defaultdict(list)
for entry in wsd_dict_raw:
    wsd_dict[entry['lemma_id']].append(
        {field: entry[field] for field in ('gloss_id', 'gloss')}
    )

In [12]:
# Expand both splits into labeled (context, gloss) pairs using the sense inventory.
train_labeled, dev_labeled = (
    prepare_labeled_data(split, wsd_dict) for split in (train_data, dev_data)
)

In [13]:
# Embed every (context, gloss) pair of both splits.
# The two calls stay separate statements so the training features remain
# available even if the development pass is interrupted.
train_features = prepare_features(data=train_labeled)
dev_features = prepare_features(data=dev_labeled)

Extracting Features: 100%|██████████| 115424/115424 [32:51<00:00, 58.54pair/s] 
Extracting Features: 100%|██████████| 14834/14834 [00:59<00:00, 249.42pair/s]


In [14]:
# Stack the per-pair embeddings into feature matrices and label vectors.
(X_train, y_train), (X_dev, y_dev) = (
    concatenate_features(split_features)
    for split_features in (train_features, dev_features)
)

In [15]:
# Grid search over candidate class weights: class 0 stays at weight 1 while
# class 1 is tried at 2, 3, 4 and 5. Candidates are compared with 3-fold
# cross-validated accuracy and the best setting is printed.
# NOTE(review): the development report shows that class 1 is the minority
# class; confirm that accuracy (rather than e.g. F1) is the intended
# selection metric.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'class_weight': [{0: 1, 1: positive_weight} for positive_weight in (2, 3, 4, 5)]
}
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
grid.fit(X_train, y_train)
print(grid.best_params_)

{'class_weight': {0: 1, 1: 2}}


In [20]:
# Train the final logistic-regression classifier with fixed class weights.
# The weights are copied by hand from the grid-search result above; update
# them here if that result changes.
# Alternative that needs no manual tuning: class_weight='balanced'.
class_weights = {0: 1, 1: 2}
clf = LogisticRegression(class_weight=class_weights, max_iter=1000)
clf.fit(X_train, y_train)

In [21]:
# Persist the trained classifier as a pickle file.
with open('wsd_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(clf))

# Persist the lemma_id -> senses dictionary as JSON.
# NOTE(review): this file name differs from the raw input 'WSD_dict.json'
# only by letter case; on a case-insensitive filesystem the raw file would
# be overwritten - confirm this is acceptable.
with open('wsd_dict.json', 'w', encoding='utf-8') as dict_file:
    dict_file.write(json.dumps(wsd_dict))

In [22]:
# Score the classifier on the development pairs (pair-level accuracy).
y_pred = clf.predict(X_dev)
accuracy = accuracy_score(y_true=y_dev, y_pred=y_pred)
print(f"Development Accuracy: {accuracy:.4f}")

# Per-class precision, recall and F1 for the two labels.
print(
    classification_report(
        y_true=y_dev,
        y_pred=y_pred,
        target_names=['Incorrect Sense', 'Correct Sense'],
    )
)

Development Accuracy: 0.8271
                 precision    recall  f1-score   support

Incorrect Sense       0.90      0.89      0.89     12033
  Correct Sense       0.54      0.57      0.56      2801

       accuracy                           0.83     14834
      macro avg       0.72      0.73      0.72     14834
   weighted avg       0.83      0.83      0.83     14834



In [23]:
# Trigger browser downloads of the saved artifacts (requires Google Colab).
from google.colab import files

for artifact_name in ('wsd_model.pkl', 'wsd_dict.json'):
    files.download(artifact_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>