# Bilingual Sentiment Project (Arabic + English)

هذا الدفتر يدرب نموذجين لتحليل المشاعر (عربي + إنجليزي) ثم يصدّرهما بصيغ:
- `SavedModel/`
- `.keras`
- `.h5`
- مع `tokenizer.json` + `label_map.json`

> **المتطلبات المقترحة:**  
> - TensorFlow >= 2.16 (يفضّل 2.20)  
> - Python 3.10  
> - pandas, numpy, scikit-learn


## (اختياري) تثبيت الإصدارات المتوافقة

In [5]:
# Run only the one option that matches your environment, then restart the kernel (Kernel Restart).
# General option (Linux/Windows/Cloud):
# !pip install -q --upgrade pip
# !pip uninstall -y tensorflow tensorflow-cpu tensorflow-macos tensorflow-metal keras tensorboard protobuf grpcio
# !pip install -q "tensorflow==2.20.0" "pandas==2.2.2" "numpy==1.26.4" "scikit-learn==1.4.2" "h5py==3.11.0"
print("If you installed packages here, please restart the kernel and run from top.")

If you installed packages here, please restart the kernel and run from top.


## 1) الاستيراد والإعداد العام

In [8]:
import os, re, json
from pathlib import Path
import numpy as np, pandas as pd, tensorflow as tf
from sklearn.model_selection import train_test_split
from collections import Counter

from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models

print("TensorFlow:", tf.__version__)

# General settings
MAX_LEN   = 96       # sequence length passed as `maxlen` to pad_sequences
NUM_WORDS = 20000    # vocabulary cap passed as `num_words` to the Tokenizer
CLASSES   = ['negative','neutral','positive']  # label names; list position is the class id

# Output directory (one sub-folder per language, created up front)
OUT_DIR = Path('bilingual_sentiment_model')
(OUT_DIR/'ar').mkdir(parents=True, exist_ok=True)
(OUT_DIR/'en').mkdir(parents=True, exist_ok=True)

# Simple Arabic detection: one character in U+0600..U+06FF is enough.
AR_RE = re.compile(r'[\u0600-\u06FF]')
def detect_lang(s):
    """Return "ar" when ``str(s)`` contains a character matched by AR_RE, else "en"."""
    has_arabic = AR_RE.search(str(s)) is not None
    if has_arabic:
        return "ar"
    return "en"

# Light normalisation for Arabic text
AR_DIACRITICS = r"[\u0617-\u061A\u064B-\u0652\u0670]"

# Letter variants folded to a base letter, applied in this order.
_AR_LETTER_FOLDS = (
    ("أ", "ا"), ("إ", "ا"), ("آ", "ا"),
    ("ى", "ي"), ("ؤ", "و"), ("ئ", "ي"), ("ة", "ه"),
)

def ar_normalize(s: str) -> str:
    """Return a lightly normalised copy of ``str(s)``.

    Steps, in order: strip the diacritics matched by AR_DIACRITICS, remove
    tatweel (kashida) runs, fold the letter variants listed in
    _AR_LETTER_FOLDS, then collapse whitespace runs to one space and trim.
    """
    text = re.sub(AR_DIACRITICS, "", str(s))   # drop diacritics
    text = re.sub(r"[ـ]+", "", text)           # drop tatweel
    for variant, base in _AR_LETTER_FOLDS:
        text = text.replace(variant, base)
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()



TensorFlow: 2.16.1


## 2) تحميل CSV بذكاء (ترميزات وأسماء أعمدة مختلفة)

In [11]:
def read_csv_any_encoding(path):
    """Read a CSV file, trying several common text encodings in turn.

    Order matters:

    * ``utf-8-sig`` goes first: it reads plain UTF-8 too and strips a
      leading byte-order mark, so the first column name never carries one.
    * ``cp1256`` (Windows Arabic) goes before ``latin-1``. ``latin-1``
      accepts every byte sequence, so anything listed after it can never be
      tried; with the previous order Arabic files saved as cp1256 were
      silently decoded into mojibake.

    Args:
        path: Location of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    for enc in ("utf-8-sig", "utf-8", "cp1256", "latin-1"):
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
    # Last resort: let pandas use its default encoding.
    return pd.read_csv(path)

# Flexible column names: accepted aliases for the text, label and language columns.
# load_concat_smart lower-cases these before handing them to pick_col.
TEXT_COLS  = {'text','review','sentence','tweet','comment','message','content','body','clean_text','normalized_text','comment_text','translated_text','Text'}
LABEL_COLS = {'label','labels','sentiment','target','class','polarity','emotion','category','y','rating','Label','Sentiment'}
LANG_COLS  = {'lang','language','lang_id','lang_code','iso_lang','Language'}

def pick_col(orig_cols, candidates):
    """Return the column of *orig_cols* that matches one of *candidates*.

    Matching is case-insensitive on both sides and runs in two passes. Each
    pass walks the columns in their original order, so the result does not
    depend on set iteration order:

    1. exact name match;
    2. loose match: a candidate of three or more characters may appear
       anywhere inside the column name ("review_text" matches "text"),
       while a shorter candidate such as "y" must be a whole word of the
       name ("y_true" matches, "city" does not).

    Args:
        orig_cols: Column labels, e.g. ``list(df.columns)``. Non-string
            labels are compared through ``str()``.
        candidates: Iterable of acceptable column names.

    Returns:
        The matching column label exactly as it appears in *orig_cols*, or
        ``None`` when nothing matches.
    """
    wanted = {str(cand).lower() for cand in candidates}
    wanted.discard("")  # an empty candidate would match every column
    named = [(col, str(col).lower()) for col in orig_cols]

    # Pass 1: exact (case-insensitive) match.
    for col, name in named:
        if name in wanted:
            return col

    # Pass 2: loose match.
    long_cands = [cand for cand in wanted if len(cand) >= 3]
    short_cands = wanted.difference(long_cands)
    for col, name in named:
        if any(cand in name for cand in long_cands):
            return col
        # Very short candidates only count as a whole word of the name.
        if short_cands and short_cands.intersection(re.split(r"[^0-9a-z]+", name)):
            return col
    return None

def normalize_label(v):
    """Map a raw label value to 'negative', 'neutral' or 'positive'.

    The value is stringified, trimmed and lower-cased first.

    * Numeric codes: -1 and 0 -> negative, 1 -> positive, 2 -> neutral.
      Integral floats such as ``1.0`` / ``"2.0"`` are accepted as well,
      because pandas reads an integer column as float as soon as it has a
      missing value. Any other number (including NaN) is unrecognised.
    * English and Arabic words: see the sets below. The Arabic words are
      listed with both the standard feminine ending and the ending that
      ar_normalize produces.

    Returns:
        The canonical label, or ``None`` when the value is not recognised.
    """
    s = str(v).strip().lower()

    # Common numeric encodings
    try:
        num = float(s)
    except ValueError:
        num = None
    if num is not None:
        if not num.is_integer():  # NaN, infinities, fractional values
            return None
        return {-1: 'negative', 0: 'negative', 1: 'positive', 2: 'neutral'}.get(int(num))

    # English
    if s in {'neg', 'negative', 'bad'}:
        return 'negative'
    if s in {'neu', 'neutral'}:
        return 'neutral'
    if s in {'pos', 'positive', 'good'}:
        return 'positive'

    # Arabic
    if s in {'سلبي', 'سلبية', 'سلبيه', 'سيء', 'سيئ', 'سئ'}:
        return 'negative'
    if s in {'محايد', 'محايدة', 'محايده'}:
        return 'neutral'
    if s in {'ايجابي', 'إيجابي', 'ايجابية', 'إيجابية', 'ايجابيه', 'إيجابيه', 'جيد', 'ممتاز'}:
        return 'positive'
    return None

def load_concat_smart(files):
    """Load several CSV files into one frame with columns text / label / lang.

    For every existing file: read it, locate the text, label and (optional)
    language columns by name, rename them, fill ``lang`` from the text when
    the file has no language column, normalise Arabic text, map labels to
    the names in CLASSES and drop rows that end up incomplete.

    Args:
        files: Iterable of CSV paths. Missing files are skipped with a
            printed warning.

    Returns:
        A ``pandas.DataFrame`` with exactly the columns ``text``, ``label``
        and ``lang``, de-duplicated on all three.

    Raises:
        ValueError: If a file has no recognisable (and distinct) text and
            label columns, or if no file could be loaded at all.
    """
    frames = []
    for p in files:
        if not os.path.exists(p):
            print(f"⚠️ Missing: {p} — skipping")
            continue
        df = read_csv_any_encoding(p)
        orig_cols = list(df.columns)
        text_c  = pick_col(orig_cols, {c.lower() for c in TEXT_COLS})
        label_c = pick_col(orig_cols, {c.lower() for c in LABEL_COLS})
        lang_c  = pick_col(orig_cols, {c.lower() for c in LANG_COLS})

        # One column cannot serve as both text and label.
        if text_c is None or label_c is None or text_c == label_c:
            raise ValueError(f"{p} lacks recognizable text/label columns. Found: {orig_cols}")

        # Keep only the chosen columns before renaming, so a rename can never
        # collide with another column already called text/label/lang.
        renames = {text_c: 'text', label_c: 'label'}
        if lang_c is not None and lang_c not in renames:
            renames[lang_c] = 'lang'
        df = df[list(renames)].rename(columns=renames)

        # Drop rows without text first: the astype(str) below would turn a
        # missing value into the literal string "nan" and keep the row.
        df = df.dropna(subset=['text']).copy()

        if 'lang' in df.columns:
            # Trim and lower-case language tags so "AR" / " en " match the
            # 'ar' / 'en' comparisons; missing tags stay missing.
            raw_lang = df['lang']
            df['lang'] = raw_lang.astype(str).str.strip().str.lower().where(raw_lang.notna())
        else:
            df['lang'] = df['text'].apply(detect_lang)

        # Arabic normalisation for Arabic rows only (optional, but it helps)
        mask_ar = df['lang'] == 'ar'
        df.loc[mask_ar, 'text'] = df.loc[mask_ar, 'text'].astype(str).apply(ar_normalize)

        # Unify the labels
        df['label'] = df['label'].apply(normalize_label)
        df = df.dropna(subset=['text','label','lang'])
        df = df[df['label'].isin(CLASSES)]
        frames.append(df[['text','label','lang']])
        print(f"✅ {p}: rows={len(df)} standardized.")
    if not frames:
        raise ValueError("No valid CSVs found.")
    out = pd.concat(frames, ignore_index=True).drop_duplicates(subset=['text','label','lang'])
    return out

### حمّل ملفاتك هنا

In [14]:
# Edit these names to match your files; load_concat_smart skips paths that do not exist.
TRAIN_FILES = ['train.csv', 'train_all.csv']

df = load_concat_smart(TRAIN_FILES)
print("Total rows:", len(df))
df.head(10)  # last expression in the cell, so the notebook shows the preview

✅ train.csv: rows=27480 standardized.
✅ train_all.csv: rows=55000 standardized.
Total rows: 81728


Unnamed: 0,text,label,lang
0,"I`d have responded, if I were going",neutral,en
1,Sooo SAD I will miss you here in San Diego!!!,negative,en
2,my boss is bullying me...,negative,en
3,what interview! leave me alone,negative,en
4,"Sons of ****, why couldn`t they put them on t...",negative,en
5,http://www.dothebouncy.com/smf - some shameles...,neutral,en
6,2am feedings for the baby are fun when he is a...,positive,en
7,Soooo high,neutral,en
8,Both of you,neutral,en
9,Journey!? Wow... u just became cooler. hehe....,positive,en


## 3) تحضير بيانات لكل لغة

In [17]:
def build_lang_data(df_lang):
    """Turn one language's frame into padded token ids, label ids and a tokenizer.

    A Tokenizer capped at NUM_WORDS (OOV token '[OOV]') is fitted on the
    ``text`` column; sequences are post-padded / post-truncated to MAX_LEN.
    Labels are mapped negative -> 0, neutral -> 1, positive -> 2.

    Returns:
        ``(X, y, tok)``: the padded id matrix, the label-id array and the
        fitted tokenizer.
    """
    label_ids = {'negative':0,'neutral':1,'positive':2}
    corpus = df_lang['text'].astype(str).tolist()

    tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token='[OOV]')
    tokenizer.fit_on_texts(corpus)
    id_sequences = tokenizer.texts_to_sequences(corpus)

    padded = pad_sequences(id_sequences, maxlen=MAX_LEN, padding='post', truncating='post')
    targets = df_lang['label'].map(label_ids).values
    return padded, targets, tokenizer

def safe_split(X, y, test_size=0.2, random_state=42):
    """Shuffle-split X / y into train and validation parts.

    The split is stratified on ``y`` only when there is at least one class
    and every class has two or more samples; otherwise it is a plain
    shuffled split. Returns whatever ``train_test_split`` returns.
    """
    class_counts = Counter(y)
    can_stratify = len(class_counts) > 0 and min(class_counts.values()) >= 2
    stratify_on = y if can_stratify else None
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=stratify_on,
    )

# Split the combined frame by language tag.
df_ar = df[df.lang=='ar'].copy()
df_en = df[df.lang=='en'].copy()
print("Rows AR:", len(df_ar), "Rows EN:", len(df_en))

# Empty placeholders so these names exist even when a language has no rows.
X_ar, y_ar, tok_ar = (np.empty((0,MAX_LEN)), np.array([]), None)
X_en, y_en, tok_en = (np.empty((0,MAX_LEN)), np.array([]), None)

if len(df_ar):
    X_ar, y_ar, tok_ar = build_lang_data(df_ar)
if len(df_en):
    X_en, y_en, tok_en = build_lang_data(df_en)

# Train/validation split per language; with no rows the empty placeholders are reused for both parts.
Xtr_ar, Xva_ar, ytr_ar, yva_ar = safe_split(X_ar, y_ar) if len(df_ar) else (X_ar,X_ar,y_ar,y_ar)
Xtr_en, Xva_en, ytr_en, yva_en = safe_split(X_en, y_en) if len(df_en) else (X_en,X_en,y_en,y_en)

print("AR shapes:", Xtr_ar.shape, Xva_ar.shape, "EN shapes:", Xtr_en.shape, Xva_en.shape)

Rows AR: 54244 Rows EN: 27484
AR shapes: (43395, 96) (10849, 96) EN shapes: (21987, 96) (5497, 96)


## 4) بناء الموديل

In [20]:
def make_model(vocab_size=NUM_WORDS, num_classes=3, max_len=MAX_LEN):
    """Build and compile the classifier: Embedding -> BiLSTM -> Dense -> softmax.

    Args:
        vocab_size: Input dimension of the embedding layer.
        num_classes: Number of softmax outputs.
        max_len: Length of the int32 input sequences.

    Returns:
        A compiled Keras model (adam, sparse categorical cross-entropy,
        accuracy metric).
    """
    token_ids = layers.Input(shape=(max_len,), dtype='int32')
    # Layers are created in the same order in which they are applied.
    stack = [
        layers.Embedding(vocab_size, 128, mask_zero=True),
        layers.Bidirectional(layers.LSTM(64)),
        layers.Dense(128, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ]
    features = token_ids
    for layer in stack:
        features = layer(features)

    net = models.Model(token_ids, features)
    net.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net

# Early stopping on val_accuracy with a patience of 3 epochs; the best weights are restored afterwards.
callbacks = [tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True, monitor='val_accuracy')]

## 5) تدريب عربي + إنجليزي

In [23]:
# ===== Rebuild helpers (right-padding=0) =====
import numpy as np, pandas as pd, re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Same values as the general settings cell — presumably repeated so this cell is self-contained.
MAX_LEN   = 96
NUM_WORDS = 20000
CLASSES   = ['negative', 'neutral', 'positive']
LABEL2ID  = {c:i for i,c in enumerate(CLASSES)}  # class name -> integer id (its position in CLASSES)

_AR_DIAC = r"[\u0617-\u061A\u064B-\u0652\u0670]"
def ar_norm(s):
    """Lightly normalise Arabic text.

    Removes the diacritics matched by _AR_DIAC and tatweel runs, folds
    alef / yaa / waw / taa-marbuta variants to a base letter, then collapses
    whitespace runs to one space and trims the ends. The input is coerced
    with ``str`` first.
    """
    # variant -> base letter, applied in insertion order
    letter_folds = {
        "أ": "ا", "إ": "ا", "آ": "ا",
        "ى": "ي", "ؤ": "و", "ئ": "ي", "ة": "ه",
    }
    cleaned = re.sub(r"[ـ]+", "", re.sub(_AR_DIAC, "", str(s)))
    for variant in letter_folds:
        cleaned = cleaned.replace(variant, letter_folds[variant])
    return re.sub(r"\s+", " ", cleaned).strip()

def build_lang_data(df_lang: pd.DataFrame, is_ar: bool):
    """Tokenise and right-pad one language's texts.

    Args:
        df_lang: Frame with ``text`` and ``label`` columns.
        is_ar: When true, every text goes through ``ar_norm`` before
            tokenisation.

    Returns:
        ``(X, y, tok)``: id sequences post-padded with 0 and post-truncated
        to MAX_LEN, label ids taken from LABEL2ID, and the fitted Tokenizer
        (capped at NUM_WORDS, OOV token "<OOV>").
    """
    raw_texts = df_lang['text'].astype(str).tolist()
    corpus = list(map(ar_norm, raw_texts)) if is_ar else raw_texts
    label_ids = df_lang['label'].map(LABEL2ID).values

    tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token="<OOV>")
    tokenizer.fit_on_texts(corpus)
    # right-padding with the value 0
    padded = pad_sequences(
        tokenizer.texts_to_sequences(corpus),
        maxlen=MAX_LEN,
        padding="post",
        truncating="post",
        value=0,
    )
    return padded, label_ids, tokenizer

def assert_right_padded(X):
    """Report how many sequences in *X* are not strictly right-padded with zeros.

    A row is flagged when a 0 occurs before its last non-zero entry, i.e. it
    has leading or internal zeros. All-zero rows are accepted. Despite the
    name nothing is asserted: the outcome is printed, and the number of
    flagged rows is also returned so callers can act on it.

    Args:
        X: 2-D array-like of token ids (a 1-D input is treated as one row).

    Returns:
        The number of rows that are not strictly right-padded.
    """
    arr = np.atleast_2d(np.asarray(X))
    is_zero = arr == 0
    # A zero sits before the last non-zero entry exactly when some zero is
    # immediately followed by a non-zero value, which can be checked for all
    # rows at once instead of looping over them in Python.
    zero_then_value = is_zero[:, :-1] & ~is_zero[:, 1:]
    bad = int(np.count_nonzero(zero_then_value.any(axis=1)))
    if bad:
        print(f"⚠️ Found {bad} sequences with internal zeros (not strictly right-padded).")
    else:
        print("✅ Sequences look right-padded.")
    return bad

# ===== Model WITHOUT masks =====
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as L

def make_model(vocab_size=NUM_WORDS, num_classes=len(CLASSES), max_len=MAX_LEN, emb_dim=128):
    """Build and compile the mask-free two-layer BiLSTM classifier.

    Args:
        vocab_size: Input dimension of the embedding layer.
        num_classes: Number of softmax outputs.
        max_len: Length of the int32 input sequences.
        emb_dim: Embedding width.

    Returns:
        A compiled Keras model (adam, sparse categorical cross-entropy,
        accuracy metric).
    """
    token_ids = L.Input(shape=(max_len,), dtype="int32")
    # mask_zero is deliberately never used
    embedded = L.Embedding(vocab_size, emb_dim, mask_zero=False)(token_ids)
    # cuDNN is switched off explicitly (even though there is no mask)
    per_step = L.Bidirectional(L.LSTM(64, return_sequences=True, use_cudnn=False))(embedded)
    summary = L.Bidirectional(L.LSTM(32, use_cudnn=False))(per_step)
    hidden = L.Dense(64, activation="relu")(summary)
    probs = L.Dense(num_classes, activation="softmax")(hidden)

    net = keras.Model(token_ids, probs)
    net.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return net

In [25]:
# Assumes df_ar and df_en are ready and have the columns: text, label
X_ar, y_ar, tok_ar = build_lang_data(df_ar, is_ar=True)
X_en, y_en, tok_en = build_lang_data(df_en, is_ar=False)

# Padding sanity check; each call prints its own report.
assert_right_padded(X_ar)
assert_right_padded(X_en)

from sklearn.model_selection import train_test_split
# Stratified 80/20 train/validation split with a fixed seed, one per language.
Xtr_ar, Xva_ar, ytr_ar, yva_ar = train_test_split(X_ar, y_ar, test_size=0.2, random_state=42, stratify=y_ar)
Xtr_en, Xva_en, ytr_en, yva_en = train_test_split(X_en, y_en, test_size=0.2, random_state=42, stratify=y_en)

✅ Sequences look right-padded.
✅ Sequences look right-padded.


In [42]:
# Early stopping watches val_accuracy (patience 3, best weights restored);
# the learning rate is halved when val_loss stalls for 2 epochs, down to 1e-5.
callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-5),
]

# Stay None for a language that has no training samples.
model_ar, model_en = None, None

if len(Xtr_ar):
    print("Training AR model...")
    model_ar = make_model()
    model_ar.fit(Xtr_ar, ytr_ar, validation_data=(Xva_ar, yva_ar),
                 epochs=8, batch_size=32, verbose=2, callbacks=callbacks)
else:
    print("⚠️ No Arabic samples — skipping AR training.")

if len(Xtr_en):
    print("Training EN model...")
    model_en = make_model()
    model_en.fit(Xtr_en, ytr_en, validation_data=(Xva_en, yva_en),
                 epochs=8, batch_size=32, verbose=2, callbacks=callbacks)
else:
    print("⚠️ No English samples — skipping EN training.")

Training AR model...
Epoch 1/8
1357/1357 - 172s - 127ms/step - accuracy: 0.7458 - loss: 0.6188 - val_accuracy: 0.7803 - val_loss: 0.5417 - learning_rate: 0.0010
Epoch 2/8
1357/1357 - 175s - 129ms/step - accuracy: 0.8311 - loss: 0.4126 - val_accuracy: 0.7734 - val_loss: 0.5591 - learning_rate: 0.0010
Epoch 3/8
1357/1357 - 185s - 137ms/step - accuracy: 0.8868 - loss: 0.2811 - val_accuracy: 0.7618 - val_loss: 0.6800 - learning_rate: 0.0010
Epoch 4/8
1357/1357 - 177s - 131ms/step - accuracy: 0.9420 - loss: 0.1495 - val_accuracy: 0.7476 - val_loss: 1.0228 - learning_rate: 5.0000e-04
Training EN model...
Epoch 1/8
688/688 - 96s - 139ms/step - accuracy: 0.6331 - loss: 0.8102 - val_accuracy: 0.6973 - val_loss: 0.7206 - learning_rate: 0.0010
Epoch 2/8
688/688 - 91s - 132ms/step - accuracy: 0.7635 - loss: 0.5767 - val_accuracy: 0.7157 - val_loss: 0.6987 - learning_rate: 0.0010
Epoch 3/8
688/688 - 91s - 132ms/step - accuracy: 0.8455 - loss: 0.4081 - val_accuracy: 0.6938 - val_loss: 0.7460 - learn

In [27]:
def export_all(lang, model, tok, out_dir="bilingual_sentiment_model"):
    """Write one language's model, tokenizer and label map under ``out_dir/lang``.

    Args:
        lang: Language code; used as the sub-folder and file-name prefix.
        model: Keras model to save.
        tok: Fitted tokenizer, stored through its ``to_json()``.
        out_dir: Root output folder.
    """
    target = Path(out_dir) / lang
    target.mkdir(parents=True, exist_ok=True)

    # Same model in several formats: .keras, .h5, then an exported SavedModel
    # (for TensorFlow Serving / TFLite).
    for file_name in (f"{lang}_best.keras", f"{lang}_best.h5"):
        model.save(target / file_name, include_optimizer=False)
    model.export(target / "saved_model")

    # Tokenizer and label map next to the model files.
    tokenizer_file = target / "tokenizer.json"
    tokenizer_file.write_text(tok.to_json(), encoding="utf-8")
    label_map_text = json.dumps({"classes": CLASSES}, ensure_ascii=False, indent=2)
    label_map_file = target / "label_map.json"
    label_map_file.write_text(label_map_text, encoding="utf-8")

    print(f"✅ Exported {lang} model to {target.resolve()}")

In [None]:
# Early stopping watches val_accuracy (patience 3, best weights restored);
# the learning rate is halved when val_loss stalls for 2 epochs, down to 1e-5.
callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-5),
]

# Stay None for a language that has no training samples.
model_ar, model_en = None, None
# tok_ar / tok_en are NOT reset here: they are the tokenizers fitted in the
# preprocessing step and the export step requires them to be non-None.
# Overwriting them with None (as this cell used to do) makes the export
# silently skip both languages.

if len(Xtr_ar):
    print("Training AR model...")
    model_ar = make_model()
    model_ar.fit(
        Xtr_ar, ytr_ar,
        validation_data=(Xva_ar, yva_ar),
        epochs=8, batch_size=32, verbose=2,
        callbacks=callbacks
    )
else:
    print("⚠️ No Arabic samples — skipping AR training.")

if len(Xtr_en):
    print("Training EN model...")
    model_en = make_model()
    model_en.fit(
        Xtr_en, ytr_en,
        validation_data=(Xva_en, yva_en),
        epochs=8, batch_size=32, verbose=2,
        callbacks=callbacks
    )
else:
    print("⚠️ No English samples — skipping EN training.")

Training AR model...


2025-09-09 19:43:09.860319: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-09-09 19:43:09.860489: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-09-09 19:43:09.860501: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-09-09 19:43:09.860688: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-09-09 19:43:09.860709: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/8


2025-09-09 19:43:10.707279: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


1357/1357 - 171s - 126ms/step - accuracy: 0.7486 - loss: 0.6172 - val_accuracy: 0.7656 - val_loss: 0.5633 - learning_rate: 0.0010
Epoch 2/8
1357/1357 - 181s - 133ms/step - accuracy: 0.8335 - loss: 0.4149 - val_accuracy: 0.7792 - val_loss: 0.5742 - learning_rate: 0.0010
Epoch 3/8
1357/1357 - 180s - 133ms/step - accuracy: 0.8908 - loss: 0.2779 - val_accuracy: 0.7531 - val_loss: 0.6795 - learning_rate: 0.0010
Epoch 4/8


## 6) التصدير (SavedModel + .keras + .h5 + tokenizer/label_map)

In [48]:
from pathlib import Path
import json

OUT_DIR = Path("bilingual_sentiment_model")
CLASSES = ["negative", "neutral", "positive"]  # set these to match your data

def export_all(lang, model, tok):
    """Export one language's artefacts under ``OUT_DIR / lang``.

    Writes the model as ``.keras``, ``.h5`` and an exported SavedModel
    folder, plus ``tokenizer.json`` and ``label_map.json``.

    Args:
        lang: Language code; used as the sub-folder and file-name prefix.
        model: Keras model to save.
        tok: Fitted tokenizer, stored through its ``to_json()``.
    """
    lang_dir = OUT_DIR / lang
    lang_dir.mkdir(parents=True, exist_ok=True)

    keras_file = lang_dir / f"{lang}_best.keras"
    h5_file = lang_dir / f"{lang}_best.h5"
    sm_dir = lang_dir / "saved_model"

    # 1) Keras format (.keras)
    model.save(keras_file, include_optimizer=False)

    # 2) H5 format (.h5)
    model.save(h5_file, include_optimizer=False)

    # 3) SavedModel (Keras 3 uses export() instead of save())
    sm_dir.mkdir(exist_ok=True)
    model.export(str(sm_dir))

    # 4) Tokenizer & label map
    tokenizer_json = tok.to_json()
    (lang_dir / "tokenizer.json").write_text(tokenizer_json, encoding="utf-8")
    label_map_json = json.dumps({"classes": CLASSES}, ensure_ascii=False, indent=2)
    (lang_dir / "label_map.json").write_text(label_map_json, encoding="utf-8")

    print(f"✅ Exported {lang} → {lang_dir.resolve()}")

# Export the Arabic and the English model when available (both the model and its tokenizer must be set).
if model_ar is not None and tok_ar is not None:
    export_all("ar", model_ar, tok_ar)

if model_en is not None and tok_en is not None:
    export_all("en", model_en, tok_en)



INFO:tensorflow:Assets written to: bilingual_sentiment_model/ar/saved_model/assets


INFO:tensorflow:Assets written to: bilingual_sentiment_model/ar/saved_model/assets


Saved artifact at 'bilingual_sentiment_model/ar/saved_model'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 96), dtype=tf.int32, name='keras_tensor_68')
Output Type:
  TensorSpec(shape=(None, 3), dtype=tf.float32, name=None)
Captures:
  13101421440: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14634859360: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14634857776: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14634860240: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14634845104: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14634857248: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14634858304: TensorSpec(shape=(), dtype=tf.resource, name=None)
  6326205632: TensorSpec(shape=(), dtype=tf.resource, name=None)
  6326203168: TensorSpec(shape=(), dtype=tf.resource, name=None)
  6326205808: TensorSpec(shape=(), dtype=tf.resource, name=None)
  6325997232: TensorSpec(shape=(), dtype=tf



✅ Exported ar → /Users/ranasalloum/000/bilingual_sentiment_model/ar
INFO:tensorflow:Assets written to: bilingual_sentiment_model/en/saved_model/assets


INFO:tensorflow:Assets written to: bilingual_sentiment_model/en/saved_model/assets


Saved artifact at 'bilingual_sentiment_model/en/saved_model'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 96), dtype=tf.int32, name='keras_tensor_74')
Output Type:
  TensorSpec(shape=(None, 3), dtype=tf.float32, name=None)
Captures:
  6327241184: TensorSpec(shape=(), dtype=tf.resource, name=None)
  6326034928: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13247643344: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13247642464: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13247627504: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13247638944: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13247635424: TensorSpec(shape=(), dtype=tf.resource, name=None)
  6139727184: TensorSpec(shape=(), dtype=tf.resource, name=None)
  6398892288: TensorSpec(shape=(), dtype=tf.resource, name=None)
  6398891584: TensorSpec(shape=(), dtype=tf.resource, name=None)
  6398894752: TensorSpec(shape=(), dtype=tf.r

## 7) اختبار سريع للتوقع + موجّه حسب اللغة (Router)

In [50]:
class _SavedModelPredictor:
    """Predict-only wrapper around a SavedModel folder written by ``model.export``.

    Keras 3's ``load_model`` only reads ``.keras`` and ``.h5`` files, so an
    exported SavedModel is loaded with ``tf.saved_model.load`` and called
    through its ``serve`` endpoint instead.
    """

    def __init__(self, export_dir):
        self._artifact = tf.saved_model.load(str(export_dir))

    def predict(self, X, verbose=0, **_unused):
        """Return the model output for *X* as a NumPy array (``verbose`` is ignored)."""
        # The models in this notebook are built with int32 inputs.
        batch = tf.constant(np.asarray(X, dtype=np.int32))
        return self._artifact.serve(batch).numpy()


def load_assets_for_infer(lang_dir: Path):
    """Load the tokenizer, class names and model stored in *lang_dir*.

    The model is looked up as ``<dir name>_best.keras``, then
    ``<dir name>_best.h5``, then the ``saved_model`` folder. The first two
    are loaded with Keras; the SavedModel folder is wrapped in
    ``_SavedModelPredictor`` because ``load_model`` cannot read it in Keras 3.

    Args:
        lang_dir: Folder holding ``tokenizer.json``, ``label_map.json`` and
            at least one model artefact.

    Returns:
        ``(tok, classes, model)`` where ``model`` offers
        ``predict(X, verbose=0)``.

    Raises:
        FileNotFoundError: If no model artefact exists under *lang_dir*.
    """
    # tokenizer
    tok = tokenizer_from_json((lang_dir / 'tokenizer.json').read_text(encoding='utf-8'))
    # label map
    classes = json.loads((lang_dir / 'label_map.json').read_text(encoding='utf-8'))['classes']

    # model: try .keras, then .h5, then the exported SavedModel
    for cand in (lang_dir / f"{lang_dir.name}_best.keras", lang_dir / f"{lang_dir.name}_best.h5"):
        if cand.exists():
            return tok, classes, tf.keras.models.load_model(cand, compile=False)

    saved_model_dir = lang_dir / 'saved_model'
    if saved_model_dir.exists():
        return tok, classes, _SavedModelPredictor(saved_model_dir)
    raise FileNotFoundError(f"No model found under {lang_dir}")

def infer_one(text, model_root='bilingual_sentiment_model'):
    """Classify one text with the model of its detected language.

    The text is routed to 'ar' when AR_RE finds a match in it, else to 'en';
    Arabic text goes through ``ar_normalize`` first. The assets are loaded
    from ``model_root/<lang>`` on every call.

    Returns:
        A dict with the keys ``lang``, ``label`` and ``confidence``.
    """
    raw = str(text)
    is_arabic = AR_RE.search(raw) is not None
    lang = 'ar' if is_arabic else 'en'
    # normalise Arabic input only
    prepared = ar_normalize(raw) if is_arabic else raw

    tok, classes, model = load_assets_for_infer(Path(model_root)/lang)
    token_ids = tok.texts_to_sequences([prepared])
    X = pad_sequences(token_ids, maxlen=MAX_LEN, padding='post', truncating='post')

    probs = model.predict(X, verbose=0)[0]
    best = int(np.argmax(probs))
    return {'lang': lang, 'label': classes[best], 'confidence': float(probs[best])}

# Quick trial: one Arabic and one English sentence.
# Any failure (for example, models not exported yet) is reported instead of stopping the notebook.
try:
    print(infer_one("انا مبسوط اليوم"))
    print(infer_one("I hate this product"))
except Exception as e:
    print("Inference skipped:", e)

{'lang': 'ar', 'label': 'neutral', 'confidence': 0.6686696410179138}
{'lang': 'en', 'label': 'negative', 'confidence': 0.8545156717300415}


In [None]:
! streamlit run app.py