# Bilingual (Arabic + English) Sentiment — Keras Baseline

This notebook trains **two separate models**:
- **Arabic model**: trained on `train_all_ext.csv` + `train_all.csv` (columns: `Text`, `sentiment`).
- **English model**: trained on `train.csv` (columns: `text`, `sentiment`, with `latin-1` encoding).

Each model is saved in **both** `.keras` and `.h5` formats (plus a best-validation checkpoint, `<lang>_best.keras`), and for each language we export `tokenizer.json`, `label_map.json`, `history.csv`, and `metrics.json`.

At the end, an **inference router** auto-detects language and uses the correct model.


In [1]:
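# Optional: install pinned dependencies (uncomment the line you need).
# `%pip` installs into the environment of the running kernel.
# %pip install tensorflow==2.15.0 keras==2.15.0 pandas numpy
# On Apple Silicon, install this TensorFlow build instead:
# %pip install tensorflow-macos==2.15.0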

In [3]:
import os, json, re, csv, warnings
import numpy as np
import pandas as pd
from pathlib import Path
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, GlobalMaxPool1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

print(tf.__version__)



2.15.0


## Paths & Hyperparameters

In [6]:
# --- Input data (adjust if your files are elsewhere) ---
# Arabic training files; each is expected to have columns (Text, sentiment).
AR_CSVS = ["train_all_ext.csv", "train_all.csv"]
# English training file with columns (text, sentiment), encoded latin-1.
EN_CSV = "train.csv"

# --- Output location ---
OUT_DIR = "bilingual_sentiment_model"

# --- Training hyperparameters ---
EPOCHS = 5          # maximum number of training epochs per language
BATCH_SIZE = 64
MAX_WORDS = 20000   # passed to the tokenizer as `num_words`
MAX_LEN = 96        # every sequence is padded/truncated to this length
SEED = 42           # seed forwarded to the per-language training routine

# Make sure the output directory exists before anything is written to it.
os.makedirs(OUT_DIR, exist_ok=True)

## Utilities: Loading, Cleaning, Tokenization, Model Builder

In [9]:
# Matches any single Arabic-script character. Besides the core Arabic block
# (U+0600–U+06FF) this also covers Arabic Supplement (U+0750–U+077F),
# Arabic Extended-B/A (U+0870–U+08FF) and the Presentation Forms A/B blocks
# (U+FB50–U+FDFF, U+FE70–U+FEFC), so text written with ligatures or
# presentation forms is still recognised as Arabic.
# The range stops at U+FEFC on purpose: U+FEFF is the byte-order mark and
# must not make a text look Arabic.
ARABIC_RE = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u0870-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC]')

def read_csv_safe(path, encoding=None):
    """Read a CSV with pandas, falling back through several text encodings.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        encoding: Optional encoding to try first. When omitted, only the
            built-in fallbacks are tried.

    Returns:
        The DataFrame produced by the first encoding that decodes the file.

    Raises:
        Any non-encoding error from ``pd.read_csv`` (missing file, parser
        error, ...) is raised immediately rather than being retried.
        If every candidate fails to decode, the last decoding error is raised.
    """
    # Order matters: `latin-1` maps every byte to a character and therefore
    # never fails, so it has to be the final fallback. `cp1252` is tried
    # before it so that Windows "smart" punctuation decodes to real
    # characters instead of C1 control codes.
    candidates = ([encoding] if encoding else []) + ["utf-8", "utf-8-sig", "cp1252", "latin-1"]
    # Drop duplicates (e.g. a caller-supplied "latin-1") but keep the order.
    encs = list(dict.fromkeys(candidates))

    last_err = None
    for enc in encs:
        try:
            return pd.read_csv(path, encoding=enc)
        except (UnicodeError, LookupError) as e:
            # UnicodeError: the bytes are not valid in this encoding.
            # LookupError: the encoding name itself is unknown.
            # Anything else is not an encoding problem and should surface.
            last_err = e
    raise last_err

def load_arabic_df(csvs):
    """Load and concatenate the Arabic training CSVs.

    Each path in ``csvs`` is read with ``read_csv_safe``. Files that do not
    exist, or that lack a ``Text`` or ``sentiment`` column, are skipped with a
    printed warning. Rows with missing values are dropped, and only rows whose
    text contains at least one ``ARABIC_RE`` match are kept.

    Returns:
        A DataFrame with columns ``text``, ``sentiment`` and ``lang`` (always
        ``'ar'``), indexed 0..n-1.

    Raises:
        ValueError: if no file yielded a usable frame.
    """
    required_cols = ('Text', 'sentiment')
    kept = []
    for csv_path in csvs:
        if not os.path.exists(csv_path):
            print(f"[WARN] Arabic CSV not found: {csv_path}")
            continue

        raw = read_csv_safe(csv_path)
        if any(col not in raw.columns for col in required_cols):
            print(f"[WARN] {csv_path} missing required columns. Found: {list(raw.columns)}")
            continue

        pairs = raw[['Text', 'sentiment']].dropna()
        has_arabic = pairs['Text'].astype(str).str.contains(ARABIC_RE)
        kept.append(pairs[has_arabic].rename(columns={'Text': 'text'}))

    if len(kept) == 0:
        raise ValueError("No valid Arabic frames loaded.")

    out = pd.concat(kept, ignore_index=True)
    out['lang'] = 'ar'
    return out

def load_english_df(csv_path):
    """Load the English training CSV.

    The file is read with ``read_csv_safe`` (trying ``latin-1`` first). Rows
    with missing values are dropped, as are rows whose text contains an
    ``ARABIC_RE`` match.

    Returns:
        A DataFrame with columns ``text``, ``sentiment`` and ``lang`` (always
        ``'en'``).

    Raises:
        ValueError: if the file does not exist or lacks a ``text`` or
            ``sentiment`` column.
    """
    if not os.path.exists(csv_path):
        raise ValueError(f"English CSV not found: {csv_path}")

    raw = read_csv_safe(csv_path, encoding='latin-1')
    if not ('text' in raw.columns and 'sentiment' in raw.columns):
        raise ValueError(f"English CSV missing required cols. Found: {list(raw.columns)}")

    pairs = raw[['text', 'sentiment']].dropna()
    has_arabic = pairs['text'].astype(str).str.contains(ARABIC_RE)
    return pairs[~has_arabic].assign(lang='en')

def clean_text_basic(s: str, lang: str):
    """Apply light normalisation to one text.

    URLs (``http...`` / ``www....``), ``@mentions`` and ``#hashtags`` are
    replaced by a space, runs of whitespace are collapsed to a single space,
    and the result is stripped. English text (``lang == 'en'``) is also
    lower-cased; text for any other language keeps its case.

    ``s`` is converted with ``str()`` first, so non-string values are accepted.
    """
    without_noise = re.sub(r"http\S+|www\.\S+|@\w+|#\w+", " ", str(s))
    collapsed = re.sub(r"\s+", " ", without_noise).strip()
    return collapsed.lower() if lang == 'en' else collapsed

def prepare_xy(df, max_words, max_len):
    """Fit a tokenizer on ``df`` and turn it into padded sequences and labels.

    Args:
        df: Frame with ``text`` and ``sentiment`` columns.
        max_words: Passed to the tokenizer as ``num_words``.
        max_len: Length every sequence is post-padded / post-truncated to.

    Returns:
        ``(X, y, tok, classes)``: the padded sequences, the integer label
        array, the fitted tokenizer, and the sorted list of distinct label
        strings. ``y[i]`` is the position of row i's label in ``classes``.
    """
    corpus = df['text'].astype(str).tolist()
    targets = df['sentiment'].astype(str).tolist()

    # Sorted unique labels define the class index order.
    classes = sorted(set(targets))
    index_of = dict(zip(classes, range(len(classes))))
    y = np.array([index_of[label] for label in targets])

    tok = Tokenizer(num_words=max_words, oov_token='<OOV>')
    tok.fit_on_texts(corpus)
    X = pad_sequences(
        tok.texts_to_sequences(corpus),
        maxlen=max_len, padding='post', truncating='post',
    )
    return X, y, tok, classes

def build_model(vocab, max_len, num_classes):
    """Build and compile the BiLSTM sentiment classifier.

    Layer stack: Embedding(128) -> Bidirectional LSTM(64, full sequence)
    -> global max pooling -> Dense(64, relu) -> Dropout(0.3)
    -> Dense(num_classes, softmax).

    Args:
        vocab: Embedding ``input_dim``.
        max_len: Embedding ``input_length``.
        num_classes: Width of the softmax output layer.

    Returns:
        The model, compiled with Adam, sparse categorical cross-entropy
        (labels are integer class indices) and an accuracy metric.
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocab, output_dim=128, input_length=max_len))
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(GlobalMaxPool1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

def train_one_lang(df, lang, out_dir, max_words, max_len, epochs, batch_size, seed=42):
    """Train, checkpoint and export the sentiment classifier for one language.

    The text is cleaned with ``clean_text_basic``, the rows are shuffled with
    a generator seeded by ``seed`` and split 90/10 into train/validation. The
    tokenizer and the class list are fitted on the training split only.

    Files written under ``<out_dir>/<lang>/``:
        ``<lang>_best.keras``   checkpoint with the best ``val_accuracy``
        ``<lang>_final.keras``  model as it is when ``fit`` returns
        ``<lang>_final.h5``     the same model in HDF5 format
        ``tokenizer.json``      fitted tokenizer
        ``label_map.json``      ``{"classes": [...]}`` in index order
        ``history.csv``         per-epoch training history
        ``metrics.json``        the dict returned by this function

    Args:
        df: Frame with ``text`` and ``sentiment`` columns (not modified).
        lang: Language code; selects the cleaning rules and the output folder.
        out_dir: Root output directory.
        max_words: Tokenizer vocabulary cap.
        max_len: Padded sequence length.
        epochs: Maximum number of epochs (early stopping may end sooner).
        batch_size: Training batch size.
        seed: Seed for the shuffle and for the Python/NumPy/TensorFlow global
            random generators.

    Returns:
        dict with ``train_size``, ``val_size``, ``best_val_acc``, ``classes``.

    Raises:
        ValueError: if there are too few rows to form a training split, or if
            the validation split contains a label absent from the training split.
    """
    # Seed Python `random`, NumPy and TensorFlow so weight initialisation and
    # batch shuffling are repeatable, not only the train/validation split.
    tf.keras.utils.set_random_seed(seed)

    df = df.copy()
    # Column-wise map instead of row-wise DataFrame.apply: same result, much
    # cheaper, and well-defined on an empty frame.
    df['text'] = df['text'].map(lambda t: clean_text_basic(t, lang))

    # The shuffle uses its own generator, independent of the global seeds.
    rng = np.random.default_rng(seed)
    idx = np.arange(len(df))
    rng.shuffle(idx)
    df = df.iloc[idx].reset_index(drop=True)

    n = len(df)
    n_tr = int(n * 0.9)
    if n_tr == 0:
        raise ValueError(f"Need at least 2 rows to make a train/validation split for '{lang}'; got {n}.")
    df_tr = df.iloc[:n_tr]
    df_va = df.iloc[n_tr:]

    # The tokenizer and the label set come from the training split only.
    X_tr, y_tr, tok, classes = prepare_xy(df_tr, max_words, max_len)

    va_texts = df_va['text'].astype(str).tolist()
    va_labels = df_va['sentiment'].astype(str).tolist()
    cls2idx = {c: i for i, c in enumerate(classes)}
    unseen = sorted(set(va_labels) - set(cls2idx))
    if unseen:
        raise ValueError(f"Validation labels not present in the training split for '{lang}': {unseen}")
    X_va = pad_sequences(tok.texts_to_sequences(va_texts), maxlen=max_len, padding='post', truncating='post')
    y_va = np.array([cls2idx[s] for s in va_labels])

    # The tokenizer only emits indices below `max_words`; index 0 is padding.
    vocab = min(max_words, len(tok.word_index) + 1)
    model = build_model(vocab, max_len, num_classes=len(classes))

    lang_dir = Path(out_dir) / lang
    lang_dir.mkdir(parents=True, exist_ok=True)
    ckpt_path = str(lang_dir / f"{lang}_best.keras")
    ckpt = ModelCheckpoint(ckpt_path, monitor='val_accuracy', save_best_only=True, verbose=1)
    es = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True, verbose=1)

    hist = model.fit(
        X_tr, y_tr,
        validation_data=(X_va, y_va),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[ckpt, es],
        verbose=2
    )

    keras_path = lang_dir / f"{lang}_final.keras"
    h5_path    = lang_dir / f"{lang}_final.h5"
    model.save(keras_path)
    try:
        model.save(h5_path, save_format='h5')
    except Exception:
        # Deliberate best effort: if the explicit `save_format` argument is
        # rejected, let Keras pick the format from the `.h5` extension.
        model.save(h5_path)

    with open(lang_dir / 'tokenizer.json', 'w', encoding='utf-8') as f:
        f.write(tok.to_json())
    with open(lang_dir / 'label_map.json', 'w', encoding='utf-8') as f:
        json.dump({'classes': classes}, f, ensure_ascii=False, indent=2)

    # One CSV row per epoch; the columns are the history keys in sorted order.
    keys = sorted(hist.history.keys())
    with open(lang_dir / 'history.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['epoch'] + keys)
        for i, row in enumerate(zip(*(hist.history[k] for k in keys))):
            writer.writerow([i, *row])

    metrics = {
        'train_size': int(len(df_tr)),
        'val_size': int(len(df_va)),
        'best_val_acc': float(max(hist.history.get('val_accuracy', [0]))),
        'classes': classes,
    }
    with open(lang_dir / 'metrics.json', 'w', encoding='utf-8') as f:
        json.dump(metrics, f, ensure_ascii=False, indent=2)
    return metrics


## Load Data

In [12]:
# Load both corpora into memory. `ar_df` and `en_df` are the inputs of the
# two training cells further down.

# Arabic: concatenation of every usable file listed in AR_CSVS.
print("=== Loading Arabic data ===")
ar_df = load_arabic_df(AR_CSVS)
print("Arabic rows:", len(ar_df))
display(ar_df.head())

# English: the single file EN_CSV.
print("=== Loading English data ===")
en_df = load_english_df(EN_CSV)
print("English rows:", len(en_df))
display(en_df.head())

=== Loading Arabic data ===
Arabic rows: 128606


Unnamed: 0,text,sentiment,lang
0,الزعل بيغير ملامحك بيغير نظرة العين بيغير شك...,neutral,ar
1,@halgawi @DmfMohe ليس حباً في ايران بقدر ماهو ...,neutral,ar
2,@adalfahadduwail أبي أعرف الحاكم العربي المسلم...,neutral,ar
3,@sarmadbouchamou @DimaSadek في الخطاب تبع سليم...,neutral,ar
4,@FofaMahmouddd مفيش الكلام ده في الزمن,neutral,ar


=== Loading English data ===
English rows: 27480


Unnamed: 0,text,sentiment,lang
0,"I`d have responded, if I were going",neutral,en
1,Sooo SAD I will miss you here in San Diego!!!,negative,en
2,my boss is bullying me...,negative,en
3,what interview! leave me alone,negative,en
4,"Sons of ****, why couldn`t they put them on t...",negative,en


## Train Arabic Model

In [26]:
# Train the Arabic model; its artefacts are written under OUT_DIR/ar.
# `ar_metrics` is needed later by the summary cell.
ar_metrics = train_one_lang(
    ar_df, 'ar', OUT_DIR,
    max_words=MAX_WORDS, max_len=MAX_LEN,
    epochs=EPOCHS, batch_size=BATCH_SIZE, seed=SEED
)
# Show the returned metrics dict as the cell output.
ar_metrics

Epoch 1/5

Epoch 1: val_accuracy improved from -inf to 0.83664, saving model to bilingual_sentiment_model/ar/ar_best.keras
1809/1809 - 220s - loss: 0.5251 - accuracy: 0.7920 - val_loss: 0.4092 - val_accuracy: 0.8366 - 220s/epoch - 121ms/step
Epoch 2/5

Epoch 2: val_accuracy improved from 0.83664 to 0.87528, saving model to bilingual_sentiment_model/ar/ar_best.keras
1809/1809 - 217s - loss: 0.3007 - accuracy: 0.8864 - val_loss: 0.3360 - val_accuracy: 0.8753 - 217s/epoch - 120ms/step
Epoch 3/5

Epoch 3: val_accuracy improved from 0.87528 to 0.89931, saving model to bilingual_sentiment_model/ar/ar_best.keras
1809/1809 - 216s - loss: 0.1769 - accuracy: 0.9361 - val_loss: 0.3133 - val_accuracy: 0.8993 - 216s/epoch - 120ms/step
Epoch 4/5

Epoch 4: val_accuracy improved from 0.89931 to 0.91836, saving model to bilingual_sentiment_model/ar/ar_best.keras
1809/1809 - 307s - loss: 0.1041 - accuracy: 0.9630 - val_loss: 0.2948 - val_accuracy: 0.9184 - 307s/epoch - 170ms/step
Epoch 5/5

Epoch 5: val

  saving_api.save_model(


{'train_size': 115745,
 'val_size': 12861,
 'best_val_acc': 0.9294767379760742,
 'classes': ['negative', 'neutral', 'positive']}

## Train English Model

In [27]:
# Train the English model; its artefacts are written under OUT_DIR/en.
# `en_metrics` is needed later by the summary cell.
en_metrics = train_one_lang(
    en_df, 'en', OUT_DIR,
    max_words=MAX_WORDS, max_len=MAX_LEN,
    epochs=EPOCHS, batch_size=BATCH_SIZE, seed=SEED
)
# Show the returned metrics dict as the cell output.
en_metrics

Epoch 1/5

Epoch 1: val_accuracy improved from -inf to 0.72817, saving model to bilingual_sentiment_model/en/en_best.keras
387/387 - 74s - loss: 0.8177 - accuracy: 0.6230 - val_loss: 0.6552 - val_accuracy: 0.7282 - 74s/epoch - 192ms/step
Epoch 2/5

Epoch 2: val_accuracy did not improve from 0.72817
387/387 - 77s - loss: 0.5537 - accuracy: 0.7780 - val_loss: 0.6685 - val_accuracy: 0.7202 - 77s/epoch - 199ms/step
Epoch 3/5

Epoch 3: val_accuracy did not improve from 0.72817
387/387 - 74s - loss: 0.4081 - accuracy: 0.8472 - val_loss: 0.7641 - val_accuracy: 0.7194 - 74s/epoch - 191ms/step
Epoch 4/5

Epoch 4: val_accuracy did not improve from 0.72817
Restoring model weights from the end of the best epoch: 1.
387/387 - 79s - loss: 0.2997 - accuracy: 0.8941 - val_loss: 0.8294 - val_accuracy: 0.7107 - 79s/epoch - 204ms/step
Epoch 4: early stopping


  saving_api.save_model(


{'train_size': 24732,
 'val_size': 2748,
 'best_val_acc': 0.7281659245491028,
 'classes': ['negative', 'neutral', 'positive']}

## Save Top-Level Summary

In [14]:
# Combine the per-language metrics into OUT_DIR/summary.json.
# This cell needs `ar_metrics` and `en_metrics`, so both training cells must
# have been run in this kernel first; otherwise it fails with a NameError.
summary = {'ar': ar_metrics, 'en': en_metrics}
with open(Path(OUT_DIR)/'summary.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps any non-ASCII label text readable in the file.
    json.dump(summary, f, ensure_ascii=False, indent=2)
summary

NameError: name 'ar_metrics' is not defined

## Inference Router (Auto Language Detect)

In [31]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json

def detect_language_simple(text: str) -> str:
    """Return ``'ar'`` if ``text`` contains any ``ARABIC_RE`` match, else ``'en'``.

    The value is passed through ``str()`` first, so non-string inputs are
    accepted. Anything without an Arabic character is reported as ``'en'``.
    """
    if ARABIC_RE.search(str(text)) is not None:
        return 'ar'
    return 'en'

def load_lang_assets(model_dir, lang, max_len=96):
    """Load the tokenizer, class list and model saved for one language.

    Everything is read from ``<model_dir>/<lang>/``. The best-validation
    checkpoint ``<lang>_best.keras`` is preferred; ``<lang>_final.keras`` is
    used only when that checkpoint does not exist.

    Args:
        model_dir: Root directory holding the per-language folders.
        lang: Language code naming the sub-folder and the model files.
        max_len: Sequence length to hand back to the caller; it is returned
            unchanged and is not read from the saved artefacts.

    Returns:
        ``(tok, classes, model, max_len)``.
    """
    lang_dir = Path(model_dir) / lang

    tok = tokenizer_from_json((lang_dir / 'tokenizer.json').read_text(encoding='utf-8'))
    label_map = json.loads((lang_dir / 'label_map.json').read_text(encoding='utf-8'))
    classes = label_map['classes']

    best_path = lang_dir / f"{lang}_best.keras"
    final_path = lang_dir / f"{lang}_final.keras"
    model_path = best_path if best_path.exists() else final_path
    model = tf.keras.models.load_model(model_path)

    return tok, classes, model, max_len

def predict_texts(texts, model_dir=OUT_DIR):
    """Classify texts, routing each one to the model for its detected language.

    Each text is assigned a language with ``detect_language_simple``, cleaned
    with ``clean_text_basic`` (the same preprocessing used for training, so
    URLs, mentions and hashtags are handled identically at inference time),
    then tokenised, padded and scored by that language's model.

    Args:
        texts: An iterable of texts. A single string is treated as one text
            rather than being iterated character by character.
        model_dir: Root directory containing the per-language artefacts.

    Returns:
        A list aligned with the input order. Each item is
        ``(lang, predicted_class, probability)`` where ``probability`` is the
        softmax score of the predicted class.
    """
    # Materialise the input: generators have no len(), and a bare string
    # would otherwise be processed one character at a time.
    texts = [texts] if isinstance(texts, str) else list(texts)

    groups = {'ar': [], 'en': []}
    idxs = {'ar': [], 'en': []}
    for i, t in enumerate(texts):
        lang = detect_language_simple(t)
        # Clean exactly as the training pipeline does before tokenising.
        groups[lang].append(clean_text_basic(t, lang))
        idxs[lang].append(i)

    outputs = [None] * len(texts)
    for lang in ['ar', 'en']:
        if not groups[lang]:
            continue  # nothing to score; avoid loading this model at all
        tok, classes, model, max_len = load_lang_assets(model_dir, lang)
        seqs = tok.texts_to_sequences(groups[lang])
        X = pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
        probs = model.predict(X, verbose=0)
        preds = probs.argmax(axis=1)
        # Put each prediction back at the position of its original text.
        for j, i_orig in enumerate(idxs[lang]):
            outputs[i_orig] = (lang, classes[preds[j]], float(probs[j][preds[j]]))
    return outputs

# Demo: two English and two Arabic examples, classified in one call.
# Run this after training, once the model artefacts exist under OUT_DIR.
examples = ["I love this course!", "اليوم جميل", "This is terrible", "مرة مبسوط"]
try:
    print(predict_texts(examples))
except Exception as e:
    # Deliberately broad: any failure is printed as a hint, not raised,
    # so the cell never aborts the notebook.
    print("(Run after training):", e)

[('en', 'positive', 0.9262523651123047), ('ar', 'positive', 0.786483645439148), ('en', 'negative', 0.8367998600006104), ('ar', 'neutral', 0.9998291730880737)]


In [None]:
# Start the Streamlit app from the notebook via a shell escape.
# NOTE(review): `app12.py` is not part of this notebook, and this command
# presumably keeps the cell busy until the server is stopped — confirm.
! streamlit run app12.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://192.168.7.94:8502[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
[mutex.cc : 452] RAW: Lock blocking 0x600001cdcbb8   @
