# In [None]:
import spacy
import pandas as pd
import re
import joblib
import pickle
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score



# Load the dataset from a CSV file in the working directory. Later statements
# in this file read its 'Arabic', 'Urdu' and 'Roman Urdu' columns.
df = pd.read_csv('urdu_to_roman_urdu_full_dataset.csv')

def clean_text(text):
    """Return ``text`` reduced to characters in the U+0600-U+06FF range.

    The value is first coerced with ``str()``. Every character outside the
    range is deleted; that includes whitespace, Latin letters, ASCII digits
    and ASCII punctuation, so words that were separated by spaces end up
    joined together.
    """
    as_string = str(text)
    arabic_only = re.sub(r"[^\u0600-\u06FF]", "", as_string)  # keep only Arabic/Urdu
    return arabic_only.strip()

# Run clean_text over both text columns.
for _col in ('Arabic', 'Urdu'):
    df[_col] = df[_col].apply(clean_text)

# Keep only rows whose cleaned Arabic text is longer than two characters.
df = df[df['Arabic'].str.len().gt(2)]

# Pipeline created by spacy.blank for the 'ar' language code.
nlp_ar = spacy.blank('ar')


def ar_tokenization(text):
    """Tokenize ``text`` with ``nlp_ar``, dropping stop words and punctuation.

    ``text`` is coerced with ``str()`` and stripped of surrounding whitespace
    first. ``nlp_ar`` is resolved when the function is called, not when it is
    defined.

    Returns the texts of the remaining tokens as a list of strings.
    """
    kept = []
    for token in nlp_ar(str(text).strip()):
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.text)
    return kept

# Pipeline created by spacy.blank for the 'ur' language code.
nlp_ur = spacy.blank('ur')


def ur_tokenization(text):
    """Tokenize ``text`` with ``nlp_ur``, dropping stop words and punctuation.

    ``text`` is coerced with ``str()`` and stripped of surrounding whitespace
    before it is passed to the pipeline.

    Returns the texts of the remaining tokens as a list of strings.
    """
    doc = nlp_ur(str(text).strip())
    return [tok.text for tok in doc if not (tok.is_stop or tok.is_punct)]

# English pipeline. It is bound to its own name so that it does not replace
# the Arabic pipeline held in ``nlp_ar``; ``ar_tokenization`` resolves
# ``nlp_ar`` at call time and would otherwise tokenize Arabic text with the
# English tokenizer and stop-word list.
nlp_en = spacy.blank('en')


def sr_tokenization(text):
    """Tokenize ``text`` with ``nlp_en``, dropping stop words and punctuation.

    ``text`` is coerced with ``str()`` and stripped of surrounding whitespace
    before it is passed to the pipeline.

    NOTE(review): the ``sr`` prefix does not match the 'en' language code
    loaded above -- confirm which language was intended.

    Returns the texts of the remaining tokens as a list of strings.
    """
    doc = nlp_en(str(text).strip())
    return [token.text for token in doc if not token.is_stop and not token.is_punct]

# Register ``progress_apply`` on pandas objects so the two tokenization
# passes below report progress.
tqdm.pandas()

# Tokenize every sentence and join the remaining tokens with single spaces.
df['Arabic'] = df['Arabic'].progress_apply(lambda x: ' '.join(ar_tokenization(x)))
df['Urdu'] = df['Urdu'].progress_apply(lambda x: ' '.join(ur_tokenization(x)))

# Both columns hold strings by now: clean_text coerces with str() and the
# joins above always return a string, so a missing cell shows up as '' rather
# than NaN and dropna alone cannot remove it. Also drop rows where either
# side is empty, otherwise '' is used as a training sample or a target label.
df = df.dropna(subset=["Arabic", "Urdu"])
has_text = df['Arabic'].str.strip().ne('') & df['Urdu'].str.strip().ne('')
df = df[has_text]

# Parallel lists: Arabic inputs and the Urdu strings used as class labels.
X_raw = df['Arabic'].astype(str).tolist()
y = df['Urdu'].astype(str).tolist()

# Character-level TF-IDF features over 1- to 3-character n-grams.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')
X = vectorizer.fit_transform(X_raw)

# Hold out 20% of the rows for evaluation; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2022,
    test_size=0.2,
)

# k-nearest-neighbours classifier with default settings; every distinct
# string in y is a class label.
clf = KNeighborsClassifier().fit(X_train, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

# Spot-check the classifier on sample input (written without spaces, the
# form clean_text produces).
test_words = ['أينالمسجد']

for word in test_words:
    vec = vectorizer.transform([word])
    # predict returns one label per input row; a single row was passed.
    (urdu,) = clf.predict(vec)
    print(f"Arabic: {word} ➝ Urdu: {urdu}")

# Write the fitted classifier and the fitted vectorizer to disk with joblib.
joblib.dump(value=clf, filename='arabic_to_urdu_model.pkl')
joblib.dump(value=vectorizer, filename='arabic_vectorizer.pkl')

# Coerce both columns to whitespace-trimmed strings before pairing them.
for _col in ('Urdu', 'Roman Urdu'):
    df[_col] = df[_col].astype(str).str.strip()

# Urdu text -> Roman Urdu text. When an Urdu string occurs in several rows,
# the last row's Roman Urdu value is the one kept.
translit_dict = {
    urdu_text: roman_text
    for urdu_text, roman_text in zip(df['Urdu'], df['Roman Urdu'])
}

def transliterate_urdu(word):
    """Look up the Roman Urdu form of ``word`` in ``translit_dict``.

    Surrounding whitespace is trimmed before the lookup. If the trimmed word
    is not a key of the dictionary, the placeholder "؟" is returned.
    """
    key = word.strip()
    if key in translit_dict:
        return translit_dict[key]
    return "؟"

# Spot-check the dictionary lookup on sample input.
test_words = ['مسجدکہاںہے']

for w in test_words:
    roman = transliterate_urdu(w)
    print(f"Urdu: {w} ➝ Roman Urdu: {roman}")

# Pickle the lookup table to disk.
with open("urdu_roman_translit.pkl", mode="wb") as f:
    pickle.dump(translit_dict, file=f)

# In [None]:
def transliterate_urdu(word):
    """Return the ``translit_dict`` entry for ``word``, or "؟" if there is none.

    ``word`` is stripped of surrounding whitespace before it is used as the
    dictionary key.
    """
    try:
        return translit_dict[word.strip()]
    except KeyError:
        return "؟"

# Interactive loop: read Arabic text, predict an Urdu string with the fitted
# classifier, then look up its Roman Urdu form. Typing "exit", "quit" or
# "خروج" ends the session.
while True:
    try:
        user_input = input("👤 You (Arabic word): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt ends the session without a traceback.
        print("\n👋 Goodbye!")
        break

    # Checked on the raw input: the cleaning step below would erase the
    # Latin-script commands.
    if user_input.lower() in ["exit", "quit", "خروج"]:
        print("👋 Goodbye!")
        break

    # Apply the same preprocessing the training column went through
    # (clean_text, then ar_tokenization joined with spaces) so the character
    # n-grams of the query are comparable with those the model was fitted on.
    query = ' '.join(ar_tokenization(clean_text(user_input)))
    if not query:
        # Nothing usable remains; an empty string would be vectorized to all
        # zeros and still be given an arbitrary label.
        print("⚠️ Please enter Arabic text.")
        continue

    try:
        vec = vectorizer.transform([query])
        urdu_word = clf.predict(vec)[0]
    except Exception as e:
        # Best effort at this interactive boundary: report the problem and
        # fall back to the placeholder instead of ending the session.
        urdu_word = "؟"
        print("⚠️ Could not translate:", e)

    roman_word = transliterate_urdu(urdu_word)

    print("🧠 Urdu Translation:", urdu_word)
    print("🔤 Roman Transliteration:", roman_word)
