In [None]:
!pip install sentence-transformers optuna  # optuna is used for automatic hyperparameter search for the LightGBM model

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import optuna

In [None]:
# 1. Load data
df = pd.read_csv('synthetic_name_variations_1500_1.csv')

# The rest of the notebook reads exactly these columns ('label' is 0 or 1).
# Fail fast with a readable message instead of a KeyError during feature
# generation.
_required_columns = {'input_name', 'variation', 'label'}
_missing_columns = _required_columns.difference(df.columns)
if _missing_columns:
    raise ValueError(
        f"Input CSV is missing required columns: {sorted(_missing_columns)}"
    )

# 2. (Optional) Basic text normalization
# For Arabic you might strip tashkeel (diacritics) etc., but here we skip

# 3. Load the models
# Bi-encoder: produces one embedding per string.
bi_encoder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # For embeddings
# Cross-encoder: scores a (name, variation) pair jointly.
# NOTE(review): this checkpoint's name suggests it was trained on MS MARCO
# passage ranking; confirm it is suitable for multilingual name pairs.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')  # For similarity scoring

In [None]:
# 4. Generate features
# Each dataset row is made of the bi-encoder embeddings of the input name and
# of the variation, followed by four pairwise scores: cosine similarity,
# Manhattan distance, token-level Jaccard overlap and the cross-encoder score.
def generate_features(names, variations):
    """Build one feature row per (name, variation) pair.

    Args:
        names: Sequence of input-name strings.
        variations: Sequence of variation strings, aligned index-by-index
            with ``names``.

    Returns:
        2-D array with one row per pair, laid out as
        ``[name embedding | variation embedding | cosine | manhattan |
        jaccard | cross-encoder score]``.

    Raises:
        ValueError: If ``names`` and ``variations`` have different lengths.
    """
    names = list(names)
    variations = list(variations)
    if len(names) != len(variations):
        # zip() below would silently truncate to the shorter input.
        raise ValueError(
            f"names and variations must have the same length "
            f"(got {len(names)} and {len(variations)})"
        )

    # Embeddings
    emb_names = np.asarray(bi_encoder.encode(names, show_progress_bar=True))
    emb_vars = np.asarray(bi_encoder.encode(variations, show_progress_bar=True))

    # Row-wise cosine similarity. Only the similarity of pair i with itself is
    # needed, so avoid materialising the full N x N matrix just to read its
    # diagonal. Norms are clamped so an all-zero embedding yields 0, not NaN.
    eps = 1e-12
    norm_names = np.maximum(np.linalg.norm(emb_names, axis=1), eps)
    norm_vars = np.maximum(np.linalg.norm(emb_vars, axis=1), eps)
    cos_sim = np.sum(emb_names * emb_vars, axis=1) / (norm_names * norm_vars)

    manhattan = np.sum(np.abs(emb_names - emb_vars), axis=1)

    # Jaccard overlap of whitespace-separated tokens; max(1, ...) guards the
    # case where both strings are empty.
    jaccard = []
    for name, variation in zip(names, variations):
        name_tokens = set(name.split())
        variation_tokens = set(variation.split())
        union_size = len(name_tokens | variation_tokens)
        jaccard.append(len(name_tokens & variation_tokens) / max(1, union_size))

    # Cross-Encoder scores
    cross_scores = cross_encoder.predict(list(zip(names, variations)))

    return np.column_stack([emb_names, emb_vars, cos_sim, manhattan, jaccard, cross_scores])

# Feature matrix from the (input name, variation) pairs, plus the target labels.
X = generate_features(
    names=df['input_name'].to_list(),
    variations=df['variation'].to_list(),
)
y = df['label'].values

In [None]:
# 5. Train-Test Split
# Stratify on the label so the train and test splits keep the same class
# balance as the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
import lightgbm as lgb
import optuna

def objective(trial):
    """Optuna objective: mean cross-validated accuracy of a LightGBM classifier.

    The score is computed with stratified 3-fold cross-validation on the
    training split only, so the test split is never used for model selection
    and stays an unbiased hold-out for the final evaluation.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean accuracy across the cross-validation folds (to be maximized).
    """
    # Suggest hyperparameters
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'verbosity': -1,  # keep the per-fold training logs out of the output
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        # LightGBM ignores bagging_fraction unless bagging_freq is non-zero.
        # It is suggested (not hard-coded) so it is recorded in
        # study.best_params together with the other tuned values.
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }

    model = lgb.LGBMClassifier(**params)

    # Fixed, shuffled folds so every trial is scored on the same partitions.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    return float(scores.mean())

In [None]:
# Seed the sampler so the hyper-parameter search is reproducible across
# kernel restarts.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# Best hyperparameters
print("Best trial:")
trial = study.best_trial
print(f"  Accuracy: {trial.value:.4f}")
print("  Params: ", trial.params)

In [None]:
# Refit with the tuned hyper-parameters; 'objective' is set explicitly so the
# model is always trained as a binary classifier.
best_params = {**study.best_params, 'objective': 'binary'}

final_model = lgb.LGBMClassifier(**best_params).fit(
    X_train, y_train, eval_set=[(X_test, y_test)]
)

# Per-class precision / recall / F1 on the test split.
y_pred = final_model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix # Import confusion_matrix directly
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Confusion matrix of the final model's test-split predictions, drawn as an
# annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues").set(
    xlabel="Predicted Label",
    ylabel="True Label",
    title="Confusion Matrix",
)
plt.show()