Data augmentation

In [1]:
!pip install transformers sentencepiece torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import pandas as pd
import csv
from transformers import MarianMTModel, MarianTokenizer
import re

# Function to detect if text is primarily Arabic
def is_arabic(text):
    # Arabic Unicode range (simplified check)
    arabic_pattern = re.compile(r'[\u0600-\u06FF]+')
    arabic_chars = len(arabic_pattern.findall(text))
    total_chars = len(text.strip())

    # If more than 30% of characters are Arabic, classify as Arabic
    if total_chars > 0 and arabic_chars / total_chars > 0.3:
        return True
    return False

# Load translation models
# For French to Arabic
fr_ar_model_name = "Helsinki-NLP/opus-mt-fr-ar"
fr_ar_tokenizer = MarianTokenizer.from_pretrained(fr_ar_model_name)
fr_ar_model = MarianMTModel.from_pretrained(fr_ar_model_name)

# For Arabic to French
ar_fr_model_name = "Helsinki-NLP/opus-mt-ar-fr"
ar_fr_tokenizer = MarianTokenizer.from_pretrained(ar_fr_model_name)
ar_fr_model = MarianMTModel.from_pretrained(ar_fr_model_name)

# Translation function
def translate_text(text, source_lang, target_lang):
    if source_lang == "fr" and target_lang == "ar":
        tokenizer = fr_ar_tokenizer
        model = fr_ar_model
    else:  # ar to fr
        tokenizer = ar_fr_tokenizer
        model = ar_fr_model

    # Skip empty text
    if not text or text.strip() == "":
        return ""

    try:
        # Tokenize and translate
        inputs = tokenizer(text, return_tensors="pt", padding=True)
        output = model.generate(**inputs)
        translation = tokenizer.decode(output[0], skip_special_tokens=True)
        return translation
    except Exception as e:
        print(f"Translation error: {e} for text: {text}")
        return text  # Return original text if translation fails

# Main data augmentation function
def augment_data(input_file, output_file):
    try:
        # Read the input CSV with Windows-1252 encoding
        df = pd.read_csv(input_file, encoding='utf-8') #cp1252

        # Create a copy for the augmented rows
        translated_df = df.copy()

        # Process the third column (assuming 0-based indexing)
        third_col_name = df.columns[3]

        # Translate based on language detection
        for idx, row in translated_df.iterrows():
            text = str(row[third_col_name])
            if is_arabic(text):
                translated_df.at[idx, third_col_name] = translate_text(text, "ar", "fr")
            else:
                translated_df.at[idx, third_col_name] = translate_text(text, "fr", "ar")

        # Combine original and translated dataframes
        result_df = pd.concat([df, translated_df], ignore_index=True)

        # Save to output file with UTF-8 encoding
        result_df.to_csv(output_file, index=False, encoding='utf-8')

        print(f"Augmentation complete. Results saved to {output_file}")
        print(f"Original rows: {len(df)}, New total: {len(result_df)}")

    except Exception as e:
        print(f"Error during data augmentation: {e}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/827k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/925k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.19M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/918k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/824k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/311M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/311M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [2]:
import pandas as pd
import csv
from transformers import MarianMTModel, MarianTokenizer
import re

# Improved function to detect if text is primarily Arabic
def is_arabic(text):
    # More comprehensive Arabic Unicode range
    arabic_pattern = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+')

    # Count Arabic characters
    arabic_chars = sum(len(match) for match in arabic_pattern.findall(text))
    total_chars = len(text.strip())

    # Debug output
    #print(f"Text: {text[:30]}... | Arabic chars: {arabic_chars}/{total_chars}")

    # If more than 30% of characters are Arabic, classify as Arabic
    if total_chars > 0 and arabic_chars / total_chars > 0.3:
        return True
    return False

# Load translation models
#print("Loading translation models...")

# For French to Arabic
fr_ar_model_name = "Helsinki-NLP/opus-mt-fr-ar"
fr_ar_tokenizer = MarianTokenizer.from_pretrained(fr_ar_model_name)
fr_ar_model = MarianMTModel.from_pretrained(fr_ar_model_name)
#print("Loaded French → Arabic model")

# For Arabic to French
ar_fr_model_name = "Helsinki-NLP/opus-mt-ar-fr"
ar_fr_tokenizer = MarianTokenizer.from_pretrained(ar_fr_model_name)
ar_fr_model = MarianMTModel.from_pretrained(ar_fr_model_name)
#print("Loaded Arabic → French model")

# Improved translation function with debugging
def translate_text(text, source_lang, target_lang):
    print(f"Translating from {source_lang} to {target_lang}: {text[:30]}...")

    if source_lang == "fr" and target_lang == "ar":
        tokenizer = fr_ar_tokenizer
        model = fr_ar_model
        #print("Using French → Arabic model")
    else:  # ar to fr
        tokenizer = ar_fr_tokenizer
        model = ar_fr_model
        #print("Using Arabic → French model")

    # Skip empty text
    if not text or text.strip() == "":
        return ""

    try:
        # Tokenize and translate
        inputs = tokenizer(text, return_tensors="pt", padding=True)
        output = model.generate(**inputs)
        translation = tokenizer.decode(output[0], skip_special_tokens=True)
        #print(f"Translation result: {translation[:30]}...")
        return translation
    except Exception as e:
        #print(f"Translation error: {e} for text: {text}")
        return text  # Return original text if translation fails

# Main data augmentation function
def augment_data(input_file, output_file):
    try:
        # Read the input CSV
        try:
            df = pd.read_csv(input_file, encoding='utf-8')
            #print(f"Successfully read file using UTF-8 encoding")
        except UnicodeDecodeError:
            df = pd.read_csv(input_file, encoding='cp1252')
            #print(f"Successfully read file using Windows-1252 encoding")

        # Create a copy for the augmented rows
        translated_df = df.copy()

        # Process the third column (assuming 0-based indexing, index 3 is the 4th column)
        third_col_name = df.columns[3]
        print(f"Translating column: {third_col_name}")

        # Check first few values to confirm
        #print("\nSample values from column to be translated:")
        #for i, val in enumerate(df[third_col_name].head().values):
         #   print(f"  Row {i+1}: {val}")
        #print()

        # Translate based on language detection with explicit debug output
        for idx, row in translated_df.iterrows():
            text = str(row[third_col_name])
            is_arabic_text = is_arabic(text)

            if is_arabic_text:
                #print(f"Row {idx+1}: Detected ARABIC text")
                translated_df.at[idx, third_col_name] = translate_text(text, "ar", "fr")
            else:
                #print(f"Row {idx+1}: Detected NON-ARABIC text (assuming French)")
                translated_df.at[idx, third_col_name] = translate_text(text, "fr", "ar")

        # Combine original and translated dataframes
        result_df = pd.concat([df, translated_df], ignore_index=True)

        # Save to output file with UTF-8 encoding
        result_df.to_csv(output_file, index=False, encoding='utf-8')

        print(f"\nAugmentation complete. Results saved to {output_file}")
        print(f"Original rows: {len(df)}, New total: {len(result_df)}")

    except Exception as e:
        print(f"Error during data augmentation: {e}")
        import traceback
        traceback.print_exc()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/827k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/925k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.19M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/307M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/918k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/824k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/311M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/311M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [3]:
# Replace with your input and output file paths
input_file = "opportunités.csv"
output_file = "augmented_Opportunités.csv"

augment_data(input_file, output_file)

Translating column: Intitulé du projet
Translating from fr to ar: Refonte du système d’informati...
Translating from fr to ar: création d'un site web au prof...
Translating from fr to ar: conception,développemnt et aud...
Translating from fr to ar: acquisition d'un logiciel pour...
Translating from fr to ar: acquisition d'un logicielle po...
Translating from fr to ar: mise en place d'un portail num...
Translating from fr to ar: conception développement héber...
Translating from fr to ar: mise en place d'un progiciel d...
Translating from ar to fr: التزود بمواد و معدات تنظيف و ت...
Translating from fr to ar: Acquisition Logiciel de suivi ...
Translating from fr to ar: acquisition ou le développemen...
Translating from fr to ar: Travaux de raccordement électr...
Translating from fr to ar: Mise en place d'un portail de ...
Translating from fr to ar: Mettre en place une plateforme...
Translating from fr to ar: La fourniture de services de d...
Translating from fr to ar: ENTRETIEN DU POSTE 

--------------------------------------
Method 1: **mBert**,multi-language Bert, to use for both french and arabic.

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
import numpy as np

# Load the mBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')
classifier = None

def extract_embeddings(texts):
    """Extract embeddings from text using mBERT."""
    embeddings = []

    for text in texts:
        if isinstance(text, str):
            # Tokenize text
            inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)

            # Get embeddings
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the [CLS] token embedding as the representation of the text
            embedding = outputs.last_hidden_state[:, 0, :].numpy()[0]
            embeddings.append(embedding)
        else:
            # Handle NaN or non-string values
            embeddings.append(np.zeros(768))

    return np.array(embeddings)

def train(train_csv, text_column="Intitulé du projet", label_column="Selection"):
    """Train the classifier on the provided dataset."""
    global classifier

    # Load data
    data = pd.read_csv(train_csv)

    # Print columns for debugging
    print(f"Available columns: {data.columns.tolist()}")

    # Check if the required columns exist
    if text_column not in data.columns or label_column not in data.columns:
        raise ValueError(f"Required columns not found. Available columns: {data.columns.tolist()}")

    # Extract texts and labels
    texts = data[text_column].fillna("").tolist()

    # Convert labels to binary format if needed
    if data[label_column].dtype == object:
        # If labels are text-based (like 'yes'/'no' or 'oui'/'non')
        label_map = {'yes': 1, 'oui': 1, 'Yes': 1, 'Oui': 1, 'Y': 1, 'O': 1, 'true': 1, 'True': 1,
                    'no': 0, 'non': 0, 'No': 0, 'Non': 0, 'N': 0, 'false': 0, 'False': 0}
        labels = [label_map.get(str(label).strip(), 0) for label in data[label_column]]
    else:
        # If labels are already numeric
        labels = data[label_column].fillna(0).tolist()

    # Ensure labels are binary (0 or 1)
    labels = [1 if label > 0 else 0 for label in labels]

    # Extract embeddings
    X = extract_embeddings(texts)
    y = np.array(labels)

    # Split into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Print class distribution
    print(f"Training set: {sum(y_train)} positive, {len(y_train) - sum(y_train)} negative")
    print(f"Validation set: {sum(y_val)} positive, {len(y_val) - sum(y_val)} negative")

    # Train a logistic regression classifier
    #classifier = LogisticRegression(max_iter=1000, C=1.0)
    classifier = LogisticRegression(max_iter=1000, C=0.1)
    classifier.fit(X_train, y_train)

    # Evaluate on validation set
    y_pred = classifier.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred)

    print(f"Validation Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    return accuracy

def predict(text):
    """Predict whether a single opportunity text is relevant (1) or not (0)."""
    if classifier is None:
        raise ValueError("Model not trained. Please train the model first.")

    embedding = extract_embeddings([text])
    prediction = classifier.predict(embedding)[0]
    probability = classifier.predict_proba(embedding)[0][1]

    return prediction, probability

def filter_csv(input_csv, output_csv, text_column="Intitulé du projet", threshold=0.5):
    """Filter opportunities from a CSV file and save the results."""
    if classifier is None:
        raise ValueError("Model not trained. Please train the model first.")

    # Load data
    data = pd.read_csv(input_csv)

    # Check if the required column exists
    if text_column not in data.columns:
        raise ValueError(f"Required column '{text_column}' not found. Available columns: {data.columns.tolist()}")

    # Extract texts
    texts = data[text_column].fillna("").tolist()

    # Extract embeddings
    X = extract_embeddings(texts)

    # Predict
    predictions = classifier.predict(X)
    probabilities = classifier.predict_proba(X)[:, 1]

    # Add predictions to the DataFrame
    data['prediction'] = predictions
    data['confidence'] = probabilities

    # Filter based on threshold
    filtered_data = data[data['confidence'] >= threshold]

    # Save the results
    filtered_data.to_csv(output_csv, index=False)

    print(f"Total opportunities: {len(data)}")
    print(f"Filtered opportunities: {len(filtered_data)}")
    print(f"Filter rate: {len(filtered_data) / len(data) * 100:.2f}%")

    return filtered_data

# Example usage:
# 1. Train the model with correct column names
# train('opportunités.csv', text_column="Intitulé du projet", label_column="Selection")
#
# 2. Filter new opportunities
# filter_csv('new_opportunities.csv', 'filtered_results.csv', text_column="Intitulé du projet")

In [None]:
def train(train_csv, text_column="Intitulé du projet", label_column="Selection"):
    global classifier

    # Load data
    data = pd.read_csv(train_csv)
    texts = data[text_column].fillna("").tolist()
    labels = data[label_column].fillna(0).astype(int).tolist()

    # Extract embeddings
    X = extract_embeddings(texts)
    y = np.array(labels)

    # Split into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train classifier
    classifier = LogisticRegression(max_iter=1000, C=1.0)
    classifier.fit(X_train, y_train)

    return X, y, X_train, X_val, y_train, y_val  # Return values for debugging
X, y, X_train, X_val, y_train, y_val = train('augmented_Opportunités.csv')


In [None]:
print(f"Label Distribution: {sum(y)} positive, {len(y) - sum(y)} negative")

Label Distribution: 90 positive, 18 negative


In [None]:
df_embeddings = pd.DataFrame(X)
df_embeddings['label'] = y
print(df_embeddings.groupby('label').mean())  # Check if there's a clear difference

            0         1         2         3         4         5         6    \
label                                                                         
0      0.076377 -0.050751  0.198102 -0.006982  0.018746  0.007492  0.007923   
1      0.082631 -0.188779  0.165558 -0.017263 -0.079743  0.072469 -0.029041   

            7         8         9    ...       758       759       760  \
label                                ...                                 
0      0.107654 -0.083134  0.166931  ...  0.025959 -0.204115 -0.290917   
1      0.111151 -0.183841  0.099543  ... -0.036552 -0.219650 -0.289929   

            761       762       763       764       765       766       767  
label                                                                        
0     -0.218194  0.101170  0.280932  0.070906  0.153270  0.069991 -0.088098  
1     -0.255595  0.098556  0.305426  0.172300  0.171993  0.181057 -0.139834  

[2 rows x 768 columns]


In [None]:
train_acc = classifier.score(X_train, y_train)
val_acc = classifier.score(X_val, y_val)
print(f"Train Accuracy: {train_acc:.4f}, Validation Accuracy: {val_acc:.4f}")

Train Accuracy: 1.0000, Validation Accuracy: 0.9545


In [None]:
for text, true, pred in zip(texts, y_val, y_pred):
    if true != pred:
        print(f"Text: {text}\nTrue: {true}, Predicted: {pred}\n")

NameError: name 'texts' is not defined

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load CSV file
df = pd.read_csv("augmented_Opportunités.csv")

# Split into train and test (80% training, 20% testing)
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Save the split datasets
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)

print("Training and testing datasets saved.")

Training and testing datasets saved.


In [None]:
train('augmented_Opportunités.csv', text_column="Intitulé du projet", label_column="Selection")

(array([[ 0.18011768,  0.07875498, -0.06401318, ...,  0.18576127,
          0.18557368,  0.11767497],
        [ 0.158455  , -0.04974524,  0.0615683 , ...,  0.05951943,
          0.09819075, -0.076176  ],
        [ 0.1451984 , -0.19096574,  0.20086794, ...,  0.13962953,
          0.10186116, -0.01705173],
        ...,
        [ 0.23355992, -0.13148242,  0.45955503, ...,  0.31312042,
          0.32187784, -0.07692916],
        [ 0.06846351, -0.3154884 ,  0.5745339 , ...,  0.07702123,
         -0.05938891, -0.06102293],
        [-0.04131151, -0.00482536,  0.4159309 , ...,  0.19651444,
          0.38018808, -0.07982599]], dtype=float32),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
filter_csv('Tuneps (1).csv', 'filtered_results.csv', text_column="Intitulé du projet")

Total opportunities: 78
Filtered opportunities: 43
Filter rate: 55.13%


Unnamed: 0,N° consultation,Client,Date Publication,Intitulé du projet,Date Expiration,epBidMasterId,info,Lien,prediction,confidence
1,20250203002,Commissariat Régional au Développement Agricol...,27/02/2025,"Fourniture, transport et installation des syst...",26/03/2025 10:00,82180,info,https://www.tuneps.tn/portail/offres/details/8...,1,0.9651
2,20250202999,Commisariat régional de l’éducation de Sfax 1,27/02/2025,قسط عدد 1 :بناء قاعة إختصاص معهد بئر علي 2\nقس...,04/04/2025 10:00,82176,info,https://www.tuneps.tn/portail/offres/details/8...,1,0.979677
4,20250202997,Commune de Sousse,27/02/2025,"Projet de réhabilitation de lycée l'annexe ""Be...",03/04/2025 14:00,82172,info,https://www.tuneps.tn/portail/offres/details/8...,1,0.947513
5,20250202998,Hopital régional de béja,27/02/2025,التزود بمواد و معدات تنظيف و تعقيم لفائدة المس...,03/04/2025 10:00,82174,info,https://www.tuneps.tn/portail/offres/details/8...,1,0.726756
6,20250202996,"Direction Régionale de l’Equipement, de l’Habi...",27/02/2025,أشغـــــال التغليف السطـحــي للطــرقــات المرق...,31/03/2025 11:00,82171,info,https://www.tuneps.tn/portail/offres/details/8...,1,0.620558
8,20250202994,Conseil régional de siliana,27/02/2025,أشغال تهيئة الملعب البلدي بسيدي بورويس- قسط بن...,25/03/2025 10:00,82165,info,https://www.tuneps.tn/portail/offres/details/8...,1,0.603709
9,20250202992,Société Nationale d’Exploitation et de Distrib...,27/02/2025,AO 10-2025 Fourniture de chlore liquide en ren...,04/04/2025 09:00,82162,info,https://www.tuneps.tn/portail/offres/details/8...,1,0.62465
10,20250202991,Société Nationale d’Exploitation et de Distrib...,27/02/2025,AO 12-2025 Passation d’un marché cadre pour l’...,04/04/2025 09:00,82164,info,https://www.tuneps.tn/portail/offres/details/8...,1,0.822086
11,20250202990,"Direction Régionale de l’Equipement, de l’Habi...",27/02/2025,Rechargement et Revêtement des Accotements su...,28/03/2025 10:00,82156,info,https://www.tuneps.tn/portail/offres/details/8...,1,0.958596
12,S20250205543,Agence Tunisienne de la Formation Professionnelle,27/02/2025,Consultation N°05/2025 de services de nettoyag...,18/03/2025 10:00,211367,info,https://www.tuneps.tn/portail/consultations/co...,1,0.788969


------------------------
Finetuning Mbert

In [None]:
!pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import numpy as np

# Load data
df = pd.read_csv("augmented_Opportunités.csv")  # Your training CSV
df = df[['Intitulé du projet', 'Selection']].dropna()
df.columns = ['text', 'label']

# Convert labels to int (make sure 0=non-informatique, 1=informatique/dev)
df['label'] = df['label'].apply(lambda x: 1 if int(x) == 1 else 0)

# Split
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'].tolist(), df['label'].tolist(), test_size=0.2)

# Tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

# Tokenization function
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels}).map(tokenize, batched=True)
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels}).map(tokenize, batched=True)

# Set format for PyTorch
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mraoudha-hajji[0m ([33mraoudha-hajji-science-m-ta[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.578222
2,No log,0.521886
3,No log,0.469371
4,No log,0.47911


TrainOutput(global_step=44, training_loss=0.38980128548362036, metrics={'train_runtime': 1823.0166, 'train_samples_per_second': 0.189, 'train_steps_per_second': 0.024, 'total_flos': 22627550760960.0, 'train_loss': 0.38980128548362036, 'epoch': 4.0})

In [None]:
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    pred_label = torch.argmax(probs, dim=1).item()
    confidence = probs[0][1].item()
    return pred_label, confidence

# Example
text = "FOURNITURE DE TICKETS RESTAURANTS POUR LE PERSONNEL DE L’OACA"
pred, conf = predict(text)
print("Prediction:", pred, "| Confidence:", conf)

Prediction: 1 | Confidence: 0.8261643648147583


this model is having trouble seeing the sentence as a whole and thus labeling unrelated IT projects as 1. ^^^^

-------
method 2:  Sentence-Bert

In [4]:
!pip install sentence-transformers



In [34]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import re

# Load multilingual Sentence-BERT model
sbert_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

def extract_sbert_embeddings(texts):
    """Extract SBERT embeddings (768-d) from a list of texts."""
    return sbert_model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

def normalize_text(text):
    """Normalize text by removing unnecessary characters."""
    text = re.sub(r"[^\u0600-\u06FFa-zA-Z0-9\s.,:/\-()]", "", text)
    text = text.lower()
    return text

def train_with_sbert(train_csv, text_column="Intitulé du projet", label_column="Selection"):
    global classifier

    data = pd.read_csv(train_csv)
    if text_column not in data.columns or label_column not in data.columns:
        raise ValueError(f"Required columns not found in {data.columns.tolist()}")

    texts = data[text_column].fillna("").tolist()

    # Convert labels (already done well in your version)
    labels = data[label_column].fillna(0)
    labels = [1 if label > 0 else 0 for label in labels]

    # Use SBERT
    X = extract_sbert_embeddings(texts)
    y = np.array(labels)

    # Train-test split
    from sklearn.model_selection import train_test_split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train classifier
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train, y_train)

    # Eval
    train_acc = classifier.score(X_train, y_train)
    val_acc = classifier.score(X_val, y_val)
    print(f"Train Accuracy: {train_acc:.4f}, Validation Accuracy: {val_acc:.4f}")

    from sklearn.metrics import classification_report
    y_pred = classifier.predict(X_val)
    print(classification_report(y_val, y_pred))

    return val_acc

def predict(text):
    embedding = extract_sbert_embeddings([text])
    prediction = classifier.predict(embedding)[0]
    confidence = classifier.predict_proba(embedding)[0][1]
    return prediction, confidence

def filter_csv(input_csv, output_csv, text_column="Intitulé du projet", threshold=0.6):
    """Filter opportunities from a CSV file and save the results."""
    if classifier is None:
        raise ValueError("Model not trained. Please train the model first.")

    # Load data
    data = pd.read_csv(input_csv)

    # Check if the required column exists
    if text_column not in data.columns:
        raise ValueError(f"Required column '{text_column}' not found. Available columns: {data.columns.tolist()}")

    # Extract texts
    texts = data[text_column].fillna("").tolist()

    # Apply normalization
    normalized_texts = [normalize_text(text) for text in texts]

    # Integrate the Prompt
    prompt = """
    Classify the following project as related to computer science, software development, or technology.
    Do not exclude projects that may not explicitly use terms like 'coding,' 'programming,' or 'software development,'
    but still involve tasks related to technology, digital systems, machine learning, AI, or IT infrastructure.
    Keep in mind that projects related to coding or technical development might not always use the exact keyword.
    Ensure that any project with a technical focus is not mistakenly labeled as unrelated.
    """

    # Filter based on the guidance in the prompt
    # Here we simulate the model applying the context from the prompt:
    relevant_projects = []
    for text in texts:
        # Apply your logic here to make the prediction more inclusive
        # For example, if the text contains certain key technology terms, retain it
        # This can be customized based on the prompt’s focus
        if any(keyword in text.lower() for keyword in ['web', 'logiciel', 'plateforme', 'application', "système d'information"]):
            relevant_projects.append(text)
        else:
            # Otherwise, pass through the normal filtering logic based on SBERT model predictions
            normalized_text = normalize_text(text)
            embedding = extract_sbert_embeddings([normalized_text])
            prediction = classifier.predict(embedding)[0]
            confidence = classifier.predict_proba(embedding)[0][1]
            if confidence >= threshold:
                relevant_projects.append(text)

    # Filtered results
    filtered_data = data[data[text_column].isin(relevant_projects)]

    # Save the results
    filtered_data.to_csv(output_csv, index=False)

    print(f"Total opportunities: {len(data)}")
    print(f"Filtered opportunities: {len(filtered_data)}")
    print(f"Filter rate: {len(filtered_data) / len(data) * 100:.2f}%")

    return filtered_data

In [35]:
import pandas as pd

train_with_sbert('augmented_Opportunités.csv', text_column="Intitulé du projet", label_column="Selection")

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Train Accuracy: 0.9135, Validation Accuracy: 0.8077
              precision    recall  f1-score   support

           0       1.00      0.38      0.55         8
           1       0.78      1.00      0.88        18

    accuracy                           0.81        26
   macro avg       0.89      0.69      0.71        26
weighted avg       0.85      0.81      0.78        26



0.8076923076923077

In [36]:
filter_csv('augmented_Opportunités.csv', 'filtered_results.csv', text_column="Intitulé du projet")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Total opportunities: 130
Filtered opportunities: 91
Filter rate: 70.00%


Unnamed: 0,N°,Client,N° consultation,Intitulé du projet,Lien,Selection
1,2,UNIVJENDOUBA - Université de Jendouba,N° 04/2023,création d'un site web au profit de l'universi...,,1
2,3,ISET Jendouba,S20240102471,"conception,développemnt et audit du portail dy...",https://www.tuneps.tn/portail/consultations/co...,1
3,4,ISET Jendouba,S20240102882,acquisition d'un logiciel pour la gestion numé...,https://www.tuneps.tn/portail/consultations/co...,1
4,5,Commune de Nabeul,S20240102982,acquisition d'un logicielle pour la Gestion de...,https://www.tuneps.tn/portail/consultations/co...,1
5,6,Office National des mines,S20240103734,mise en place d'un portail numérique pour la b...,https://www.tuneps.tn/portail/consultations/co...,1
...,...,...,...,...,...,...
116,41,SRT Médenine,,تطوير منصة لبيع الاشتراكات على الإنترنت,,1
117,42,ENSIT - Ecole Nationale Supérieure dIngénieurs...,,تم تركيب نظام معلومات جامعي SIU لمصلحة المعهد ...,,1
118,43,Office Nationale de tourisme,20241002335,اختيار مقدم خدمات لتطوير وتنفيذ برنامج رقمي لم...,,1
119,44,Ciments Bizerte,,تصميم وتطوير وتنفيذ حل على شبكة الويب العالمية...,,1


In [37]:
import pandas as pd

# Load the CSV files
df1 = pd.read_csv('augmented_Opportunités.csv')
df2 = pd.read_csv('filtered_results.csv')

# Ensure the column exists in both
column = "Intitulé du projet"

# Get only the rows from df1 where the value in 'Intitulé du projet' is not in df2
diff = df1[~df1[column].isin(df2[column])]

# Save the difference to a new CSV
#diff.to_csv('difference.csv', index=False)
print(diff)

#print("Rows present in file1.csv but not in file2.csv have been saved to difference.csv")

       N°                                             Client  \
0       1                                              ANETI   
8      88                           Hopital régional de béja   
11   1010  Centre National de Recherches en Sciences des ...   
15   1313  Commissariat Régional au Développement Agricol...   
18   1515              Ecole Nationale d’Ingénieurs de Tunis   
20   1616       Le conseil régional de gouvernorat de Ariana   
22   1717                        Foyer Universitaire Rakkada   
25   1919              Régie des Sondages Hydrauliques (RSH)   
29   2222  Direction Régionale de l’Equipement, de l’Habi...   
32   2424  Commissariat Régional au Développement Agricol...   
34   2525         Société Tunisienne de Sidérurgie ELFOULADH   
36   2626                       Office des Fermes Militaires   
56     46                        Conseil régional de siliana   
57     47            Commune Ech-Charayaa- Machrek Ech-Chams   
58     48      Office de l’Aviation Civi