# Chargement du jeu de données traité


In [2]:
import pandas as pd 
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

data_path="/opt/airflow/data/raw/us_airline_sentiment_raw.csv"
df=pd.read_csv(data_path)
df["text_clean"] = df["text"].str.replace(r'@[^\s]+', '', regex=True)
df["text_clean"] = df["text_clean"].fillna("").str.strip()
df["text_clean"] = df["text_clean"].str.replace(r"\s+", " ", regex=True)
df.rename(columns={'tweet_id': 'id', 'airline_sentiment': 'label'}, inplace=True)

df["id"]=df["id"].astype(str)
df.head()

Unnamed: 0,id,label,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,text_clean
0,5.70306e+17,neutral,1.0,,,Virgin America,cairdin,0,@VirginAmerica What @dhepburn said.,,2/24/2015 11:35,,Eastern Time (US & Canada),What said.
1,5.70301e+17,positive,0.3486,,0.0,Virgin America,jnardino,0,@VirginAmerica plus you've added commercials t...,,2/24/2015 11:15,,Pacific Time (US & Canada),plus you've added commercials to the experienc...
2,5.70301e+17,neutral,0.6837,,,Virgin America,yvonnalynn,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/2015 11:15,Lets Play,Central Time (US & Canada),I didn't today... Must mean I need to take ano...
3,5.70301e+17,negative,1.0,Bad Flight,0.7033,Virgin America,jnardino,0,@VirginAmerica it's really aggressive to blast...,,2/24/2015 11:15,,Pacific Time (US & Canada),"it's really aggressive to blast obnoxious ""ent..."
4,5.70301e+17,negative,1.0,Can't Tell,1.0,Virgin America,jnardino,0,@VirginAmerica and it's a really big bad thing...,,2/24/2015 11:14,,Pacific Time (US & Canada),and it's a really big bad thing about it


Remarque — Vérification du chargement
Si la table s'affiche, le jeu de données traité est disponible et prêt pour l'encodage. Sinon, vérifiez le chemin du fichier ou exécutez d'abord l'étape de nettoyage (`03_text_cleaning.ipynb`).

# Split avant toute augmentation

In [3]:
X = df[["id", "text_clean"]]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

train_df = pd.concat([X_train, y_train], axis=1)


## Augmenter UNIQUEMENT les classes minoritaires du train


In [4]:
negative_df = train_df[train_df['label'] == 'negative']
neutral_df  = train_df[train_df['label'] == 'neutral']
positive_df = train_df[train_df['label'] == 'positive']

max_count = len(negative_df)
augmented_data = []


# Neutral

In [5]:
neutral_samples = neutral_df.sample(
    n=max_count - len(neutral_df),
    replace=True,
    random_state=42
)

for row in neutral_samples.itertuples():
    try:
        aug_text = aug_synonym.augment(row.text_clean)
        augmented_data.append({
            "text_clean": aug_text,
            "label": row.label
        })
    except:
        pass
print(len(neutral_samples))

4864


# Positive

In [9]:
augmented_data = []

positive_samples = positive_df.sample(
    n=max_count - len(positive_df),
    replace=True,
    random_state=42
)

for row in positive_samples.itertuples():
    try:
        aug_text = aug_synonym.augment(row.text_clean)
        augmented_data.append({
            "text_clean": aug_text,
            "label": row.label
        })
    except:
        pass


# Reconstituer le TRAIN final

In [10]:
augmented_df = pd.DataFrame(augmented_data)

train_df_final = pd.concat(
    [train_df[["text_clean", "label"]], augmented_df],
    ignore_index=True
)



# Génération des embeddings

Nous avons choisi le modèle intfloat/e5-large-v2 car il génère des embeddings de qualité pour capturer la similarité sémantique entre phrases, ce qui est idéal pour notre analyse de texte.
Il est également léger et rapide, ce qui permet de traiter de gros volumes de données sans trop de ressources.
Ainsi, il représente un bon compromis entre précision et performance pour notre projet.

## Encoder uniquement le train augmenté

In [25]:


model_name="intfloat/e5-large-v2"
device="cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")

train_embedding_path="/opt/airflow/data/embeddings/train_embedding.npy"
test_embedding_path="/opt/airflow/data/embeddings/test_embedding.npy"
train_metadata_path="/opt/airflow/data/metadata/train_metadata.csv"
test_metadata_path="/opt/airflow/data/metadata/test_metadata.csv"



model=SentenceTransformer(model_name,device)
print("Encoding train embeddings...")

train_embedding = model.encode(
    train_df_final["text_clean"].tolist(),
    batch_size=64,
    convert_to_numpy=True,
    show_progress_bar=True
)


print("Train embeddings done!")

print("Encoding test embeddings...")

test_embedding = model.encode(
    X_test["text_clean"].tolist(),
    batch_size=64,
    convert_to_numpy=True,
    show_progress_bar=True
)

print("Test embeddings done!")

X_train["id"] = X_train["id"].astype(str) + "_" + X_train["id"].index.astype(str)
X_test["id"] = X_test["id"].astype(str) + "_" + X_test["id"].index.astype(str)

train_metadata=pd.DataFrame({
    "id":X_train["id"].tolist(),
    "label":y_train.to_numpy()
})

test_metadata=pd.DataFrame({
    "id":X_test["id"].tolist(),
    "label":y_test.to_numpy()
})

train_metadata.to_csv(train_metadata_path,index=False)
test_metadata.to_csv(test_metadata_path,index=False)

np.save(train_embedding_path,train_embedding)
np.save(test_embedding_path,test_embedding)

print("Embeddings and metadata saved.")


Using device: cpu
Encoding train embeddings...


Batches: 100%|██████████| 183/183 [14:42<00:00,  4.82s/it]


Train embeddings done!
Encoding test embeddings...


Batches: 100%|██████████| 46/46 [03:48<00:00,  4.97s/it]


Test embeddings done!
Embeddings and metadata saved.


- `train_embedding.npy` et `test_embedding.npy` contiennent les vecteurs d'embeddings au format NumPy.
- `train_metadata.csv` et `test_metadata.csv` contiennent les `id` et `label` associés aux embeddings.
Ces fichiers servent pour l'entraînement du classifieur, l'évaluation et l'indexation dans une base vectorielle.
Conservez-les dans `data/embeddings` et `data/metadata` pour les étapes suivantes.

#  Initialisation de la base vectorielle (Chroma)
La création d'un client Chroma persistant pointant vers `data/chroma_db`. Cela permet d'indexer et de rechercher des embeddings localement.

In [29]:
import chromadb
from chromadb.config import Settings


In [30]:
client=chromadb.PersistentClient(path='/opt/airflow/data/chroma_dataBase')
train_collection=client.create_collection("avis_train",get_or_create=True)
test_colletion=client.get_or_create_collection(name="avis_test")


# Indexation des embeddings — train
Remarque : la boucle suivante ajoute les embeddings d'entraînement par lots dans la collection `avis_train`. 

In [31]:
batch_size = 1000
metadatas_full = [{"label": i, "split":"train"} for i in train_metadata["label"].to_numpy()]
n = len(train_metadata["id"])

if "avis_train" in client.list_collections():
    client.delete_collection("avis_train")

train_collection = client.get_or_create_collection("avis_train")

for i in range(0, n, batch_size):  
    ids = train_metadata["id"][i:i+batch_size].tolist()
    metadatas = metadatas_full[i:i+batch_size]
    documents = X_train['text_clean'][i:i+batch_size].tolist()  
    batch_embeddings = train_embedding[i:i+batch_size].tolist()  

    train_collection.add(
        ids=ids,
        embeddings=batch_embeddings,
        metadatas=metadatas,
        documents=documents  
    )
    print(f"Added train batch {i} to {i+len(ids)}")


Added train batch 0 to 1000
Added train batch 1000 to 2000
Added train batch 2000 to 3000
Added train batch 3000 to 4000
Added train batch 4000 to 5000
Added train batch 5000 to 6000
Added train batch 6000 to 7000
Added train batch 7000 to 8000
Added train batch 8000 to 9000
Added train batch 9000 to 10000
Added train batch 10000 to 11000
Added train batch 11000 to 11712


Après exécution, la collection `avis_train` contiendra les embeddings d'entraînement.

# Indexation des embeddings — test
La boucle suivante ajoute les embeddings de test par lots dans la collection `avis_test`. Les impressions indiquent la progression.

In [32]:
batch_size = 1000
metadatas_full = [{"label": i, "split":"test"} for i in test_metadata["label"].to_numpy()]
n = len(test_metadata["id"])

if "avis_test" in client.list_collections():
    client.delete_collection("avis_test")

test_collection = client.get_or_create_collection("avis_test")

for i in range(0, n, batch_size):  
    ids = test_metadata["id"][i:i+batch_size].tolist()
    metadatas = metadatas_full[i:i+batch_size]
    documents = X_test['text_clean'][i:i+batch_size].tolist()  
    batch_embeddings = test_embedding[i:i+batch_size].tolist()  

    test_collection.add(
        ids=ids,
        embeddings=batch_embeddings,
        metadatas=metadatas,
        documents=documents  
    )
    print(f"Added test batch {i} to {i+len(ids)}")


Added test batch 0 to 1000
Added test batch 1000 to 2000
Added test batch 2000 to 2928
