In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import adjusted_rand_score, confusion_matrix
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel
import torch
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import linear_sum_assignment

ModuleNotFoundError: No module named 'google.colab'

# --- Chargement et nettoyage des données ---

In [None]:
# Download the NLTK resources needed by the preprocessing cells below.
# 'punkt_tab' is what word_tokenize looks up on recent NLTK releases (it
# replaced the pickled 'punkt' models); 'punkt' is kept for older releases.
# 'omw-1.4' is requested because some NLTK releases need it for WordNet
# lookups -- NOTE(review): harmless if unused, confirm against the pinned version.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

In [None]:
# Load the data from the CSV file and keep only the columns used downstream.
# The explicit .copy() makes `data` an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
data = pd.read_csv('Flipkart/flipkart_com-ecommerce_sample_1050.csv')
data = data[["uniq_id", "product_name", "description", "product_category_tree"]].copy()

In [None]:
# Category cleanup and extraction.
# Strip the '[', ']' and '"' characters from the category tree, then keep only
# the first element of the " >> "-separated chain as the top-level category.
# The cleaned column is assigned back explicitly: `inplace=True` on a column
# selection is chained assignment and does not update `data` under pandas
# Copy-on-Write.
data["product_category_tree"] = data["product_category_tree"].replace(
    to_replace=r'[\["\]]', value="", regex=True
)
category = (
    data["product_category_tree"]
    .str.split(" >> ", expand=True)
    .rename(columns={0: "Categorie"})
)
# Plain column assignment (instead of DataFrame.join) keeps the cell re-runnable:
# join raises on overlapping column names once "Categorie" already exists.
data["Categorie"] = category["Categorie"]

In [None]:
# Text cleaning
# Cleaning function that lower-cases the text and removes punctuation and digits
def clean_text(text):
    """Return `text` lower-cased, with every non-letter, non-whitespace character removed.

    Parameters
    ----------
    text : str or None or float
        Raw text. Missing values (None or float NaN, as produced by
        pandas for empty CSV cells) are treated as an empty string; any other
        non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text with leading/trailing whitespace stripped.
    """
    if not isinstance(text, str):
        # NaN is the only float that differs from itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    text = text.lower()
    # Characters are removed, not replaced by a space, so "t-shirt" -> "tshirt".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.strip()

In [None]:
# Build one cleaned column per raw text column (target column -> source column).
cleaned_columns = {
    'cleaned_description': 'description',
    'cleaned_product_name': 'product_name',
}
for target_col, source_col in cleaned_columns.items():
    data[target_col] = data[source_col].apply(clean_text)

In [None]:
# Tokenisation, stop-word removal and lemmatisation
def preprocess_text(column):
    """Tokenise each text of `column`, drop English stop words and lemmatise the rest.

    Parameters
    ----------
    column : pandas.Series
        Series of strings; anything exposing ``.apply`` works.

    Returns
    -------
    The result of ``column.apply`` -- one space-joined string of lemmatised
    tokens per input text.
    """
    english_stop_words = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    def _lemmatise_without_stop_words(raw_text):
        kept_tokens = []
        for token in word_tokenize(raw_text):
            if token in english_stop_words:
                continue
            kept_tokens.append(wordnet_lemmatizer.lemmatize(token))
        return ' '.join(kept_tokens)

    return column.apply(_lemmatise_without_stop_words)

In [None]:
# Run the tokenise / stop-word / lemmatise pipeline on the cleaned descriptions
data['processed_description'] = data['cleaned_description'].pipe(preprocess_text)

In [None]:
# Split the processed descriptions and their categories into train (70%) and test (30%) sets
split_texts = data['processed_description']
split_labels = data['Categorie']
X_train, X_test, y_train, y_test = train_test_split(
    split_texts,
    split_labels,
    random_state=0,
    test_size=0.3,
)

# --- Vectorisation Bag of Words (BoW) ---

In [None]:
# Bag-of-words counts: the vocabulary is learnt on the training texts only,
# then both splits are converted to dense arrays.
vectorizer_bow = CountVectorizer()
bow_train_sparse = vectorizer_bow.fit_transform(X_train)
bow_test_sparse = vectorizer_bow.transform(X_test)
X_train_bow = bow_train_sparse.toarray()
X_test_bow = bow_test_sparse.toarray()

# --- Vectorisation TF-IDF ---

In [None]:

# TF-IDF features restricted to terms whose document frequency lies in [10%, 85%];
# fitted on the training texts only.
tfidf_options = {'min_df': 0.1, 'max_df': 0.85}
vectorizer_tfidf = TfidfVectorizer(**tfidf_options)
tfidf_train_sparse = vectorizer_tfidf.fit_transform(X_train)
tfidf_test_sparse = vectorizer_tfidf.transform(X_test)
X_train_tfidf = tfidf_train_sparse.toarray()
X_test_tfidf = tfidf_test_sparse.toarray()

# --- Word2Vec ---

In [None]:
# Train a Word2Vec model on the training descriptions (one token list per document).
# A fixed seed and a single worker thread make the embeddings repeatable from
# run to run, in line with the random_state=0 used by the other steps.
# NOTE(review): gensim additionally requires a fixed PYTHONHASHSEED for
# bit-for-bit reproducibility across interpreter launches.
sentences = [sentence.split() for sentence in X_train]
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=0,
)

In [None]:
# Function returning the Word2Vec embedding of each sentence in a list
def get_word2vec_embeddings(text_list, model):
    """Embed each text as the mean of the Word2Vec vectors of its known words.

    Parameters
    ----------
    text_list : iterable of str
        Whitespace-separated texts.
    model : object exposing ``wv``
        ``model.wv`` must support ``word in wv``, ``wv[word]`` and expose
        ``vector_size`` (as a gensim Word2Vec model does).

    Returns
    -------
    numpy.ndarray of shape (n_texts, vector_size)
        A text with no in-vocabulary word (including an empty text) gets an
        all-zero row, so the result is always rectangular and NaN-free;
        averaging an empty list would otherwise yield a scalar NaN.
    """
    keyed_vectors = model.wv
    dim = keyed_vectors.vector_size
    rows = []
    for text in text_list:
        known_vectors = [keyed_vectors[word] for word in text.split() if word in keyed_vectors]
        if known_vectors:
            rows.append(np.mean(known_vectors, axis=0))
        else:
            rows.append(np.zeros(dim, dtype=np.float32))
    # reshape keeps a 2-D (0, dim) result for an empty input.
    return np.array(rows).reshape(-1, dim)

In [None]:
# Mean-pooled Word2Vec features for both splits, using the model trained on X_train.
X_train_w2v, X_test_w2v = (
    get_word2vec_embeddings(split_texts_w2v, w2v_model)
    for split_texts_w2v in (X_train, X_test)
)

In [None]:
# --- Universal Sentence Encoder (USE) ---

In [None]:
# Load the Universal Sentence Encoder from TF Hub and embed both splits.
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
use_train_sentences = list(X_train)
use_test_sentences = list(X_test)
X_train_use = use_model(use_train_sentences).numpy()
X_test_use = use_model(use_test_sentences).numpy()

In [None]:
# --- BERT ---

In [None]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertModel.from_pretrained(bert_checkpoint)

In [None]:
# Function extracting BERT embeddings
def get_bert_embeddings(text, tokenizer, model):
    """Return the mean-pooled last hidden state of `model` for `text`.

    Parameters
    ----------
    text : str or list of str
        Text(s) passed straight to `tokenizer`.
    tokenizer : callable
        Called with ``return_tensors='pt', padding=True, truncation=True,
        max_length=512``; must return a mapping of tensors accepted by `model`.
    model : callable
        Called as ``model(**inputs)``; its result must expose
        ``last_hidden_state`` of shape (batch, tokens, hidden).

    Returns
    -------
    numpy.ndarray
        The pooled embedding(s), squeezed: 1-D for a single text, 2-D for a
        batch of several texts.

    Notes
    -----
    When the tokenizer provides an ``attention_mask``, padding positions are
    excluded from the average, so a sentence's embedding does not depend on how
    much padding its batch needed. Without a mask, a plain mean over all
    positions is used.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    attention_mask = inputs['attention_mask'] if 'attention_mask' in inputs else None
    if attention_mask is None:
        pooled = hidden.mean(dim=1)
    else:
        weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for an all-padding row.
        pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
    return pooled.squeeze().numpy()

In [None]:
# Embed every description one at a time, then stack the vectors into one array per split.
bert_train_vectors = []
for sentence in X_train:
    bert_train_vectors.append(get_bert_embeddings(sentence, tokenizer, bert_model))
X_train_bert = np.array(bert_train_vectors)

bert_test_vectors = []
for sentence in X_test:
    bert_test_vectors.append(get_bert_embeddings(sentence, tokenizer, bert_model))
X_test_bert = np.array(bert_test_vectors)

# --- KMeans et calcul d'ARI ---

In [None]:
def kmeans_clustering(X_train, X_test, y_test, method_name, num_clusters=7):
    """Cluster with KMeans, score against the true categories and plot the result.

    KMeans is fitted on `X_train`, clusters are predicted for `X_test`, and the
    Adjusted Rand Index between predicted clusters and `y_test` is returned.
    Three figures are shown and saved as PNG files in the working directory:
    a confusion matrix (after matching clusters to categories) and two t-SNE
    scatter plots of `X_test`, coloured by predicted cluster and by true label.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Feature matrices; `X_train` must expose ``.shape``.
    y_test : array-like of shape (n_test_samples,)
        True category of each test sample.
    method_name : str
        Name of the vectorisation method, used in titles and file names.
    num_clusters : int, default 7
        Number of KMeans clusters. It may differ from the number of distinct
        categories in `y_test`.

    Returns
    -------
    float
        Adjusted Rand Index on the test set.
    """
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    kmeans.fit(X_train)
    y_pred = kmeans.predict(X_test)

    # Encode the true categories as integers for the ARI / contingency table.
    label_encoder = LabelEncoder()
    y_test_encoded = label_encoder.fit_transform(y_test)
    class_names = label_encoder.classes_
    n_classes = len(class_names)

    # Adjusted Rand Index: agreement between clusters and true categories.
    ari = adjusted_rand_score(y_test_encoded, y_pred)

    # Contingency table with one row per true class and one column per cluster.
    # Explicit labels keep the shape fixed even when a cluster is never predicted
    # or when num_clusters != n_classes.
    size = max(n_classes, num_clusters)
    contingency = confusion_matrix(
        y_test_encoded, y_pred, labels=np.arange(size)
    )[:n_classes, :num_clusters]

    # Default: each cluster takes its majority class. This covers clusters left
    # unmatched when there are more clusters than classes.
    cluster_to_label = {
        cluster: class_names[contingency[:, cluster].argmax()]
        for cluster in range(num_clusters)
    }
    # Optimal one-to-one matching (Hungarian algorithm, maximising the overlap)
    # overrides the default for every matched cluster.
    row_ind, col_ind = linear_sum_assignment(-contingency)
    cluster_to_label.update(
        {cluster: class_names[label] for label, cluster in zip(row_ind, col_ind)}
    )
    y_pred_mapped = [cluster_to_label[cluster] for cluster in y_pred]

    new_conf_mat = confusion_matrix(y_test, y_pred_mapped, labels=class_names)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        new_conf_mat,
        annot=True,
        cmap='Blues',
        fmt='d',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Catégorie prédite')
    plt.ylabel('Catégorie réelle')
    plt.title(f'Matrice de confusion ({method_name}) - KMeans ({X_train.shape[1]} features)')
    plt.savefig(f'Texte - Matrice de confusion ({method_name}).png')
    plt.show()

    # Dimensionality reduction of the test set with t-SNE, for visualisation only.
    tsne = TSNE(n_components=2, random_state=0)
    X_test_tsne = tsne.fit_transform(X_test)

    # t-SNE scatter coloured by predicted cluster.
    plt.figure(figsize=(12, 6))
    sns.scatterplot(x=X_test_tsne[:, 0], y=X_test_tsne[:, 1], hue=y_pred, palette='viridis')
    plt.title(f'KMeans ({method_name}) - Visualisation avec Labels Prédits (t-SNE)')
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.legend(title="Clusters prédits")
    plt.savefig(f'Texte - TSNE - K-Means Labels prédits ({method_name}).png')
    plt.show()

    # Same projection coloured by true category.
    plt.figure(figsize=(12, 6))
    sns.scatterplot(x=X_test_tsne[:, 0], y=X_test_tsne[:, 1], hue=y_test, palette='viridis')
    plt.title(f'KMeans ({method_name}) - Visualisation avec Labels Réels (t-SNE)')
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.legend(title="Labels réels")
    plt.savefig(f'Texte - TSNE - K-Means Labels réels ({method_name}).png')
    plt.show()

    return ari

In [None]:
# --- Visualisation avec PCA et t-SNE ---

In [None]:
def visualize_pca_tsne(X_train, y_train, method_name):
    """Show 2-D PCA and t-SNE scatter plots of `X_train`, coloured by `y_train`.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Feature matrix to project.
    y_train : array-like of shape (n_samples,)
        Label of each sample, used as the hue of the scatter plots.
    method_name : str
        Name of the vectorisation method, shown in the figure titles.

    Returns
    -------
    None
        The two figures are displayed with ``plt.show()``.
    """
    pca_points = PCA(n_components=2).fit_transform(X_train)
    tsne_points = TSNE(n_components=2, random_state=0, init='random').fit_transform(X_train)

    # One figure per projection: PCA first, then t-SNE.
    projections = (
        (f'PCA - {method_name}', pca_points),
        (f't-SNE - {method_name}', tsne_points),
    )
    for figure_title, points in projections:
        plt.figure(figsize=(12, 6))
        sns.scatterplot(x=points[:, 0], y=points[:, 1], hue=y_train, palette='viridis')
        plt.title(figure_title)
        plt.show()

In [None]:
# --- Application sur chaque méthode ---

In [None]:
# Method name -> (training features, test features); insertion order is the
# order in which the methods are evaluated.
methods = dict([
    ('Bag of Words', (X_train_bow, X_test_bow)),
    ('TF-IDF', (X_train_tfidf, X_test_tfidf)),
    ('Word2Vec', (X_train_w2v, X_test_w2v)),
    ('BERT', (X_train_bert, X_test_bert)),
    ('USE', (X_train_use, X_test_use)),
])

In [None]:
results = []

for method_name, (X_train_vec, X_test_vec) in methods.items():
    # Compute the ARI and display the plots for each method.
    # method_name is a required argument of kmeans_clustering (it is used in
    # the figure titles and output file names); omitting it raises TypeError.
    ari = kmeans_clustering(X_train_vec, X_test_vec, y_test, method_name)
    results.append({'Technique': method_name, 'ARI': ari})
    visualize_pca_tsne(X_train_vec, y_train, method_name)

In [None]:
# --- Résultats finaux ---

In [None]:
# One row per technique with its ARI score.
results_df = pd.DataFrame.from_records(results)
print(results_df)

In [None]:
# Bar chart comparing the ARI obtained with each technique
ari_barplot_options = dict(data=results_df, x='Technique', y='ARI', palette='Blues_d')
plt.figure(figsize=(10, 6))
sns.barplot(**ari_barplot_options)
plt.title('Comparaison des résultats ARI pour différentes techniques de traitement de texte')
plt.show()