In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import KElbowVisualizer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn_extra.cluster import KMedoids
import os
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from IPython.display import display
from ipywidgets import widgets,interact, IntSlider,Output, VBox,HTML
from kneed import KneeLocator

# manipulating files
Ces fonctions sont utilisées pour manipuler différents fichiers (arff/csv) et pour lister les fichiers d'un répertoire.

In [2]:
def readArffFile(file):
    """Load an ARFF file into a pandas DataFrame.

    Parameters
    ----------
    file : str or file-like
        Forwarded unchanged to ``scipy.io.arff.loadarff``.

    Returns
    -------
    pandas.DataFrame
        The data records; the ARFF metadata returned by the loader is discarded.
    """
    records, _ = arff.loadarff(file)
    return pd.DataFrame(records)

In [3]:
def readCsvFile(file):
    """Read a CSV source (path or file-like object) into a DataFrame with pandas' default options."""
    return pd.read_csv(file)

In [4]:
def readFile(fileName, data_dir=r'C:\Users\DELL\Downloads\Data-20240416T120255Z-001\Data'):
    """Load one dataset file from the data directory into a DataFrame.

    Parameters
    ----------
    fileName : str
        Name of the file inside ``data_dir``.
    data_dir : str, optional
        Directory holding the dataset files. Defaults to the location that was
        previously hard-coded, so existing calls behave as before; pass another
        directory to run the notebook on a different machine.

    Returns
    -------
    pandas.DataFrame
        Result of ``readCsvFile`` for names ending in ``.csv`` (any letter case),
        otherwise the result of ``readArffFile``.
    """
    file_path = os.path.join(data_dir, fileName)
    # Anything that is not a CSV is handed to the ARFF reader, as before.
    if fileName.lower().endswith(".csv"):
        return readCsvFile(file_path)
    return readArffFile(file_path)

In [5]:
def listAllFilesInDirectory(dirPath):
    """Return the entry names found in ``dirPath``, exactly as ``os.listdir`` reports them."""
    return os.listdir(dirPath)

Cette fonction va transformer les données de type binaire en UTF-8 (données normales)

In [6]:
def update_dataframe(df):
    """Decode ``bytes`` cells of object-dtype columns to UTF-8 text.

    The frame is modified in place (column by column) and also returned.
    Values that are not ``bytes`` are left untouched.
    """
    def _as_text(value):
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in object_columns:
        df[name] = df[name].apply(_as_text)
    return df

# preprocessing 
Le preprocessing est une étape essentielle. Dans cette étape, nous nettoyons les données : nous affichons une description de nos données, nous remplaçons les valeurs manquantes et nous normalisons les données si nécessaire. Tout cela est crucial avant toute opération de regroupement ou d'analyse de données.

### Paramètres :
- `df` : notre data frame
### Retours :
- `df` : notre dataframe apres preprocessing

In [7]:
def _describe_column(col, series, numeric):
    """Display name, (median for numeric columns,) mode and unique values of one column."""
    modes = series.mode()
    # mode() is empty for an all-NaN column; indexing its first element would raise.
    mode_value = modes.iloc[0] if not modes.empty else None
    display(HTML(""))
    display(HTML(f"column name : {col}"))
    if numeric:
        display(HTML(f"column median : {series.median()}"))
    display(HTML(f"column mode : {mode_value}"))
    display(HTML(f"column unique values : {series.unique()}"))
    display(HTML(""))


def preprocessing(df):
    """Describe, standardise and impute a DataFrame before clustering.

    Steps, in order:
      1. Clear ``preprocess_output`` and show the first 10 rows.
      2. Replace the literal ``'?'`` by NaN.
      3. For every column, show its summary; float64/int64 columns that contain
         values outside the 1.5 * IQR fences are z-score standardised.
      4. If any NaN remains, fill it with the most frequent value of its column.
      5. Show a boxplot of the numeric columns (values as they were before
         standardisation).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, with the column dtypes it had before imputation.

    Notes
    -----
    Writes into the module-level ``preprocess_output`` widget and relies on the
    notebook helpers ``display_df`` and ``displayboxplot``.
    """
    with preprocess_output:
        preprocess_output.clear_output()
        display_df(df.head(10))

    # replace() returns a new frame, so the column assignments below never
    # touch the object the caller passed in.
    df = df.replace('?', np.nan)
    dfBoxPlot = pd.DataFrame()

    for col in df.columns:
        numeric = df[col].dtype in ['float64', 'int64']
        with preprocess_output:
            _describe_column(col, df[col], numeric)
        if not numeric:
            continue

        dfBoxPlot[col] = df[col]
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        fence = 1.5 * (q3 - q1)
        # Element-wise comparison first, then any(): the previous
        # `df[col].any() > ...` compared a single boolean with the fence.
        has_outliers = ((df[col] > q3 + fence) | (df[col] < q1 - fence)).any()
        if has_outliers:
            df[col] = (df[col] - df[col].mean()) / df[col].std()

    if df.isna().any().any():
        dtypes_before = df.dtypes.to_dict()
        imputer = SimpleImputer(strategy='most_frequent')
        imputed = imputer.fit_transform(df)
        # The imputer returns a plain array; without restoring the dtypes a
        # mixed frame comes back all-object and its numeric columns would be
        # treated as categorical by later steps.
        df = pd.DataFrame(imputed, columns=df.columns, index=df.index).astype(dtypes_before)

    with preprocess_output:
        displayboxplot(dfBoxPlot)
    return df

# one hot encoding
Cette fonction nous aidera à travailler avec des données catégorielles en utilisant sklearn, car les fonctions de sklearn ne prennent pas en charge les données catégorielles.

### Paramètres :
- `data` : notre dataframe

### Retours :
- `data` : notre dataframe apres codage

In [8]:
def encoding_data(data):
    """One-hot encode the non-numeric columns of ``data``.

    Returns a new frame made of the numeric columns (unchanged, original order)
    followed by the ``pd.get_dummies`` encoding of all remaining columns.
    """
    numeric_names = data.select_dtypes(include=['number']).columns
    other_names = data.select_dtypes(exclude=['number']).columns
    return pd.concat(
        [data[numeric_names], pd.get_dummies(data[other_names])],
        axis=1,
    )

# clustering

# k-mean
Cette fonction est utilisee pour effectuer un regroupement k-mean

### Paramètres :
- `data` : Les données d'entrée pour le regroupement.
- `n_clusters` : Le nombre de clusters selon l'elbow.

### Retours :
- `k-mean` : Le modele k-mean.

In [9]:
def kmeans(data, n_clusters, random_state=None):
    """Fit a k-means model on ``data``.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed directly to ``KMeans.fit``.
    n_clusters : int
        Number of clusters to form.
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default (``None``) keeps the previous
        unseeded behaviour; pass an integer to make runs reproducible.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator (labels are available as ``labels_``).
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    model.fit(data)
    return model

# kmedoids
Cette fonction est utilisée pour effectuer un clustering k-medoids.

### Paramètres :
- `data` : Les données d'entrée pour le regroupement.
- `n_clusters` : Le nombre de clusters selon l'elbow.

### Retours :
- `kmedoids` : Le modele kmedoids.


In [10]:
def kmedoids(data, n_clusters, random_state=None):
    """Fit a k-medoids model on ``data``.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed directly to ``KMedoids.fit``.
    n_clusters : int
        Number of clusters to form.
    random_state : int or None, optional
        Forwarded to ``KMedoids``. The default (``None``) keeps the previous
        behaviour; pass an integer to make runs reproducible.

    Returns
    -------
    sklearn_extra.cluster.KMedoids
        The fitted estimator (labels are available as ``labels_``).
    """
    model = KMedoids(n_clusters=n_clusters, random_state=random_state)
    # fit() is enough: the labels returned by fit_predict() were discarded.
    model.fit(data)
    return model

# DbScan
Cette fonction est utilisee pour effectuer un regroupement DBSCAN 
### Paramètres :
- `data` : Les donnees d'entree pour le regroupement.
- `eps_range` : Un tuple specifiant la plage de valeurs epsilon a considerer pour DBSCAN. Par défaut, c'est (0.05, 40.1).
- `min_pts_range` : Un tuple specifiant la plage de points minimums pour DBSCAN. Par defaut, c'est (4, 20).

### Retours :
- `best_model` : Le modele DBSCAN avec le score de silhouette le plus eleve.



In [11]:
def dbScan(data, eps_range=(0.05, 40.1), min_pts_range=(4, 20), eps_steps=10, min_pts_step=4):
    """Grid-search DBSCAN and return the fit with the best silhouette score.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    eps_range : sequence of float
        A 2-item sequence is read as ``(low, high)`` bounds and ``eps_steps``
        evenly spaced values between them (both bounds included) are tried.
        A sequence of any other length is used as an explicit candidate list.
    min_pts_range : sequence of int
        A 2-item sequence is read as ``(low, high)`` bounds; every
        ``min_pts_step``-th value from ``low`` is tried, plus ``high`` itself.
        A sequence of any other length is used as an explicit candidate list.
    eps_steps : int, optional
        Number of epsilon values generated between the bounds (at least 2).
    min_pts_step : int, optional
        Stride between generated ``min_samples`` values (at least 1).

    Returns
    -------
    sklearn.cluster.DBSCAN or None
        The fitted model with the highest silhouette score, or ``None`` when no
        candidate produced a labelling that can be scored.

    Notes
    -----
    Previously only the two bounds of each range were tried (4 fits in total).
    Those 4 combinations are still part of the grid, so the selected score can
    only be equal or better. Noise points (label ``-1``) take part in the
    silhouette score as if they formed a cluster, as before.
    """
    eps_values = list(eps_range)
    if len(eps_values) == 2:
        eps_values = np.linspace(eps_values[0], eps_values[1], max(int(eps_steps), 2))

    min_pts_values = list(min_pts_range)
    if len(min_pts_values) == 2:
        low, high = min_pts_values
        stride = max(int(min_pts_step), 1)
        min_pts_values = sorted(set(range(low, high, stride)) | {low, high})

    n_samples = len(data)
    best_model = None
    best_score = -1

    for eps in eps_values:
        for min_pts in min_pts_values:
            model = DBSCAN(eps=float(eps), min_samples=min_pts)
            model.fit(data)
            labels = model.labels_

            # A single label (all noise, or one cluster covering everything)
            # cannot be scored: silhouette_score needs 2..n_samples-1 labels.
            n_labels = len(np.unique(labels))
            if n_labels < 2 or n_labels >= n_samples:
                continue

            score = silhouette_score(data, labels)
            if score > best_score:
                best_model = model
                best_score = score

    return best_model


# Agnes
Cette fonction est utilisee pour effectuer un regroupement Agnes 
### Paramètres :
- `data` : Les donnees d'entree pour le regroupement.
- `n_clusters` : nombre de clusters

### Retours :
- `Agnes model` : Le modele Agnes

In [12]:
def Agnes(data, n_clusters):
    """Draw a ward dendrogram of ``data`` and fit agglomerative clustering.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    n_clusters : int
        Number of clusters to form.

    Returns
    -------
    sklearn.cluster.AgglomerativeClustering
        The fitted estimator (labels are available as ``labels_``).

    Notes
    -----
    The dendrogram is rendered into the module-level ``clustoring_output``.
    """
    with clustoring_output:
        sch.dendrogram(sch.linkage(data, method='ward'))
        plt.title('Dendrogram Agnes')
        plt.xlabel('Samples')
        plt.ylabel('Distance')
        plt.show()

    # `affinity=` was deprecated in scikit-learn 1.2 and removed in 1.4.
    # Ward linkage only works with (and defaults to) the euclidean metric, so
    # leaving the argument out gives the same model on old and new versions.
    hc = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    hc.fit(data)
    return hc

# Diana
Cette fonction est utilisee pour effectuer un regroupement Diana 
### Paramètres :
- `data` : Les donnees d'entree pour le regroupement.
- `n_clusters` : nombre de clusters

### Retours :
- `Diana model` : Le modele Diana

In [13]:
def Diana(data, n_clusters):
    """Draw a ward dendrogram of ``data`` and fit a hierarchical clustering.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    n_clusters : int
        Requested number of clusters; values below 1 yield a single cluster.

    Returns
    -------
    sklearn.cluster.AgglomerativeClustering
        The fitted estimator (labels are available as ``labels_``).

    Notes
    -----
    NOTE(review): despite the name, this uses ``AgglomerativeClustering`` with
    ward linkage, i.e. the same estimator configuration as ``Agnes``; no
    divisive algorithm is involved.
    The dendrogram is rendered into the module-level ``clustoring_output``.
    """
    with clustoring_output:
        sch.dendrogram(sch.linkage(data, method='ward'))
        plt.title('Dendrogram Diana')
        plt.xlabel('Samples')
        plt.ylabel('Distance')
        plt.show()

    # The former loop refitted the model for 1, 2, ..., n_clusters clusters and
    # kept only the last fit, so a single fit gives the same result.
    # `affinity=` is omitted: removed in scikit-learn 1.4, and ward implies euclidean.
    hc = AgglomerativeClustering(n_clusters=max(1, n_clusters), linkage='ward')
    hc.fit(data)
    return hc

# elbow
### Paramètres :
- `data` : Les données d'entrée pour le regroupement.
- `model` : `"KMeans"` ou `"KMedoids"`

### Retours :
- `elbow_value` : le meilleur nombre de clusters (valeur du coude)

In [14]:
def elbow(data, model):
    """Estimate the number of clusters with yellowbrick's elbow method.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    model : str or estimator
        ``"KMeans"`` or ``"KMedoids"`` to build a default estimator of that
        kind; any non-string value is passed to ``KElbowVisualizer`` unchanged.

    Returns
    -------
    int or None
        ``visualizer.elbow_value_``.
        NOTE(review): yellowbrick is expected to leave this at ``None`` when no
        elbow is detected - callers should handle that case.

    Raises
    ------
    ValueError
        If ``model`` is a string other than ``"KMeans"`` / ``"KMedoids"``.

    Notes
    -----
    The elbow plot is rendered into the module-level ``clustoring_output``.
    """
    estimators = {"KMeans": KMeans, "KMedoids": KMedoids}
    if isinstance(model, str):
        if model not in estimators:
            raise ValueError(
                f"unknown model {model!r}; expected one of {sorted(estimators)}"
            )
        estimator = estimators[model]()
    else:
        estimator = model

    # k = 1..19 as before, but never more clusters than there are samples.
    k_range = range(1, min(20, len(data) + 1))

    with clustoring_output:
        visualizer = KElbowVisualizer(estimator, k=k_range)
        visualizer.fit(data)
        visualizer.show()
    return visualizer.elbow_value_

In [16]:
# Dataset files and one selection button per file (label = text before the first dot).
files = os.listdir(r"C:\Users\DELL\Downloads\Data-20240416T120255Z-001\Data")
filesNp = np.array(files)
buttons = [widgets.Button(description=name.split(".")[0]) for name in files]
outputs = widgets.HBox(list(buttons))

# Mutable state shared by the button callbacks through `global`.
choosedFile = None
isPrep_proced = False
df = None

# Three independent full-width output areas: loaded data, preprocessing, clustering.
df_output, preprocess_output, clustoring_output = (
    Output(layout={'width': '100%'}) for _ in range(3)
)

def display_df(df):
    """Display ``df`` as an HTML table preceded by a style rule that widens tables to 150%."""
    style_block = "\n    <style>\n        table { width: 150%; }  </style>\n    "
    display(HTML(style_block + df.to_html() + "\n    "))
        
def displayboxplot(dfBoxPlot):
        """Show a boxplot of every column of ``dfBoxPlot`` in ``preprocess_output``.

        Nothing is drawn when the frame is empty.
        """
        with preprocess_output:
            if dfBoxPlot.empty:
                return
            axes = dfBoxPlot.boxplot()
            plt.xticks(rotation=45)
            axes.set_title('Boxplots of Attributes')
            axes.set_xlabel('Attributes')
            axes.set_ylabel('Values')
            plt.show()
        
def _silhouette_or_none(data, labels):
    """Return the silhouette score of ``labels``, or None when it cannot be computed.

    silhouette_score requires between 2 and n_samples - 1 distinct labels.
    """
    n_labels = len(np.unique(labels))
    if n_labels < 2 or n_labels >= len(labels):
        return None
    return silhouette_score(data, labels)


def _show_clusters(title, data, labels):
    """Display a heading followed by a copy of ``data`` with an added 'cluster' column."""
    labelled = data.copy()
    labelled['cluster'] = labels
    heading = f"""<h1 style="font-size: 4em; margin: 1em 0; text-align: center;">{title}</h1>"""
    display(HTML(heading))
    display_df(labelled)


def Clustoring(x):
    """Button callback: run every clustering method on the preprocessed data.

    Reads the module-level ``df`` and ``isPrep_proced`` and writes everything
    into ``clustoring_output``:
      * elbow plots for KMeans and KMedoids (they give the cluster counts),
      * one labelled table per method (KMeans, KMedoids, Agnes, Diana and,
        when a usable model was found, DBSCAN),
      * the silhouette score of each method and a bar chart comparing them.

    Parameters
    ----------
    x : widget
        The clicked button; unused.

    Returns
    -------
    None
    """
    if not isPrep_proced:
        with clustoring_output:
            clustoring_output.clear_output()
            display(HTML("you cant make clustors without pre process data."))
        return

    with clustoring_output:
        clustoring_output.clear_output()

    nClusterKmean = elbow(df, "KMeans")
    nClusterKMedoids = elbow(df, "KMedoids")
    if nClusterKmean is None or nClusterKMedoids is None:
        # Without an elbow value there is no cluster count to fit with.
        with clustoring_output:
            display(HTML("no elbow point was detected, so the number of clusters is unknown."))
        return

    # (heading, key in the score chart, fitting function, number of clusters)
    runs = (
        ("KMeans Clustering", "KMeans", kmeans, nClusterKmean),
        ("kmedoids Clustering", "KMedoids", kmedoids, nClusterKMedoids),
        ("Agnes Clustering", "Agnes", Agnes, nClusterKmean),
        ("Diana Clustering", "Diana", Diana, nClusterKMedoids),
    )
    labels_by_method = {}
    for title, key, fit, n_clusters in runs:
        model = fit(df, n_clusters)
        # labels_ of the fitted model is reused; refitting or predicting on the
        # training data again would only repeat the same work.
        labels_by_method[key] = model.labels_
        with clustoring_output:
            _show_clusters(title, df, model.labels_)

    dbScan_model = dbScan(df)
    if dbScan_model is not None:
        labels_by_method["DBSCAN"] = dbScan_model.labels_
        with clustoring_output:
            # The labels go into a copy of the data frame; the estimator
            # itself does not support item assignment.
            _show_clusters("db scan Clustering", df, dbScan_model.labels_)

    scores = {
        key: _silhouette_or_none(df, labels)
        for key, labels in labels_by_method.items()
    }

    report_names = (
        ("KMedoids", "kmedoids"),
        ("KMeans", "kmeans"),
        ("Agnes", "Agnes"),
        ("Diana", "Diana"),
        ("DBSCAN", "dbScan"),
    )
    with clustoring_output:
        for key, label in report_names:
            if scores.get(key) is not None:
                display(HTML(f"{label}_silhouette_score:{scores[key]}"))

    chart_order = ("KMeans", "DBSCAN", "KMedoids", "Agnes", "Diana")
    method_scores = {
        key: scores[key] for key in chart_order if scores.get(key) is not None
    }
    with clustoring_output:
        plt.figure(figsize=(10, 6))
        plt.bar(list(method_scores.keys()), list(method_scores.values()))
        plt.xlabel("Clustering Method")
        plt.ylabel("Silhouette Score")
        plt.title("Silhouette Score Comparison Across Clustering Methods")
        plt.grid(True)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        # Render the figure while the output widget is still capturing.
        plt.show()

def choseFile(x):
    """Button callback: load the dataset whose button was clicked and display it.

    Updates the module-level ``choosedFile`` and ``df``, and resets
    ``isPrep_proced`` so the newly loaded data has to be preprocessed before
    it can be clustered.

    Parameters
    ----------
    x : widget
        The clicked button; its ``description`` is the file name up to the
        first dot.

    Returns
    -------
    None
    """
    global choosedFile, df, isPrep_proced
    index = next(
        i for i, file in enumerate(files) if file.split(".")[0] == x.description
    )
    choosedFile = filesNp[index]
    df = readFile(choosedFile)
    # Check the real extension: split(".")[1] is wrong for names with several dots.
    if str(choosedFile).lower().endswith(".arff"):
        df = update_dataframe(df)
    # A freshly loaded frame is raw again, whatever was preprocessed before.
    isPrep_proced = False
    with df_output:
        df_output.clear_output()
        display_df(df)
def preProcessing(x):
        """Button callback: preprocess the currently loaded dataset.

        Drops a 'Class' / 'class' column when present, runs ``preprocessing``,
        one-hot encodes the result if the frame had non-numeric columns before
        preprocessing, and marks the data as ready (``isPrep_proced``).
        When no file has been chosen yet, a message is shown instead.
        """
        global choosedFile, isPrep_proced, df, preprocess_output

        if choosedFile is None:
            with preprocess_output:
                preprocess_output.clear_output()
                display(HTML("You have to choose a file first to be able to pre-process."))
            return

        label_columns = [name for name in ("Class", "class") if name in df.columns]
        if label_columns:
            df.drop(columns=label_columns, inplace=True)

        # Decided before preprocessing() runs, on the frame as loaded.
        had_categorical = len(df.select_dtypes(exclude=['number']).columns) > 0
        df = preprocessing(df)
        if had_categorical:
            df = encoding_data(df)
        isPrep_proced = True

# Every dataset button loads its file through choseFile when clicked.
for button in buttons:
    button.on_click(choseFile)
    
# Top-level controls: dataset-list toggle, preprocessing and clustering.
buttonShowDatasets = widgets.Button(description='Show Data')
buttonpreProcessing = widgets.Button(description='preProcessing')
buttonClustoring = widgets.Button(description='Clustoring')
def showDataSets(b):
    """Toggle the dataset button row (``outputs``) and relabel the clicked button ``b``."""
    currently_hidden = outputs.layout.display == 'none'
    outputs.layout.display = 'flex' if currently_hidden else 'none'
    b.description = 'Hide Data' if currently_hidden else 'Show Data'

# Connect the three top-level buttons to their callbacks.
buttonShowDatasets.on_click(showDataSets)
buttonpreProcessing.on_click(preProcessing)
buttonClustoring.on_click(Clustoring)
display(buttonShowDatasets)

# The dataset button row starts hidden; showDataSets toggles it.
outputs.layout.display = 'none'
# Each output area is wrapped in its own VBox.
df_vbox = VBox(children=[df_output])
preproccess_vbox = VBox(children=[preprocess_output])
clustorings_vbox = VBox(children=[clustoring_output])
# Display order defines the page layout: dataset buttons, loaded data,
# preprocessing button + report, clustering button + results.
display(outputs)
display(df_vbox)
display(buttonpreProcessing)
display(preproccess_vbox)
display(buttonClustoring)
display(clustorings_vbox)


Button(description='Show Data', style=ButtonStyle())

HBox(children=(Button(description='breast', style=ButtonStyle()), Button(description='contact-lenses', style=B…

VBox(children=(Output(layout=Layout(width='100%')),))

Button(description='preProcessing', style=ButtonStyle())

VBox(children=(Output(layout=Layout(width='100%')),))

Button(description='Clustoring', style=ButtonStyle())

VBox(children=(Output(layout=Layout(width='100%')),))