In [101]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import KElbowVisualizer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn_extra.cluster import KMedoids
import os
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from IPython.display import display
from ipywidgets import widgets,interact, IntSlider,Output, VBox,HTML

In [102]:
def readArffFile(file):
    """Load an ARFF file and return its records as a pandas DataFrame.

    Parameters
    ----------
    file : path or file-like object
        Passed unchanged to ``scipy.io.arff.loadarff``.

    Returns
    -------
    pandas.DataFrame
        One column per ARFF attribute. The metadata object returned by
        ``loadarff`` is discarded.
    """
    records, _ = arff.loadarff(file)
    return pd.DataFrame(records)

In [103]:
def normalization(data):
    """Z-score standardise every column of ``data``.

    Each column is transformed to ``(x - mean) / std`` using pandas' default
    sample standard deviation.

    A column with at most one distinct non-missing value has a standard
    deviation of zero (or NaN for a single row), so the plain formula would
    turn the whole column into NaN. Such columns are returned as zeros
    instead, with missing entries left as NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index; ``data`` is not modified.
    """
    normalized_data = pd.DataFrame(index=data.index)
    for col in data.columns:
        column = data[col]
        if column.nunique(dropna=True) <= 1:
            # No spread to scale by: multiplying by 0.0 yields float zeros
            # while keeping NaN where the input was missing.
            normalized_data[col] = column * 0.0
        else:
            normalized_data[col] = (column - column.mean()) / column.std()
    return normalized_data

In [104]:
def preprocessing(df):
    """Summarise, standardise and impute ``df``, reporting into ``preprocess_output``.

    Steps, in order:

    1. Clear ``preprocess_output`` and show the first 10 rows.
    2. Replace the ``'?'`` placeholder with NaN.
    3. For every column display its name, mode and unique values (plus the
       median for ``float64``/``int64`` columns).
    4. Z-score standardise each ``float64``/``int64`` column that has at
       least one value outside the ``1.5 * IQR`` fences.
    5. Fill missing values with the most frequent value of each column.
    6. Draw a boxplot of the numeric columns as they were before step 4.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The processed frame. Columns that contain no observed value at all
        are dropped, because they cannot be imputed.
    """
    with preprocess_output:
        preprocess_output.clear_output()
        display_df(df.head(10))

    df = df.replace('?', np.nan)
    dfBoxPlot = pd.DataFrame()

    for col in df.columns:
        column = df[col]
        is_numeric = column.dtype in ['float64', 'int64']

        # mode() is empty for an all-missing column; report NaN rather than
        # raising on the [0] lookup.
        modes = column.mode()
        mode_value = modes.iloc[0] if not modes.empty else np.nan

        with preprocess_output:
            display(HTML(""))
            display(HTML(f"column name : {col}"))
            if is_numeric:
                display(HTML(f"column median : {column.median()}"))
            display(HTML(f"column mode : {mode_value}"))
            display(HTML(f"column unique values : {column.unique()}"))
            display(HTML(""))

        if not is_numeric:
            continue

        # Keep the raw values for the boxplot before any rescaling.
        dfBoxPlot[col] = column

        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        fence = 1.5 * (q3 - q1)
        # Compare element-wise first, then reduce: the column has outliers
        # when any value lies beyond either fence.
        has_outliers = ((column > q3 + fence) | (column < q1 - fence)).any()
        if has_outliers:
            df[col] = (column - column.mean()) / column.std()

    if df.isna().any().any():
        # SimpleImputer discards columns with no observed value (per the
        # scikit-learn docs), which would leave the result with fewer columns
        # than labels; drop them explicitly so both stay aligned.
        df = df.dropna(axis=1, how='all')
        if df.isna().any().any():
            imputer = SimpleImputer(strategy='most_frequent')
            updated_data = imputer.fit_transform(df)
            # fit_transform returns a bare array; restore labels and let
            # pandas re-infer column dtypes from the object array.
            df = pd.DataFrame(
                updated_data, columns=df.columns, index=df.index
            ).infer_objects()

    with preprocess_output:
        displayboxplot(dfBoxPlot)
    return df

# One-hot encoding

In [105]:
def encoding_data(data):
    """One-hot encode the non-numeric columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with any mix of numeric and non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        The numeric columns unchanged, followed by the indicator columns
        produced by ``pandas.get_dummies`` for the remaining columns.
    """
    numeric_names = data.select_dtypes(include=['number']).columns
    other_names = data.select_dtypes(exclude=['number']).columns

    numeric_part = data[numeric_names]
    indicator_part = pd.get_dummies(data[other_names])

    return pd.concat([numeric_part, indicator_part], axis=1)

In [106]:
def readCsvFile(file):
    """Parse a CSV source with pandas' default settings.

    Parameters
    ----------
    file : path or file-like object
        Passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(file)

In [107]:
def listAllFilesInDirectory(dirPath):
    """Return the names of the entries directly inside ``dirPath``.

    This is a thin wrapper over ``os.listdir``: the result holds bare names
    (not full paths), in arbitrary order, and includes sub-directories as
    well as files.
    """
    return os.listdir(dirPath)

In [108]:
def readFile(fileName, data_dir=r'C:\Users\DELL\Downloads\Data-20240416T120255Z-001\Data'):
    """Load one dataset file from ``data_dir`` into a DataFrame.

    Parameters
    ----------
    fileName : str
        Name of the file inside ``data_dir``.
    data_dir : str, optional
        Directory holding the datasets. Defaults to the location this
        notebook was written against; pass another directory to run the
        notebook on a different machine.

    Returns
    -------
    pandas.DataFrame
        Parsed with ``readCsvFile`` when the name ends in ``.csv`` (any
        letter case), otherwise with ``readArffFile``.
    """
    file_path = os.path.join(data_dir, fileName)
    # Match the real extension: case-insensitive and including the dot, so
    # "DATA.CSV" is read as CSV and a name that merely ends in "csv" is not.
    if fileName.lower().endswith(".csv"):
        return readCsvFile(file_path)
    return readArffFile(file_path)

In [109]:
def update_dataframe(df):
    """Decode UTF-8 byte strings held in object columns, in place.

    Every column whose dtype is ``object`` is rewritten so that ``bytes``
    values become ``str``; any other value is left as it is. Columns of
    other dtypes are not touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fix up; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    def _as_text(value):
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in object_columns:
        df[name] = df[name].apply(_as_text)
    return df

# Clustering

# K-means

In [110]:
def kmeans(data, n_clusters, random_state=None):
    """Cluster ``data`` with K-means and return one label per sample.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    n_clusters : int
        Number of clusters to form.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. K-means initialisation is random, so
        pass an integer to get the same labels on every run. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Cluster index of each sample.
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    return model.fit_predict(data)

# K-medoids

In [111]:
def kmedoids(data, n_clusters):
    """Cluster ``data`` with K-medoids and return one label per sample.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    n_clusters : int
        Number of medoids (clusters) to use.

    Returns
    -------
    numpy.ndarray
        Cluster index of each sample.
    """
    model = KMedoids(n_clusters=n_clusters)
    model.fit(data)
    return model.labels_

# DBSCAN

In [112]:
def dbScan(data, eps=5, min_samples=4):
    """Cluster ``data`` with DBSCAN and return one label per sample.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    eps : float, optional
        Neighbourhood radius forwarded to ``DBSCAN``. The default of 5 is
        the value this notebook originally hard-coded.
    min_samples : int, optional
        Neighbourhood size required for a core point, forwarded to
        ``DBSCAN``. The default of 4 is the original hard-coded value.

    Returns
    -------
    numpy.ndarray
        Cluster index of each sample; DBSCAN marks noise samples with -1.
    """
    model = DBSCAN(eps=eps, min_samples=min_samples)
    return model.fit_predict(data)

# Agnes

In [117]:
def Agnes(data, n_clusters):
    """Agglomerative (AGNES) clustering with Ward linkage.

    Draws the Ward-linkage dendrogram of ``data`` into ``clustoring_output``
    and then cuts the hierarchy into ``n_clusters`` groups.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    n_clusters : int
        Number of clusters to return.

    Returns
    -------
    numpy.ndarray
        Cluster index of each sample.
    """
    with clustoring_output:
        sch.dendrogram(sch.linkage(data, method='ward'))
        plt.title('Dendrogram Agnes')
        plt.xlabel('Samples')
        plt.ylabel('Distance')
        plt.show()

    # The `affinity` keyword was deprecated and later removed from
    # AgglomerativeClustering, so passing it raises TypeError on current
    # scikit-learn. Ward linkage uses Euclidean distance by default, so
    # leaving the argument out keeps the result and works on every version.
    hc = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    return hc.fit_predict(data)

# Diana

In [118]:
def Diana(data, n_clusters):
    """Return ``n_clusters`` labels for the cell titled "Diana".

    NOTE(review): despite its name, this function does not perform divisive
    (top-down) clustering. It builds the same Ward agglomerative hierarchy
    as ``Agnes`` and cuts it at ``n_clusters``. A true DIANA implementation
    would need a dedicated algorithm; confirm whether that is required.

    The Ward-linkage dendrogram is drawn into ``clustoring_output`` first.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    n_clusters : int
        Number of clusters to return; values below 1 are treated as 1.

    Returns
    -------
    numpy.ndarray
        Cluster index of each sample.
    """
    with clustoring_output:
        sch.dendrogram(sch.linkage(data, method='ward'))
        plt.title('Dendrogram Diana')
        plt.xlabel('Samples')
        plt.ylabel('Distance')
        plt.show()

    # Only the final cut is ever returned, so fit once at the requested size
    # instead of refitting for every k from 1 to n_clusters. `affinity` is
    # omitted because current scikit-learn no longer accepts it; Ward
    # linkage is Euclidean by default.
    target_clusters = max(1, n_clusters)
    hc = AgglomerativeClustering(n_clusters=target_clusters, linkage='ward')
    return hc.fit_predict(data)

# Elbow method

In [115]:
def elbow(data, model):
    """Estimate a cluster count with the elbow method and plot the curve.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    model : str or estimator
        ``"KMeans"`` or ``"KMedoids"`` to use a default-constructed
        estimator of that type; any other value is handed to
        ``KElbowVisualizer`` as the estimator itself.

    Returns
    -------
    int or None
        ``visualizer.elbow_value_``. This is ``None`` when the visualizer
        finds no elbow, so callers must handle that case.
    """
    if model == "KMeans":
        estimator = KMeans()
    elif model == "KMedoids":
        estimator = KMedoids()
    else:
        estimator = model

    # Try k = 1..19, but never more clusters than there are samples: the
    # estimators reject k > n_samples, which broke the sweep on small data.
    upper_bound = min(20, len(data) + 1)
    k_range = range(1, upper_bound)

    with clustoring_output:
        visualizer = KElbowVisualizer(estimator, k=k_range)
        visualizer.fit(data)
        visualizer.show()
    return visualizer.elbow_value_

In [120]:
# --- Shared state for the interactive UI -----------------------------------
# Names of the entries in the dataset directory, as a list and as an array.
files = os.listdir(r"C:\Users\DELL\Downloads\Data-20240416T120255Z-001\Data")
filesNp = np.array(files)

# One button per entry, captioned with the text before the first "."; the
# buttons sit side by side in a single row.
buttons = [widgets.Button(description=name.split(".")[0]) for name in files]
outputs = widgets.HBox(list(buttons))

# State written by the button callbacks: the chosen file name, its
# DataFrame, and whether that frame has been pre-processed.
choosedFile = None
df = None
isPrep_proced = False

# One full-width output area per pipeline stage.
df_output = Output(layout={'width': '100%'})
preprocess_output = Output(layout={'width': '100%'})
clustoring_output = Output(layout={'width': '100%'})
def display_df(df):
    """Show ``df`` as an HTML table in the current output area.

    The table markup from ``df.to_html()`` is preceded by a ``<style>``
    rule that sets ``table`` width to 150%.
    """
    table_markup = df.to_html()
    display(HTML(f"""
    <style>
        table {{ width: 150%; }}  </style>
    {table_markup}
    """))
        
def displayboxplot(dfBoxPlot):
    """Draw one boxplot per column of ``dfBoxPlot`` into ``preprocess_output``.

    Nothing is drawn when the frame is empty.
    """
    with preprocess_output:
        if dfBoxPlot.empty:
            return
        dfBoxPlot.boxplot()
        plt.xticks(rotation=45)
        plt.title('Boxplots of Attributes')
        plt.xlabel('Attributes')
        plt.ylabel('Values')
        plt.show()
        
def _silhouette_or_none(data, labels):
    """Return the silhouette score of ``labels``, or None when it is undefined."""
    n_labels = len(set(labels))
    # silhouette_score requires 2 <= n_labels <= n_samples - 1.
    if n_labels < 2 or n_labels > len(data) - 1:
        return None
    return silhouette_score(data, labels)


def Clustoring(x):
    """Button callback: run every clustering algorithm on the global ``df``.

    If the data has not been pre-processed, an error message is shown in
    ``clustoring_output`` and nothing else happens. Otherwise the output
    area is cleared, the elbow method picks a cluster count for K-means and
    for K-medoids, all five algorithms are run, and a summary line per
    algorithm (clusters found and silhouette score) is displayed.

    Parameters
    ----------
    x : ipywidgets.Button
        The clicked button; unused.
    """
    if not isPrep_proced:
        error_message = "you cant make clustors without pre process data."
        with clustoring_output:
            clustoring_output.clear_output()
            display(HTML(error_message))
        return

    with clustoring_output:
        clustoring_output.clear_output()

    # elbow() returns None when no elbow is detected; fall back to the
    # smallest meaningful cluster count instead of passing None onward.
    fallback_clusters = 2
    nClusterKmean = elbow(df, "KMeans")
    if nClusterKmean is None:
        nClusterKmean = fallback_clusters
    nClusterKMedoids = elbow(df, "KMedoids")
    if nClusterKMedoids is None:
        nClusterKMedoids = fallback_clusters

    results = [
        ("KMeans", kmeans(df, nClusterKmean)),
        ("KMedoids", kmedoids(df, nClusterKMedoids)),
        ("Agnes", Agnes(df, nClusterKmean)),
        ("Diana", Diana(df, nClusterKMedoids)),
        ("DBSCAN", dbScan(df)),
    ]

    # Report the outcome; previously the labels and scores were computed and
    # then discarded, so the user never saw a result.
    with clustoring_output:
        for algorithm, labels in results:
            # DBSCAN labels noise as -1, which is not a cluster.
            clusters_found = len(set(labels) - {-1})
            score = _silhouette_or_none(df, labels)
            score_text = "undefined" if score is None else f"{score:.3f}"
            display(HTML(
                f"{algorithm} : {clusters_found} clusters, "
                f"silhouette score : {score_text}"
            ))
def choseFile(x):
    """Button callback: load the dataset whose button was clicked.

    The file is matched by comparing the button caption with the part of
    each name in ``files`` before its first ".". On success the globals
    ``choosedFile`` and ``df`` are updated, ``isPrep_proced`` is reset, and
    the loaded frame is shown in ``df_output``.

    Parameters
    ----------
    x : ipywidgets.Button
        The clicked dataset button; its ``description`` selects the file.
    """
    global choosedFile
    global df
    global isPrep_proced

    selected = next(file for file in files if file.split(".")[0] == x.description)

    # Load into locals first so a failed read leaves the previous selection
    # and DataFrame intact rather than half-updated.
    loaded = readFile(selected)
    # Check the real extension; taking the second "."-separated piece
    # mis-detects names containing more than one dot.
    if selected.lower().endswith(".arff"):
        loaded = update_dataframe(loaded)

    choosedFile = selected
    df = loaded
    # Fresh raw data has not been pre-processed yet; without this reset,
    # clustering could run on un-encoded data after switching datasets.
    isPrep_proced = False

    with df_output:
        df_output.clear_output()
        display_df(df)
def preProcessing(x):
    """Button callback: pre-process the currently loaded dataset.

    When no file has been chosen, an error message is shown in
    ``preprocess_output``. Otherwise the ``Class``/``class`` column is
    dropped if present, non-numeric columns are one-hot encoded, the frame
    goes through ``preprocessing``, and the global ``df`` and
    ``isPrep_proced`` are updated.

    Parameters
    ----------
    x : ipywidgets.Button
        The clicked button; unused.
    """
    global df
    global isPrep_proced

    if choosedFile is None:
        with preprocess_output:
            preprocess_output.clear_output()
            display(HTML("You have to choose a file first to be able to pre-process."))
        return

    # The label column is removed before clustering.
    for label in ("Class", "class"):
        if label in df.columns:
            df.drop(label, axis=1, inplace=True)

    if len(df.select_dtypes(exclude=['number']).columns) > 0:
        df = encoding_data(df)

    df = preprocessing(df)
    isPrep_proced = True

# Top-level action buttons.
buttonShowDatasets = widgets.Button(description='Show Data')
buttonpreProcessing = widgets.Button(description='preProcessing')
buttonClustoring = widgets.Button(description='Clustoring')

# Every dataset button shares one handler; the handler reads the clicked
# button's caption to decide which file to load.
for button in buttons:
    button.on_click(choseFile)
def showDataSets(b):
    """Button callback: toggle the dataset button row.

    Shows the row and captions the button 'Hide Data' when the row is
    hidden; otherwise hides it and captions the button 'Show Data'.

    Parameters
    ----------
    b : ipywidgets.Button
        The clicked toggle button, whose caption is updated.
    """
    was_hidden = outputs.layout.display == 'none'
    outputs.layout.display = 'flex' if was_hidden else 'none'
    b.description = 'Hide Data' if was_hidden else 'Show Data'

# Connect each action button to its callback.
buttonShowDatasets.on_click(showDataSets)
buttonpreProcessing.on_click(preProcessing)
buttonClustoring.on_click(Clustoring)

# The dataset row starts hidden; 'Show Data' reveals it.
outputs.layout.display = 'none'

# Wrap each stage's output area in its own container.
df_vbox = VBox(children=[df_output])
preproccess_vbox = VBox(children=[preprocess_output])
clustorings_vbox = VBox(children=[clustoring_output])

# Render the UI top to bottom: toggle, dataset row, data preview,
# pre-processing controls and report, clustering controls and report.
display(buttonShowDatasets)
display(outputs)
display(df_vbox)
display(buttonpreProcessing)
display(preproccess_vbox)
display(buttonClustoring)
display(clustorings_vbox)


Button(description='Show Data', style=ButtonStyle())

HBox(children=(Button(description='breast', style=ButtonStyle()), Button(description='contact-lenses', style=B…

VBox(children=(Output(layout=Layout(width='100%')),))

Button(description='preProcessing', style=ButtonStyle())

VBox(children=(Output(layout=Layout(width='100%')),))

Button(description='Clustoring', style=ButtonStyle())

VBox(children=(Output(layout=Layout(width='100%')),))