In [None]:
import tkinter as tk
from tkinter import filedialog, messagebox
import pandas as pd
import numpy as np

def initialize_centroids(data, k):
    """Draw ``k`` rows of ``data`` at random to use as starting centroids.

    Row indices are sampled without replacement through ``np.random.choice``,
    so the outcome depends on NumPy's global random state. Indices are
    distinct, but the selected rows may still hold equal values when
    ``data`` contains duplicates. The chosen centroids are printed to stdout.

    Args:
        data: NumPy array whose first axis enumerates the data points.
        k: Number of centroids to draw.

    Returns:
        The selected rows, i.e. ``data`` indexed by the sampled row indices.
    """
    n_rows = data.shape[0]
    chosen = np.random.choice(n_rows, size=k, replace=False)
    seeds = data[chosen]
    print(seeds)
    return seeds

def euclidean_distance(x1, x2):
    """Return the Euclidean (straight-line) distance between ``x1`` and ``x2``.

    Args:
        x1: First point; must support ``x1 - x2`` (e.g. a NumPy array).
        x2: Second point, compatible with ``x1``.

    Returns:
        The square root of the summed squared coordinate differences.
    """
    delta = x1 - x2
    squared = delta ** 2
    total = np.sum(squared)
    return np.sqrt(total)

def assign_clusters(data, centroids):
    """Group the row indices of ``data`` by their nearest centroid.

    Distances are measured with ``euclidean_distance``; ``np.argmin`` picks
    the closest centroid, so ties go to the centroid that appears first.

    Args:
        data: Iterable of data points.
        centroids: Sequence of centroid points comparable with the data points.

    Returns:
        A list with one entry per centroid; entry ``i`` is the list of
        positions in ``data`` whose nearest centroid is ``centroids[i]``.
    """
    n_clusters = len(centroids)
    members = [[] for _ in range(n_clusters)]
    for position, point in enumerate(data):
        gaps = [euclidean_distance(point, center) for center in centroids]
        nearest = np.argmin(gaps)
        members[nearest].append(position)
    return members

def update_centroids(data, clusters):
    """Compute each cluster's centroid as the mean of its member rows.

    Args:
        data: NumPy array of data points, indexable by a list of row indices.
        clusters: Sequence of index lists, one per cluster.

    Returns:
        A NumPy array holding one mean row per cluster, in cluster order.

    Note:
        A cluster with no members selects no rows, so NumPy reports its mean
        as NaN; callers that can produce empty clusters must handle that.
    """
    means = []
    for member_indices in clusters:
        member_rows = data[member_indices]
        means.append(np.mean(member_rows, axis=0))
    return np.array(means)

def kmeans(data, k, max_iters=100):
    """Cluster the rows of ``data`` into ``k`` groups with iterative k-means.

    Starting from randomly chosen data points, the function alternates
    between recomputing centroids from the current assignment and reassigning
    every point to its nearest centroid. Centroids are rounded to two decimal
    places after every update, and iteration stops as soon as the rounded
    centroids no longer change or ``max_iters`` updates have been performed.
    The unrounded centroids of every update are printed to stdout.

    Args:
        data: NumPy array of data points, one per row.
        k: Number of clusters; must be at least 1.
        max_iters: Upper bound on the number of centroid updates.

    Returns:
        A ``(centroids, clusters)`` tuple: the final centroids and, for each
        centroid, the list of row indices of ``data`` assigned to it. The
        assignment always corresponds to the returned centroids.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    centroids = initialize_centroids(data, k)
    # Assign once up front so `clusters` exists even when the loop body never
    # runs (max_iters <= 0).
    clusters = assign_clusters(data, centroids)
    for _ in range(max_iters):
        prev_centroids = centroids
        centroids = update_centroids(data, clusters)

        # A cluster without members has no mean (NaN). Left in place, the NaN
        # would win every later argmin and collapse all clusters, and the
        # equality test below could never succeed. Keep the previous centroid
        # for such clusters instead.
        empty = [i for i, members in enumerate(clusters) if len(members) == 0]
        if empty:
            centroids[empty] = prev_centroids[empty]

        print(centroids)
        centroids = np.round(centroids, decimals=2)  # Round centroids to 2 decimal places
        if np.array_equal(prev_centroids, centroids):
            # Centroids are unchanged, so `clusters` already matches them.
            break
        # Reassign against the new centroids so the returned clusters are
        # never one update behind, even when max_iters is exhausted.
        clusters = assign_clusters(data, centroids)
    return centroids, clusters

def browse_file():
    """Let the user pick a file and show its path in the file entry.

    Opens the standard "open file" dialog. When a file is chosen, the contents
    of ``entry_file`` are replaced with the selected path. When the dialog is
    cancelled, the entry is left untouched.
    """
    filename = filedialog.askopenfilename()
    # The dialog yields an empty value on cancel; do not wipe a path the user
    # selected earlier in that case.
    if not filename:
        return
    entry_file.delete(0, tk.END)
    entry_file.insert(0, filename)

def check_outliers(data, percentage, column='IMDB Rating', iqr_factor=1.5):
    """Sample part of ``data`` and split it into IQR outliers and inliers.

    A reproducible random sample (fixed ``random_state=1``) is drawn, sorted
    by ``column``, and every row whose value lies outside
    ``[Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR]`` is reported as an
    outlier.

    Args:
        data: pandas DataFrame containing ``column``.
        percentage: Share of rows to sample. Values up to 1 are used as a
            fraction (0.5 means half the rows); values above 1 are read as a
            percentage (50 means half the rows).
        column: Name of the numeric column to test for outliers.
        iqr_factor: Multiplier applied to the inter-quartile range to obtain
            the outlier fences.

    Returns:
        An ``(outliers, outliers_removed)`` tuple of DataFrames, both sorted
        by ``column``: the rows outside the fences and the rows inside them.
    """
    # The GUI asks for a "percentage", so accept 50 as well as 0.5; values
    # above 1 used to be rejected by DataFrame.sample.
    fraction = percentage / 100 if percentage > 1 else percentage

    sample = data.sample(frac=fraction, random_state=1)
    ordered = sample.sort_values(column)
    values = ordered[column]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_factor * iqr
    upper_bound = q3 + iqr_factor * iqr

    outliers = ordered[(values < lower_bound) | (values > upper_bound)]
    outliers_removed = ordered[(values >= lower_bound) & (values <= upper_bound)]
    return outliers, outliers_removed

def run_kmeans_with_outliers():
    """Run the outlier filter and k-means from the GUI inputs and show the result.

    Reads the CSV path, sample share and cluster count from the entry
    widgets, removes rating outliers with ``check_outliers``, clusters the
    remaining 'IMDB Rating' values with ``kmeans`` and writes centroids,
    cluster sizes, outliers and cluster contents into ``text_output``.
    Any exception raised along the way is reported in an error dialog.
    """
    def write(text):
        # Append text at the end of the output area.
        text_output.insert(tk.END, text)

    try:
        # Load the CSV named in the file entry
        frame = pd.read_csv(entry_file.get())

        # Parse the sample share and the number of clusters
        sample_share = float(entry_percentage.get())
        n_clusters = int(entry_k.get())

        # Split the sampled rows into outliers and the rows to cluster
        outliers, inliers = check_outliers(frame, sample_share)

        # Cluster the ratings as a single-column matrix
        ratings = inliers['IMDB Rating'].values.reshape(-1, 1)
        centroids, clusters = kmeans(ratings, n_clusters)

        # Replace any previous output with the centroid summary
        text_output.delete(1.0, tk.END)
        write("Centroids:\n")
        write(str(centroids) + "\n\n")
        write("Cluster Sizes:\n")
        for number, members in enumerate(clusters, start=1):
            write(f"Cluster {number}: {len(members)} points\n")

        # List the rows that were rejected as outliers
        write(f"\nSize of Outliers: {len(outliers)}\n")
        for _, row in outliers.iterrows():
            write(f"Outlier Movie Name: {row['Movie Name']}, Rating: {row['IMDB Rating']}\n")

        # List the movies that ended up in each cluster
        write("\nContent of each cluster:\n")
        for number, members in enumerate(clusters, start=1):
            write(f"\nCluster {number}:\n")
            for position in members:
                title = inliers.iloc[position]['Movie Name']
                write(f"Movie Name: {title}, Rating: {ratings[position][0]}\n")

        # Tell the user the run finished
        messagebox.showinfo("K-Means Clustering", "K-Means clustering with outliers removed completed successfully.")
    except Exception as e:
        messagebox.showerror("Error", str(e))

# Build the main window
root = tk.Tk()
root.title("K-Means Clustering")

# Row 0: path of the input file, with a Browse button that fills it in
label_file = tk.Label(master=root, text="Select File:")
label_file.grid(row=0, column=0, sticky=tk.W)
entry_file = tk.Entry(master=root, width=50)
entry_file.grid(row=0, column=1, padx=5)
button_browse = tk.Button(master=root, text="Browse", command=browse_file)
button_browse.grid(row=0, column=2, padx=5)

# Row 1: share of the data to sample
label_percentage = tk.Label(master=root, text="Percentage of Data:")
label_percentage.grid(row=1, column=0, sticky=tk.W)
entry_percentage = tk.Entry(master=root)
entry_percentage.grid(row=1, column=1, padx=5)

# Row 2: number of clusters
label_k = tk.Label(master=root, text="Number of Clusters (K):")
label_k.grid(row=2, column=0, sticky=tk.W)
entry_k = tk.Entry(master=root)
entry_k.grid(row=2, column=1, padx=5)

# Row 3: button that starts outlier removal followed by clustering
button_run = tk.Button(
    master=root,
    text="Run K-Means with Outliers Removed",
    command=run_kmeans_with_outliers,
)
button_run.grid(row=3, column=1, pady=10)

# Row 4: text area that receives the results
label_output = tk.Label(master=root, text="Output:")
label_output.grid(row=4, column=0, sticky=tk.W)
text_output = tk.Text(master=root, height=20, width=100)
text_output.grid(row=4, column=1, columnspan=2, padx=5, pady=5)

# Hand control to Tk until the window is closed
root.mainloop()


[[7.1]
 [7.8]
 [7.4]]
[[6.46639004]
 [7.99787234]
 [7.4490566 ]]
[[6.28783069]
 [8.06363636]
 [7.34180328]]
[[6.22339181]
 [7.99787234]
 [7.22764228]]
[[6.13825503]
 [7.99787234]
 [7.16275862]]
[[6.05116279]
 [7.95619048]
 [7.07142857]]
[[5.99487179]
 [7.95619048]
 [7.0373494 ]]
[[5.99487179]
 [7.89262295]
 [6.98456376]]
[[5.87789474]
 [7.89262295]
 [6.92222222]]
[[5.80952381]
 [7.89262295]
 [6.89065934]]
[[5.80952381]
 [7.84850746]
 [6.85470588]]
[[5.80952381]
 [7.84850746]
 [6.85470588]]
