# In [7]:
import pandas as pd
import numpy as np
import tkinter as tk
from tkinter import filedialog, messagebox

def initialize_centroids(data, k):
    """Pick ``k`` starting centroids by sampling ``data`` without replacement.

    Distinct values are sampled whenever at least ``k`` of them exist, so
    that repeated values in ``data`` cannot produce identical starting
    centroids (identical centroids leave all but the first of them with an
    empty cluster, because ``argmin`` resolves ties to the lowest index).
    When fewer than ``k`` distinct values exist, the raw points are sampled
    instead, as before.

    Args:
        data: 1-D array-like of numeric values.
        k: Number of centroids to draw.

    Returns:
        A NumPy array of ``k`` values taken from ``data``.

    Raises:
        ValueError: If ``k`` is larger than the number of points in ``data``.
    """
    values = np.asarray(data)
    if k > values.size:
        raise ValueError(
            f"Cannot choose {k} centroids from only {values.size} data point(s)."
        )

    distinct = np.unique(values)
    # Fall back to the raw points only when there are not enough distinct ones.
    pool = distinct if distinct.size >= k else values
    return np.random.choice(pool, size=k, replace=False)

def assign_clusters(data, centroids):
    """Return, for every point in ``data``, the index of its nearest centroid.

    ``data`` is turned into a column so that subtracting ``centroids``
    broadcasts to a (points x centroids) table of absolute distances; the
    position of the smallest entry in each row is that point's label.
    """
    as_column = data[:, np.newaxis]
    distance_table = np.abs(as_column - centroids)
    return distance_table.argmin(axis=1)

def update_centroids(data, clusters, k):
    """Recompute the ``k`` centroids from the current cluster labels.

    Each centroid becomes the mean of the points labelled with its index.
    A label that no point carries gets a randomly chosen point from ``data``
    as its centroid instead.
    """
    refreshed = np.zeros(k)
    for label in range(k):
        members = data[clusters == label]
        if len(members) == 0:
            # Empty cluster: re-seed it from a random data point.
            refreshed[label] = np.random.choice(data)
            continue
        refreshed[label] = members.mean()
    return refreshed

def kmeans(data, k, max_iterations=100):
    """Run 1-D K-means on ``data`` and return the labels and centroids.

    Args:
        data: 1-D array-like of numeric values.
        k: Number of clusters.
        max_iterations: Upper bound on the number of centroid updates.

    Returns:
        A ``(clusters, centroids)`` tuple: ``clusters`` holds one centroid
        index per point and ``centroids`` holds the ``k`` centroid values.
        The labels always correspond to the returned centroids.
    """
    data = np.asarray(data)
    centroids = initialize_centroids(data, k)
    # Label once up front so ``clusters`` is defined even when
    # ``max_iterations`` is 0.
    clusters = assign_clusters(data, centroids)
    for _ in range(max_iterations):
        new_centroids = update_centroids(data, clusters, k)
        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids
        # Re-label right after moving the centroids so the returned labels
        # never lag behind the returned centroids when the budget runs out.
        clusters = assign_clusters(data, centroids)
    return clusters, centroids

# Function to handle file selection
def browse_file():
    """Ask the user for a CSV file and put its path into ``entry_path``.

    If the dialog is dismissed without a selection, the current content of
    ``entry_path`` is left untouched.
    """
    file_path = filedialog.askopenfilename(filetypes=[("CSV files", "*.csv")])
    if not file_path:
        # Dialog was cancelled: keep whatever path is already entered.
        return
    entry_path.delete(0, tk.END)
    entry_path.insert(0, file_path)

# Function to handle K-Means clustering
def perform_clustering():
    """Cluster a sample of the selected CSV by IMDB rating and show the result.

    Reads the file path from ``entry_path``, the sampling percentage from
    ``entry_percentage`` and the cluster count from ``entry_k``. Ratings
    outside the 1.5 * IQR fences are set aside as outliers, the requested
    percentage of the remaining rows is sampled and clustered with
    ``kmeans``, and the clusters followed by the outliers are written to
    ``text_output``.

    Any problem with the file or the user input is reported through an
    error dialog and the function returns without changing the output.
    """
    rating_col = 'IMDB Rating'
    title_col = 'Movie Name'

    # Read the data from the input file
    input_file_path = entry_path.get()
    try:
        data = pd.read_csv(input_file_path)
    except Exception as e:
        messagebox.showerror("Error", f"Failed to read the file: {e}")
        return

    missing = [col for col in (title_col, rating_col) if col not in data.columns]
    if missing:
        messagebox.showerror(
            "Error", f"Missing required column(s): {', '.join(missing)}"
        )
        return

    # Get and validate user input
    try:
        percentage = float(entry_percentage.get())
        k = int(entry_k.get())
    except ValueError:
        messagebox.showerror(
            "Error",
            "Percentage must be a number and K must be a whole number.",
        )
        return
    # The chained comparison also rejects NaN and infinity.
    if not 0 < percentage <= 100:
        messagebox.showerror(
            "Error", "Percentage must be greater than 0 and at most 100."
        )
        return
    if k < 1:
        messagebox.showerror("Error", "Number of clusters (K) must be at least 1.")
        return

    # Preprocess the data: non-numeric ratings become NaN and are dropped
    data[rating_col] = pd.to_numeric(data[rating_col], errors='coerce')
    data = data.dropna(subset=[rating_col])

    # Detect outliers based on the entire dataset
    q1 = data[rating_col].quantile(0.25)
    q3 = data[rating_col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    within_bounds = data[rating_col].between(lower_bound, upper_bound)
    outliers = data[~within_bounds]

    # Exclude outliers from the dataset
    data = data[within_bounds]

    # Calculate the number of records to be read based on the percentage
    num_records = int(len(data) * (percentage / 100))
    if num_records < k:
        messagebox.showerror(
            "Error",
            f"Only {num_records} record(s) selected; need at least K = {k}. "
            "Increase the percentage or lower K.",
        )
        return

    # Select the required number of records randomly; copy so that adding
    # the 'Cluster' column below writes to an independent frame.
    selected_data = data.sample(n=num_records, random_state=42).copy()

    # Run K-Means algorithm
    clusters, _ = kmeans(selected_data[rating_col].to_numpy(), k)

    # Assign clusters to the selected data
    selected_data['Cluster'] = clusters

    # Display Clusters and Outliers
    text_output.delete("1.0", tk.END)  # Clear previous output

    # Display clusters
    for cluster_id in range(k):
        members = selected_data[selected_data['Cluster'] == cluster_id]

        # Display cluster name
        text_output.insert(tk.END, f"\n\nCluster {cluster_id + 1}:\n", "green_bold")

        # Display cluster contents
        for name, rating in zip(members[title_col], members[rating_col]):
            text_output.insert(tk.END, f"{str(name):<30} {rating:<10.2f}\n")

    # Display outliers separately
    text_output.insert(tk.END, "\n\nOutliers:\n", "red_bold")
    for name, rating in zip(outliers[title_col], outliers[rating_col]):
        text_output.insert(tk.END, f"{str(name):<30} {rating:<10.2f}\n", "black")

# ---------------------------------------------------------------------------
# GUI construction
# ---------------------------------------------------------------------------
# Shared look-and-feel values used by the widgets below.
WINDOW_BG = "#F0F0F0"
PANEL_BG = "#E6E6E6"
ACTION_BG = "#4CAF50"
FRAME_OPTIONS = {"bg": PANEL_BG, "padx": 10, "pady": 5}
FRAME_PLACEMENT = {"column": 0, "padx": 10, "pady": 5, "sticky": "ew"}
CELL_PADDING = {"padx": 5, "pady": 5}
ACTION_COLORS = {"bg": ACTION_BG, "fg": "white"}
BOLD_FONT = ("Helvetica", 10, "bold")

# Main window
root = tk.Tk()
root.title("K-means Using IMDB Rating")
root.config(bg=WINDOW_BG)

# Four stacked panels: file picker, options, action button, output
file_frame = tk.LabelFrame(root, text="File", **FRAME_OPTIONS)
file_frame.grid(row=0, **FRAME_PLACEMENT)

options_frame = tk.LabelFrame(root, text="Options", **FRAME_OPTIONS)
options_frame.grid(row=1, **FRAME_PLACEMENT)

button_frame = tk.LabelFrame(root, **FRAME_OPTIONS)
button_frame.grid(row=2, **FRAME_PLACEMENT)

output_frame = tk.LabelFrame(root, text="Output", **FRAME_OPTIONS)
output_frame.grid(row=3, **FRAME_PLACEMENT)

# File panel: label, path entry and browse button on one row
label_path = tk.Label(file_frame, text="Input File Path:", bg=PANEL_BG)
label_path.grid(row=0, column=0, **CELL_PADDING)

entry_path = tk.Entry(file_frame, width=50, bg="white")
entry_path.grid(row=0, column=1, **CELL_PADDING)

button_browse = tk.Button(
    file_frame, text="Browse", command=browse_file, **ACTION_COLORS
)
button_browse.grid(row=0, column=2, **CELL_PADDING)

# Options panel: sampling percentage on row 0, cluster count on row 1
label_percentage = tk.Label(options_frame, text="Percentage of Data:", bg=PANEL_BG)
label_percentage.grid(row=0, column=0, sticky="w", **CELL_PADDING)

entry_percentage = tk.Entry(options_frame, bg="white")
entry_percentage.grid(row=0, column=1, **CELL_PADDING)

label_k = tk.Label(options_frame, text="Number of Clusters (K):", bg=PANEL_BG)
label_k.grid(row=1, column=0, sticky="w", **CELL_PADDING)

entry_k = tk.Entry(options_frame, bg="white")
entry_k.grid(row=1, column=1, **CELL_PADDING)

# Action panel
button_cluster = tk.Button(
    button_frame,
    text="Perform Clustering",
    command=perform_clustering,
    **ACTION_COLORS,
)
button_cluster.grid(row=0, column=0, **CELL_PADDING)

# Output panel: text area with a vertical scrollbar to its right
scrollbar = tk.Scrollbar(output_frame, orient=tk.VERTICAL)
scrollbar.grid(row=0, column=1, sticky="ns")

text_output = tk.Text(
    output_frame, height=20, width=70, yscrollcommand=scrollbar.set
)
text_output.grid(row=0, column=0, sticky="nsew", **CELL_PADDING)

# Wire the scrollbar back to the text area so dragging it scrolls the text
scrollbar.config(command=text_output.yview)

# Text tags used when writing cluster headings, outlier headings and rows
text_output.tag_config("red_bold", foreground="red", font=BOLD_FONT)
text_output.tag_config("black", foreground="black")
text_output.tag_config("green_bold", foreground="green", font=BOLD_FONT)

# Hand control to Tk; this call blocks until the window is closed
root.mainloop()