# Part-1 : Developing my own version of K-Means from the algorithm
- Implemented my version of K-Means in KMeans.py file

# 1. Import required libraries
- Importing all the necessary libraries required

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from KMeans import KMeans
from sklearn.datasets import make_blobs

ImportError: cannot import name 'Cluster' from 'cluster' (/Users/ruchamaslekar/kmeans-ruchamaslekar/cluster.py)

# 2. Input data
- These data points represent coordinates in a 2D space.
- The goal is to apply the K-Means algorithm to cluster these points into distinct groups.
- Each sublist in X corresponds to the (x, y) coordinates of an instance.
- This sample data has 8 instances for demonstration purposes.

In [None]:
# Toy dataset: eight (x, y) points, four near the origin and four near (9, 9).
X = [
    [0, 0], [2, 2], [0, 2], [2, 0],
    [10, 10], [8, 8], [10, 8], [8, 10],
]

# 3. Applying K-Means Clustering
- Initializing a KMeans object with k=2, indicating the desired number of clusters.
- Performing K-Means clustering on the dataset X.
- The result includes the cluster label assigned to each data point (labels)
  and the coordinates of the cluster centroids (centroids).

In [None]:
# Build a K-Means model that is asked for two clusters (k=2).
kmeans = KMeans(k=2)
# fit() returns a pair, unpacked here into the per-point labels and the centroids.
labels, centroids = kmeans.fit(X)

# 4. Expected output
- Printing the expected cluster labels for the given dataset X.
- Printing the expected coordinates of cluster centroids after K-Means clustering.

In [None]:
# Reference answer for the toy dataset: four points in cluster 0 with centroid
# (1, 1) and four points in cluster 1 with centroid (9, 9).
expected_labels = [0, 0, 0, 0, 1, 1, 1, 1]
expected_centroids = [[1, 1], [9, 9]]

# Fixed the "lables" typo in the printed message.
print("Expected labels:", expected_labels)
print("Expected centroids:", expected_centroids)

# Part-2 : Performance Analysis

# 1. Generating My Cluster Assignments
- Using make_blobs to generate 700 instances of data points with 4 clusters in 2-dimensional space.
- Parameters: n_samples=700 (number of instances), centers=4 (number of clusters),
  cluster_std=0.60 (standard deviation of each cluster), random_state=0 (seed for reproducibility).
- Displaying the generated data points in X.

In [None]:
# Synthetic benchmark data: 700 points drawn around 4 centers with a fixed seed.
# make_blobs returns the points together with the index of the blob each point
# was drawn from, which serves as the ground truth below.
X, My_cluster_assignments = make_blobs(
    n_samples=700,
    centers=4,
    cluster_std=0.60,
    random_state=0,
)
X

# 2. Printing cluster_assignments

In [None]:
# Display the ground-truth blob index that make_blobs returned for each point.
My_cluster_assignments

# 3. My custom KMeans implementation
- Applying K-Means clustering with k=4 to the dataset X.
- Displaying the coordinates of cluster centroids after clustering.

In [None]:
# Cluster the synthetic points into four groups (k=4) with the K-Means class.
kmeans = KMeans(k=4)
# fit() returns a pair, unpacked here into the per-point labels and the centroids.
labels, centroids = kmeans.fit(X)
# Display the fitted centroid coordinates.
centroids

# 4.Scikit-learn KMeans implementation
- Importing necessary library
- Applying K-Means clustering with k=4 to the dataset X.
- Storing the cluster index predicted for each data point in sklearn_cluster_assignments.

In [None]:
# Import scikit-learn's estimator under an alias. A plain
# `from sklearn.cluster import KMeans` rebinds the name `KMeans` and shadows the
# custom class imported from KMeans.py, so re-running the custom K-Means cells
# afterwards would construct scikit-learn's class instead of the custom one.
from sklearn.cluster import KMeans as SklearnKMeans

# Fixed seed so the scikit-learn result is reproducible between runs.
kmeans_sklearn = SklearnKMeans(n_clusters=4, random_state=0)
# fit_predict fits the model and returns the cluster index of every point.
sklearn_cluster_assignments = kmeans_sklearn.fit_predict(X)

# 5. Comparison of Cluster Assignments: Custom KMeans vs. Scikit-learn KMeans vs. Actual Clusters
- Creating a 13x5 figure.
- Setting up a 1x3 grid.
- Actual clusters, color-coded by My_cluster_assignments (the ground truth from make_blobs).
- My KMeans clusters, color-coded by the labels from my custom implementation.
- Scikit-learn KMeans clusters, color-coded by scikit-learn assignments.
- Displaying the figure with three subplots.

In [None]:
# Side-by-side scatter plots of the same points, coloured by three different
# assignments. The original cell had the first two panels swapped: the panel
# titled 'Actual Clusters' was coloured by the custom K-Means `labels`, and the
# panel titled 'My KMeans Clusters' by the make_blobs ground truth.
panels = [
    ('Actual Clusters', My_cluster_assignments),              # ground truth from make_blobs
    ('My KMeans Clusters', labels),                           # custom K-Means output
    ('Scikit-learn KMeans Clusters', sklearn_cluster_assignments),
]

plt.figure(figsize=(13, 5))

for position, (title, assignments) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=assignments)
    plt.title(title)
    plt.legend()

plt.show()

# 6. Comparison of Cluster Assignments

In [None]:
# Fraction of points on which two assignments agree exactly.
# `labels` is the custom K-Means output and `My_cluster_assignments` is the
# make_blobs ground truth. The original "vs Scikit-learn" figure compared the
# ground truth against scikit-learn and never used the custom labels.
custom_labels = np.asarray(labels)

comparison_my_vs_actual = np.mean(custom_labels == My_cluster_assignments)
comparison_my_vs_sklearn = np.mean(custom_labels == sklearn_cluster_assignments)

# NOTE: cluster indices are arbitrary, so two identical partitions can still
# score low here if their clusters are numbered differently. A
# permutation-invariant metric such as V-measure is the reliable comparison.
print(f"Comparison (My KMeans vs Actual): {comparison_my_vs_actual:.2%}")
print(f"Comparison (My KMeans vs Scikit-learn KMeans): {comparison_my_vs_sklearn:.2%}")

# 7. V-Measure Comparison Between Custom K-Means and Scikit-Learn K-Means
- Calculating and printing the V-Measure score, a metric for clustering evaluation, for each implementation.
- Comparing the clustering results of the custom K-Means implementation and scikit-learn's K-Means against the actual cluster assignments.

In [None]:
from sklearn.metrics import v_measure_score

# Score each clustering against the make_blobs ground truth: first the custom
# K-Means labels, then the scikit-learn assignments.
v_measure_my_kmeans = v_measure_score(
    My_cluster_assignments,
    labels,
)
v_measure_sklearn_kmeans = v_measure_score(
    My_cluster_assignments,
    sklearn_cluster_assignments,
)

print(f'V-Measure (My custom K-Means): {v_measure_my_kmeans}')
print(f'V-Measure (Scikit-Learn K-Means): {v_measure_sklearn_kmeans}')

# Part-3.1 : Demonstration of Clustering Algorithms on Chicago Dataset

# 1. Fetching Chicago taxi data
- Observed that the dataframe has 101,788 entries, ranging from index 0 to 101,787.

In [None]:
# Load the taxi-trip CSV and keep only the two pickup-coordinate columns.
chicago_taxi_df = pd.read_csv('Taxi_Trips__2013-2023_.csv')[
    ['Pickup Centroid Latitude', 'Pickup Centroid Longitude']
]
chicago_taxi_df

# 2. Displaying information about the Chicago Taxi DataFrame
- Observed that both columns are of the float64 data type.
- "Pickup Centroid Latitude" and "Pickup Centroid Longitude" have 95,657 non-null entries each, indicating the number of non-missing values.

In [None]:
# Print the column dtypes and non-null counts of the coordinate DataFrame.
chicago_taxi_df.info()

# 3.  Descriptive Statistics for Chicago Taxi Data
- Observed that the latitude values range from approximately 41.66 to 42.02, covering a span of about 0.36 degrees.
- The longitude values range from approximately -87.91 to -87.53, covering a span of about 0.38 degrees.

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) per column.
chicago_taxi_df.describe()

# 4.  Distribution Comparison of Pickup Centroid Latitude and Longitude
- The data has a relatively small spread around the mean, with a standard deviation of 0.06 for latitude.
- The distribution looks somewhat symmetrical, as the mean and median are close for latitude.
- The data has a moderate spread around the mean, with a standard deviation of 0.10 for longitude.
- The distribution looks somewhat symmetrical, as the mean and median are close for longitude.

In [None]:
# One histogram with a KDE overlay per coordinate column, drawn side by side.
plt.figure(figsize=(10, 5))

coordinate_panels = (
    ('Pickup Centroid Latitude', 'red'),
    ('Pickup Centroid Longitude', 'green'),
)

for panel_index, (column, colour) in enumerate(coordinate_panels, start=1):
    plt.subplot(1, 2, panel_index)
    sns.histplot(chicago_taxi_df[column], kde=True, color=colour)
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.title(f'Distribution of {column}')

plt.tight_layout()

plt.show()

# 5. Correlation Heatmap for Chicago Taxi Data

In [None]:
# Correlation between the two pickup coordinates, drawn as an annotated heatmap.
heatmap_data = chicago_taxi_df.loc[
    :, ['Pickup Centroid Latitude', 'Pickup Centroid Longitude']
]
correlation_matrix = heatmap_data.corr()

plt.figure(figsize=(6, 4))
sns.heatmap(
    correlation_matrix,
    annot=True,        # write the coefficient inside each cell
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
)
plt.title('Correlation Heatmap: Pickup Centroid Latitude and Longitude')
plt.show()

# 6. Scatter Plot 

In [None]:
# Scatter of raw pickup locations; the columns are looked up by name in `data`.
sns.scatterplot(
    data=chicago_taxi_df,
    x='Pickup Centroid Latitude',
    y='Pickup Centroid Longitude',
)

# 7. Checking for undefined values in dataframe

In [None]:
# Count the missing (NaN) entries in each column.
chicago_taxi_df.isnull().sum()

# 8. Cleaning the dataframe
- Cleaning the dataframe by removing rows with undefined (missing) values

In [None]:
# Drop every row that has a missing coordinate.
# The original used dropna(inplace=True), which returns None, so
# `cleaned_taxi_df` was always None. The non-inplace form returns the cleaned
# frame. `chicago_taxi_df` is rebound to it as well, because the following
# cells keep reading that name and expect it to be free of NaNs.
chicago_taxi_df = chicago_taxi_df.dropna()
cleaned_taxi_df = chicago_taxi_df
cleaned_taxi_df

# 9. Standardize the features
- Importing StandardScaler
- Extracting features 'Pickup Centroid Latitude', 'Pickup Centroid Longitude'

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize both coordinate columns. The scaler learns its parameters from
# the data and then transforms the same data.
scaler = StandardScaler()
scaler.fit(chicago_taxi_df)
df_taxi = scaler.transform(chicago_taxi_df)

# 10. Answers for questions mentioned in assignment pdf
- I chose k-means for its simplicity and efficiency in handling numerical data, suitable for exploratory clustering.
- Handled missing values, standardized features using StandardScaler, and determined k through iterative experimentation as part of pre-processing.
- Will give a visual representation of the k-means output with a scatter plot, color-coding data points by assigned cluster, below.
- Will use silhouette score and Davies-Bouldin index; results align with expectations, providing valuable insights for further analysis.

# 11. Applying K-Means on Chicago taxi data

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fit scikit-learn K-Means (4 clusters, fixed seed) on the standardized
# coordinates and keep the cluster index assigned to every row.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans_labels = kmeans.fit_predict(df_taxi)

# Silhouette score of the resulting partition.
kmeans_score = silhouette_score(df_taxi, kmeans_labels)

# fit_predict has already fitted the model. The original called
# kmeans.fit(df_taxi) again here, which repeated the whole fit only to display
# the estimator. Show the fitted estimator directly instead.
kmeans

# 12. Applying DBSCAN on Chicago taxi data

In [None]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Density-based clustering of the standardized coordinates.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(df_taxi)

# DBSCAN labels noise points -1. Scoring them as if they were a cluster
# distorts the silhouette, so they are left out of the score.
taxi_features = np.asarray(df_taxi)
clustered_mask = dbscan_labels != -1
n_dbscan_clusters = np.unique(dbscan_labels[clustered_mask]).size

# silhouette_score raises ValueError when fewer than two clusters are present,
# so report NaN in that case instead of crashing the notebook.
if n_dbscan_clusters >= 2:
    dbscan_score = silhouette_score(
        taxi_features[clustered_mask], dbscan_labels[clustered_mask]
    )
else:
    dbscan_score = float('nan')

# 13. Applying Hierarchical on Chicago taxi data

In [None]:
# Agglomerative (hierarchical) clustering of the standardized coordinates into
# three groups, followed by the silhouette score of that partition.
# NOTE(review): agglomerative clustering on the full dataset may need a lot of
# memory. Confirm it completes on this data size, or fit on a sample.
hierarchical = AgglomerativeClustering(n_clusters=3)
hierarchical.fit(df_taxi)
hierarchical_labels = hierarchical.labels_
hierarchical_score = silhouette_score(df_taxi, hierarchical_labels)

# 14. Visualization of K-Means, DBSCAN and Hierarchical
- Applying the k-means algorithm, DBSCAN and Hierarchical to preprocessed Chicago taxi data with scaled features.
- Visualizing the resulting clusters on scatter plots, where each data point is color-coded based on its assigned cluster.

In [None]:
# `df_taxi` is the array returned by StandardScaler.fit_transform, not a
# DataFrame, so it cannot be indexed by column name as the original did.
# Its columns follow the order of chicago_taxi_df: 0 is latitude, 1 is longitude.
taxi_features = np.asarray(df_taxi)
scaled_latitude = taxi_features[:, 0]
scaled_longitude = taxi_features[:, 1]

# Longitude on the x-axis and latitude on the y-axis, coloured by K-Means cluster.
plt.scatter(scaled_longitude, scaled_latitude, c=kmeans_labels, cmap='viridis', edgecolor='k')
plt.title('K-Means Clusters')
plt.xlabel('Pickup Centroid Longitude')
plt.ylabel('Pickup Centroid Latitude')
plt.show()