In [21]:
import pandas as pd
import numpy as np
import plotly.express as px
import os
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import plotly.subplots as sp
import plotly.graph_objs as go
import pickle

In [22]:
# Number of mixture components; passed to GaussianMixture and to generate_dataset below.
n_components = 7
# Location of the pickled dataset that the next cell opens and unpickles.
data_path = '../Data/SMAI-Dataset-problem-4.3/colors.pkl'

In [23]:
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# load this file if it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(data_path, 'rb') as data_file:
    data = pickle.load(data_file)
print(data[:5])

[[-2.31638050e+00  1.26969612e+02]
 [ 1.43003789e+00  1.27063470e+02]
 [-5.39587093e-01  1.25897475e+02]
 [ 8.82056170e-02  1.25327145e+02]
 [-1.41889010e+00  1.27049518e+02]]


## Computing likely Color Components
Below you can find the means, covariances, and probabilities (mixture weights) of the likely components, computed using a Gaussian mixture model, along with a plot of each component's Gaussian with its peak value marked.

In [24]:
from sklearn.mixture import GaussianMixture

# Fit a full-covariance Gaussian mixture to the data.
# random_state is pinned so that re-running the notebook yields the same
# components (the fit's initialization is otherwise random).
model = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
model.fit(data)

# Per-component parameters of the fitted mixture.
means = model.means_
covariances = model.covariances_
probs = model.weights_

In [25]:
from scipy.stats import multivariate_normal

def plot_gaussian(means , covariances , title = "Gaussian" , grid_range = (-500, 500) , grid_points = 100):
    """Plot a 2-D Gaussian density as a 3-D surface and mark its highest grid value.

    Parameters
    ----------
    means : mean of the Gaussian, passed to ``multivariate_normal.pdf`` as ``mean``.
    covariances : covariance of the Gaussian, passed as ``cov``.
    title : figure title.
    grid_range : ``(low, high)`` bounds used for both the x and y axes of the
        evaluation grid. Defaults to the previously hard-coded ``(-500, 500)``.
    grid_points : number of grid points per axis. Defaults to the previously
        hard-coded ``100``.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.

    Notes
    -----
    The marked peak is the maximum over the evaluation grid, so its accuracy
    is limited by the grid spacing; narrow the range or raise ``grid_points``
    for tightly concentrated Gaussians.
    """
    axis_values = np.linspace(grid_range[0], grid_range[1], grid_points)
    x, y = np.meshgrid(axis_values, axis_values)
    pos = np.dstack((x, y))
    pdf_values = multivariate_normal.pdf(pos, mean=means, cov=covariances)

    # Find the peak (mode) of the distribution on the evaluation grid
    peak_index = np.unravel_index(np.argmax(pdf_values), pdf_values.shape)
    peak_x = x[peak_index]
    peak_y = y[peak_index]
    peak_value = pdf_values[peak_index]

    fig = go.Figure(data=[
        go.Surface(z=pdf_values, x=x, y=y)
    ])

    # Overlay a single red marker at the grid maximum
    fig.add_trace(go.Scatter3d(
        x=[peak_x],
        y=[peak_y],
        z=[peak_value],
        mode='markers',
        marker=dict(
            size=8,
            color='red',
            symbol='circle',
            opacity=1
        ),
        text=f'Peak: ({peak_x:.2f}, {peak_y:.2f})',
        name='Peak'
    ))

    # Set the layout and labels
    fig.update_layout(
        title=title,
        scene=dict(
            xaxis_title='X-axis',
            yaxis_title='Y-axis',
            zaxis_title='PDF Value',
        )
    )

    # Show the plot
    fig.show()


# Draw one surface plot per fitted component, labelling it with its mixture weight.
for i, (component_mean, component_cov, component_prob) in enumerate(zip(means, covariances, probs)):
    title = f"Gaussian plot for {i + 1}th gaussian with prob = {component_prob}"
    plot_gaussian(means=component_mean, covariances=component_cov, title=title)

## Generating Dataset

In [33]:
def generate_dataset(n_components , means , covariances , samples_per_component = 15 , rng = None):
    """Draw synthetic points from each Gaussian component and stack them.

    Parameters
    ----------
    n_components : number of components to sample from; components
        ``0 .. n_components - 1`` of ``means`` / ``covariances`` are used.
    means : indexable of per-component mean vectors.
    covariances : indexable of per-component covariance matrices.
    samples_per_component : number of points drawn from every component
        (defaults to the previously hard-coded 15).
    rng : optional ``numpy.random.Generator`` for reproducible sampling.
        When ``None`` the global ``np.random`` state is used, as before.

    Returns
    -------
    numpy.ndarray with ``n_components * samples_per_component`` rows, the
    samples of component 0 first, then component 1, and so on.
    """
    sampler = np.random if rng is None else rng
    samples = [
        sampler.multivariate_normal(means[i], covariances[i], samples_per_component)
        for i in range(n_components)
    ]
    # np.vstack replaces the deprecated np.row_stack alias and already returns
    # an ndarray, so no extra np.array() wrap is needed.
    return np.vstack(samples)

In [34]:
# Sample synthetic points from the fitted mixture components.
sample_dataset = generate_dataset(
    n_components=n_components,
    means=means,
    covariances=covariances,
)

In [38]:
# Overlay the original data and the generated samples in one scatter plot.
original_trace = go.Scatter(
    x=data[:, 0],
    y=data[:, 1],
    mode='markers',
    name='Original Dataset',
)
generated_trace = go.Scatter(
    x=sample_dataset[:, 0],
    y=sample_dataset[:, 1],
    mode='markers',
    name='Generated Dataset',
)

# Build the figure incrementally, original data first so trace order is unchanged
fig = go.Figure()
fig.add_trace(original_trace)
fig.add_trace(generated_trace)

# Title and axis labels
layout_options = dict(
    title='Plotting Original dataset and Generated Points',
    xaxis_title='X-axis',
    yaxis_title='Y-axis',
)
fig.update_layout(**layout_options)

# Render the figure
fig.show()

The generated points are almost identical to the original dataset.