In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 

In [2]:
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the exoplanet table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='exoplanets.csv')

In [4]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,Planet Name,Mass,Radius,Temperature,Orbital Period,Stellar Temperature
0,BD-14 3065 b,3932.0,21.59,2001.0,4.288973,6935.0
1,CFHTWIR-Oph 98 b,2479.061575,20.848704,1800.0,8040000.0,2320.0
2,CoRoT-1 b,327.35,16.7,1898.0,1.508956,5950.0
3,CoRoT-10 b,874.0,10.87,600.0,13.2406,5075.0
4,CoRoT-11 b,740.51,16.03,1657.0,2.99433,6440.0


In [5]:
# List the column labels of the loaded table (identifier column plus the numeric features).
df.columns

Index(['Planet Name', 'Mass', 'Radius', 'Temperature', 'Orbital Period',
       'Stellar Temperature'],
      dtype='object')

In [6]:
# Total number of missing cells across the whole table (per-column counts, then summed).
df.isna().sum().sum()

0

In [7]:
# Feature frame: every column of df except the 'Planet Name' identifier.
X = df.drop('Planet Name', axis=1)

# Scaler instance; it is only constructed here and fitted in a later cell.
sc = StandardScaler()

In [8]:
# Fit the scaler on the feature columns and standardize them.
# A later cell adds a 'Cluster' column to X in place; dropping it here (when
# present) keeps this cell idempotent, so re-running it after clustering does
# not fit the scaler on the cluster labels. On a fresh run the column does not
# exist and errors='ignore' makes the drop a no-op.
X_scaled = sc.fit_transform(X.drop(columns='Cluster', errors='ignore'))

In [9]:
# Display the standardized feature array returned by sc.fit_transform.
X_scaled

array([[ 5.16085073e+00,  2.20664503e+00,  1.58990198e+00,
        -3.28250999e-02,  1.63277740e+00],
       [ 3.07982530e+00,  2.08010617e+00,  1.22033779e+00,
         3.05286708e+01, -3.15876748e+00],
       [-2.04413531e-03,  1.37192434e+00,  1.40052332e+00,
        -3.28356673e-02,  6.10096640e-01],
       ...,
       [ 7.07876802e-02,  7.02782193e-01,  1.72319943e-01,
        -3.28254847e-02,  7.02042596e-02],
       [ 1.53208179e+00,  2.48192021e+00,  8.10323793e-01,
        -3.28270917e-02,  1.40955267e+00],
       [-1.48149670e-01,  1.14830062e+00,  1.11553601e+00,
        -3.28305160e-02,  9.21573014e-01]])

In [10]:
# Candidate component counts K = 2..10, and the BIC of a mixture fitted at each K.
n_components = np.arange(2, 11)
bics = []

for n in n_components:
    # Fixed random_state so the fitted mixture (and its BIC) is reproducible.
    gmm = GaussianMixture(n_components=n, random_state=42).fit(X_scaled)
    bics.append(gmm.bic(X_scaled))

# Elbow heuristic: take the second-order difference of the BIC sequence as a
# discrete curvature estimate and pick the K where it is largest.
bic_diffs = np.diff(bics, n=2)

# The second difference at position i is centred on bics[i + 1], hence the +1
# when mapping the argmax back onto n_components.
elbow_idx = np.argmax(bic_diffs) + 1
optimal_k = n_components[elbow_idx]

print(f"\nOptimal number of clusters found: K = {optimal_k}\n")


Optimal number of clusters found: K = 3



In [11]:
print(f"Training final model with {optimal_k} clusters...")

# Fit the final mixture with the selected component count (same seed as the
# BIC sweep); fit() returns the estimator itself, so it can be chained.
gmm = GaussianMixture(n_components=optimal_k, random_state=42).fit(X_scaled)

# Hard-assign every row to a mixture component and attach the label to X.
# NOTE(review): this adds a column to X in place, and X is also the frame the
# scaler was fitted on in an earlier cell.
cluster_labels = gmm.predict(X_scaled)
X['Cluster'] = cluster_labels

print("Successfully assigned planets to clusters.\n")


Training final model with 3 clusters...
Successfully assigned planets to clusters.



In [12]:
# Display the per-row component labels returned by gmm.predict.
cluster_labels

array([1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [13]:
# Display the feature frame, which now includes the 'Cluster' column assigned above.
X

Unnamed: 0,Mass,Radius,Temperature,Orbital Period,Stellar Temperature,Cluster
0,3932.000000,21.590000,2001.0,4.288973e+00,6935.0,1
1,2479.061575,20.848704,1800.0,8.040000e+06,2320.0,2
2,327.350000,16.700000,1898.0,1.508956e+00,5950.0,1
3,874.000000,10.870000,600.0,1.324060e+01,5075.0,1
4,740.510000,16.030000,1657.0,2.994330e+00,6440.0,1
...,...,...,...,...,...,...
928,2.530000,1.240000,996.0,5.734745e-01,3542.0,0
929,6.270000,2.043000,789.0,6.001270e+00,4716.0,0
930,378.200000,12.780000,1230.0,4.187756e+00,5430.0,1
931,1398.452000,23.202630,1577.0,3.765001e+00,6720.0,1


In [14]:
# Per-cluster profile: mean of every feature column within each cluster,
# rounded to two decimals for readability.
cluster_profiles = X.groupby('Cluster').agg('mean').round(decimals=2)

print("--- Cluster Profiles (Average Values) ---", cluster_profiles, sep="\n")

--- Cluster Profiles (Average Values) ---
            Mass  Radius  Temperature  Orbital Period  Stellar Temperature
Cluster                                                                   
0          11.02    2.60       870.74           14.15              4929.72
1         548.50   12.91      1321.96           28.28              5672.47
2        2479.06   20.85      1800.00      8040000.00              2320.00


In [15]:
# Write the feature columns plus their cluster label to disk, without the row index.
# NOTE(review): 'Planet Name' was dropped when X was built, so rows in this
# file can only be matched back to planets by position.
X.to_csv(path_or_buf='exoplanets_clustered_final.csv', index=False)
print("\nSaved the final results with cluster assignments to 'exoplanets_clustered_final.csv'.")


Saved the final results with cluster assignments to 'exoplanets_clustered_final.csv'.


In [16]:
import joblib


In [19]:
# Serialize the fitted final mixture model to disk.
joblib.dump(value=gmm, filename='gmm_model_1.joblib')

['gmm_model_1.joblib']

In [20]:
# Serialize the fitted scaler alongside the model so new data can be
# standardized the same way before prediction.
joblib.dump(value=sc, filename='scaler.joblib')

['scaler.joblib']

In [21]:
# Write the per-cluster mean table to disk; the 'Cluster' index is kept as the first column.
cluster_profiles.to_csv(path_or_buf='cluster_profiles.csv', index=True)