# Astronomy Data Analysis with AI Techniques

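This notebook demonstrates how machine-learning techniques can be applied to astronomical data analysis. Using synthetic datasets that stand in for galaxy and exoplanet measurements, we reduce dimensionality with PCA, cluster with KMeans, and train Random Forest classifiers to classify galaxy types and detect exoplanets.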

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets import make_classification

In [2]:
# Generate synthetic dataset representing galaxy data
def generate_galaxy_data(n_samples=10000, random_state=42):
    """Build a synthetic three-class table that stands in for galaxy data.

    The rows are produced by scikit-learn's ``make_classification``; nothing
    here is real astronomical data.

    Parameters
    ----------
    n_samples : int, default 10000
        Number of rows to generate.
    random_state : int, RandomState instance or None, default 42
        Seed forwarded to ``make_classification``. It is passed explicitly
        instead of calling ``np.random.seed`` so that this function does not
        reseed NumPy's global random generator as a side effect.

    Returns
    -------
    pandas.DataFrame
        One column per generated feature (``feature_0`` ... ``feature_9``)
        followed by a ``label`` column with the class assigned to each row.
    """
    features, labels = make_classification(
        n_samples=n_samples,
        n_features=10,
        n_informative=5,
        n_redundant=2,
        n_clusters_per_class=1,
        n_classes=3,
        random_state=random_state,
    )
    # Column names are derived from the actual width of the feature matrix
    columns = [f'feature_{i}' for i in range(features.shape[1])]
    df = pd.DataFrame(features, columns=columns)
    df['label'] = labels
    return df

# Build the galaxy table, then separate the class labels from the predictors
df_galaxy = generate_galaxy_data()
y_galaxy = df_galaxy['label']
X_galaxy = df_galaxy.drop(columns=['label'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
(
    X_galaxy_train,
    X_galaxy_test,
    y_galaxy_train,
    y_galaxy_test,
) = train_test_split(X_galaxy, y_galaxy, test_size=0.3, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits
scaler_galaxy = StandardScaler().fit(X_galaxy_train)
X_galaxy_train_scaled = scaler_galaxy.transform(X_galaxy_train)
X_galaxy_test_scaled = scaler_galaxy.transform(X_galaxy_test)

In [3]:
# Reduce the standardized features to two principal components.
# The projection is learned from the training split and reused for the test split.
pca_galaxy = PCA(n_components=2)
X_galaxy_train_pca = pca_galaxy.fit_transform(X_galaxy_train_scaled)
X_galaxy_test_pca = pca_galaxy.transform(X_galaxy_test_scaled)

# Scatter the training rows in PCA space, coloured by their class label
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=X_galaxy_train_pca[:, 0],
    y=X_galaxy_train_pca[:, 1],
    hue=y_galaxy_train,
    palette='viridis',
    s=50,
    ax=ax,
)
ax.set_title('PCA of Galaxy Data')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend(title='Galaxy Type')
plt.show()

In [4]:
# Partition the 2-D PCA projection into three KMeans clusters. The model is
# fitted on the training projection (labels are not passed to it); test rows
# are then assigned to the clusters learned from training.
kmeans_galaxy = KMeans(n_clusters=3, random_state=42)
y_galaxy_train_clusters = kmeans_galaxy.fit_predict(X_galaxy_train_pca)
y_galaxy_test_clusters = kmeans_galaxy.predict(X_galaxy_test_pca)

# Same PCA scatter as before, but coloured by the assigned cluster id
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=X_galaxy_train_pca[:, 0],
    y=X_galaxy_train_pca[:, 1],
    hue=y_galaxy_train_clusters,
    palette='viridis',
    s=50,
    ax=ax,
)
ax.set_title('KMeans Clustering of Galaxy Data')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend(title='Cluster')
plt.show()

In [5]:
# Train a 100-tree Random Forest on the standardized training features
clf_galaxy = RandomForestClassifier(n_estimators=100, random_state=42)
clf_galaxy.fit(X_galaxy_train_scaled, y_galaxy_train)

# Score the held-out test split: per-class report first, then the confusion matrix
y_galaxy_pred = clf_galaxy.predict(X_galaxy_test_scaled)
galaxy_report = classification_report(
    y_galaxy_test,
    y_galaxy_pred,
    target_names=['Galaxy Type 1', 'Galaxy Type 2', 'Galaxy Type 3'],
)
galaxy_confusion = confusion_matrix(y_galaxy_test, y_galaxy_pred)
print(galaxy_report)
print(galaxy_confusion)

In [6]:
# Feature importance
# Rank the galaxy classifier's feature importances from largest to smallest.
# sort_values returns a new Series, so the ranking is built in a single
# expression rather than by mutating the Series with inplace=True.
feature_importances_galaxy = pd.Series(
    clf_galaxy.feature_importances_, index=X_galaxy.columns
).sort_values(ascending=False)

# Horizontal bars: importance score on x, feature name on y
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_importances_galaxy, y=feature_importances_galaxy.index, ax=ax)
ax.set_title('Feature Importances in Galaxy Classification')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
plt.show()

In [7]:
# Generate synthetic dataset representing exoplanet data
def generate_exoplanet_data(n_samples=10000, random_state=42):
    """Build a synthetic two-class table that stands in for exoplanet data.

    The rows are produced by scikit-learn's ``make_classification``; nothing
    here is real observational data.

    Parameters
    ----------
    n_samples : int, default 10000
        Number of rows to generate.
    random_state : int, RandomState instance or None, default 42
        Seed forwarded to ``make_classification``. It is passed explicitly
        instead of calling ``np.random.seed`` so that this function does not
        reseed NumPy's global random generator as a side effect.

    Returns
    -------
    pandas.DataFrame
        One column per generated feature (``feature_0`` ... ``feature_9``)
        followed by a ``label`` column with the class assigned to each row.
    """
    features, labels = make_classification(
        n_samples=n_samples,
        n_features=10,
        n_informative=5,
        n_redundant=2,
        n_clusters_per_class=1,
        n_classes=2,
        random_state=random_state,
    )
    # Column names are derived from the actual width of the feature matrix
    columns = [f'feature_{i}' for i in range(features.shape[1])]
    df = pd.DataFrame(features, columns=columns)
    df['label'] = labels
    return df

# Build the exoplanet table, then separate the class labels from the predictors
df_exoplanet = generate_exoplanet_data()
y_exoplanet = df_exoplanet['label']
X_exoplanet = df_exoplanet.drop(columns=['label'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
(
    X_exoplanet_train,
    X_exoplanet_test,
    y_exoplanet_train,
    y_exoplanet_test,
) = train_test_split(X_exoplanet, y_exoplanet, test_size=0.3, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits
scaler_exoplanet = StandardScaler().fit(X_exoplanet_train)
X_exoplanet_train_scaled = scaler_exoplanet.transform(X_exoplanet_train)
X_exoplanet_test_scaled = scaler_exoplanet.transform(X_exoplanet_test)

In [8]:
# Train a 100-tree Random Forest on the standardized training features
clf_exoplanet = RandomForestClassifier(n_estimators=100, random_state=42)
clf_exoplanet.fit(X_exoplanet_train_scaled, y_exoplanet_train)

# Score the held-out test split: per-class report first, then the confusion matrix
y_exoplanet_pred = clf_exoplanet.predict(X_exoplanet_test_scaled)
exoplanet_report = classification_report(
    y_exoplanet_test,
    y_exoplanet_pred,
    target_names=['No Exoplanet', 'Exoplanet'],
)
exoplanet_confusion = confusion_matrix(y_exoplanet_test, y_exoplanet_pred)
print(exoplanet_report)
print(exoplanet_confusion)

In [9]:
# Feature importance
# Rank the exoplanet classifier's feature importances from largest to smallest.
# sort_values returns a new Series, so the ranking is built in a single
# expression rather than by mutating the Series with inplace=True.
feature_importances_exoplanet = pd.Series(
    clf_exoplanet.feature_importances_, index=X_exoplanet.columns
).sort_values(ascending=False)

# Horizontal bars: importance score on x, feature name on y
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_importances_exoplanet, y=feature_importances_exoplanet.index, ax=ax)
ax.set_title('Feature Importances in Exoplanet Detection')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
plt.show()