In [None]:
import pandas as pd
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the star catalogue CSV into memory, then mirror it into a local SQLite
# database so the same data can also be queried with SQL.
# NOTE(review): the CSV location is an absolute, machine-specific path.
csv_path = r"C:\Users\david\Downloads\hygdata_v41.csv"
df = pd.read_csv(csv_path)

# echo=True makes SQLAlchemy log the SQL statements it emits.
engine = create_engine('sqlite:///stars.db', echo=True)
sqlite_connection = engine.connect()

# if_exists='replace' overwrites any existing `stars` table; the DataFrame
# index is not written as a column.
df.to_sql(name='stars', con=sqlite_connection, if_exists='replace', index=False)



In [None]:
# Read 100 rows of the `stars` table back out of SQLite.
# LIMIT without ORDER BY gives the first 100 rows the database returns,
# not a ranked "top 100".
query = "SELECT * FROM stars LIMIT 100;"
df_result = pd.read_sql(sql=query, con=engine)

In [None]:
# Show the rows read back from the `stars` table (plain-text rendering via print).
print(df_result)

In [None]:
# Tally how often each distinct magnitude value occurs (most frequent first);
# the length of the result is the number of unique values.
mag_counts = df['mag'].value_counts()
print(mag_counts)


In [None]:
# Remove magnitude outliers with Tukey's fences: keep only rows whose `mag`
# lies within 1.5 * IQR of the first and third quartiles.
# Q1 and Q3 must be the 25th and 75th percentiles for IQR to actually be the
# inter-quartile range (the 10th/90th percentiles make the fences far too wide).
Q1 = df['mag'].quantile(0.25)
Q3 = df['mag'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# `between` is inclusive on both ends by default; rows with a missing `mag`
# fail the test and are dropped, exactly as with explicit >= / <= comparisons.
df_filtered = df[df['mag'].between(lower_bound, upper_bound)]

In [None]:
# Histogram of the outlier-filtered magnitudes; 100 bins give finer bars.
fig, ax = plt.subplots()
ax.hist(df_filtered['mag'], bins=100, edgecolor='black')
ax.set_xlabel("Magnitude")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Star Magnitudes")
plt.show()

In [None]:
# Keep only the columns analysed below, dropping every row that is missing
# any one of them.
df_cleaned = df[['mag', 'spect', 'lum', 'dist', 'absmag']].dropna()
print(df_cleaned.shape)

# Draw a small sample so the pairplot stays fast. A fixed random_state makes
# the figure identical on every run, and min() avoids a ValueError when fewer
# than 10 rows survive the dropna above.
df_sample = df_cleaned.sample(n=min(10, len(df_cleaned)), random_state=42)

# NOTE(review): pairplot only draws numeric variables, so `spect` is presumably
# skipped if it holds spectral-type strings — confirm against the data.
sns.pairplot(df_sample[['mag', 'spect', 'lum', 'dist']])
plt.show()

In [None]:
# Magnitude against absolute magnitude for the cleaned rows.
# Very small markers (s=.01) keep the dense cloud readable.
fig, ax = plt.subplots()
ax.scatter(df_cleaned['mag'], df_cleaned['absmag'], s=.01)
ax.set_xlabel('Magnitude')
ax.set_ylabel('Absolute Magnitude')
ax.set_title('Magnitude vs. Absolute Magnitude')
ax.set_xlim(-10, 20)
plt.show()

In [None]:
# Magnitude against distance for the cleaned rows, with distance on a log axis.
plt.scatter(df_cleaned['mag'], df_cleaned['dist'], s=.1)
plt.yscale('log')
plt.xlabel('Magnitude')
# The y-axis shows the `dist` column, so label it as distance
# (it was previously mislabelled as absolute magnitude).
plt.ylabel('Distance')
plt.title('Magnitude vs. Distance (Log Scale)')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns of the cleaned frame,
# rendered as an annotated heatmap.
numeric_cols = ['mag', 'lum', 'dist', 'absmag']
df_spec = df_cleaned[numeric_cols]
correlation_matrix = df_spec.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# NOTE(review): this coefficient is computed on the full `df`, whereas the
# heatmap uses `df_cleaned`, so the printed value may differ from the
# mag/dist cell of the heatmap.
correlation_value = df['mag'].corr(df['dist'])
print(f"Pearson correlation between mag and dist: {correlation_value}")


In [None]:
# List the column labels of the loaded frame.
print(df.columns)

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature frame for clustering: rows that have all of lum, dist and ci.
dfl = df.loc[:, ['lum', 'dist', 'ci']].dropna()

# Standardise each feature so no single column dominates distance-based
# methods purely because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(dfl)

In [None]:
# Euclidean norms of the velocity (vx, vy, vz) and position (x, y, z)
# component columns, stored as new columns on `df`.
# sum(axis=1) skips missing components, so a row with NaNs yields the norm of
# whatever components are present rather than NaN.
df['velocity_total'] = df[['vx', 'vy', 'vz']].pow(2).sum(axis=1).pow(0.5)
df['position_r'] = df[['x', 'y', 'z']].pow(2).sum(axis=1).pow(0.5)


In [None]:
from sklearn.cluster import KMeans

# Partition the standardised features into three clusters.
# k-means starts from randomly chosen centroids, so a fixed random_state is
# needed for the cluster labels (and the plot colours) to be the same on
# every run.
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Attach the label of each row to the feature frame so seaborn can colour by it.
dfl['cluster'] = clusters

# Luminosity against distance, coloured by cluster, on log-log axes.
sns.scatterplot(x='lum', y='dist', hue='cluster', data=dfl)
plt.yscale('log')
plt.xscale('log')
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D  # not referenced by name below; import kept as in the original

# Static 3D scatter of the x/y/z position columns, one small marker per row.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(df['x'], df['y'], df['z'], s=1)
ax.set_title("3D Spatial Distribution of Stars")
plt.show()

In [None]:
import plotly.express as px
import plotly.io as pio

# Send plotly figures to the 'browser' renderer instead of the notebook output.
pio.renderers.default = 'browser'

# Interactive 3D scatter of the x/y/z position columns, coloured by magnitude.
fig = px.scatter_3d(
    data_frame=df,
    x='x',
    y='y',
    z='z',
    color='mag',
    opacity=0.7,
    title='Interactive 3D Spatial Distribution of Stars',
)
fig.show()

In [None]:
# Release database resources: close the connection opened when the data was
# loaded, then dispose of the engine.
sqlite_connection.close()
engine.dispose()
