In [1]:
import sys

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    TimestampType,
)

In [None]:
# Start (or reuse) a Spark session running in local mode.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("StellarClassifier")
    .getOrCreate()
)
csv_path = "star_classification.csv"

# Load the CSV: the first line supplies the column names and Spark infers
# each column's type from the data.
df = spark.read.options(header=True, inferSchema=True).csv(csv_path)

# Best-effort row count: if counting fails, report the error and carry on
# with row_count left as None so the preview below is still attempted.
row_count = None
try:
    row_count = df.count()
except Exception as count_err:
    print("Could not count rows:", count_err)

print(f"Loaded rows: {row_count} (path: {csv_path})")
df.printSchema()
df.show(n=10, truncate=False)

# Optional quick look in pandas: only the first 1000 rows are pulled to the
# driver; a failure here is reported but does not stop the notebook.
try:
    pandas_df = df.limit(1000).toPandas()
    print("Converted to pandas, shape:", pandas_df.shape)
except Exception as pandas_err:
    print("Couldn't convert to pandas:", pandas_err)


Loaded rows: 100000 (path: star_classification.csv)
root
 |-- obj_ID: double (nullable = true)
 |-- alpha: double (nullable = true)
 |-- delta: double (nullable = true)
 |-- u: double (nullable = true)
 |-- g: double (nullable = true)
 |-- r: double (nullable = true)
 |-- i: double (nullable = true)
 |-- z: double (nullable = true)
 |-- run_ID: integer (nullable = true)
 |-- rerun_ID: integer (nullable = true)
 |-- cam_col: integer (nullable = true)
 |-- field_ID: integer (nullable = true)
 |-- spec_obj_ID: double (nullable = true)
 |-- class: string (nullable = true)
 |-- redshift: double (nullable = true)
 |-- plate: integer (nullable = true)
 |-- MJD: integer (nullable = true)
 |-- fiber_ID: integer (nullable = true)

+---------------------+----------------+------------------+--------+--------+--------+--------+--------+------+--------+-------+--------+---------------------+------+------------+-----+-----+--------+
|obj_ID               |alpha           |delta             |u       |

In [4]:
# Print summary statistics for the loaded DataFrame's columns
# (show's default row limit and truncation are spelled out explicitly).
df.describe().show(n=20, truncate=True)

+-------+--------------------+-------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-----------------+--------+------------------+------------------+--------------------+------+------------------+------------------+------------------+-----------------+
|summary|              obj_ID|              alpha|            delta|                u|                 g|                 r|                 i|                 z|           run_ID|rerun_ID|           cam_col|          field_ID|         spec_obj_ID| class|          redshift|             plate|               MJD|         fiber_ID|
+-------+--------------------+-------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-----------------+--------+------------------+------------------+--------------------+------+------------------+------------------+------------------+-----------------+
|  coun

In [None]:
# NOTE(review): select() is called with no column arguments, so this cell
# looks unfinished — presumably new_df ends up with no columns at all.
# TODO: confirm which columns were meant to be kept and list them here.
new_df = df.select()