In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, desc

# Build the Spark session, reusing an already-running one if present.
# NOTE(review): getOrCreate() has to start a JVM gateway process; a
# JAVA_GATEWAY_EXITED error raised here presumably means Java is missing or
# JAVA_HOME is misconfigured on this machine — confirm the environment.
spark = (
    SparkSession.builder
    .appName("Big Data Analysis")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

# Read the CSV input (point dataset_path at the real dataset before running).
# The first row supplies the column names and column types are inferred
# from the data rather than all being read as strings.
dataset_path = "path_to_your_large_dataset.csv"
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv(dataset_path)

# Print the column names and the types Spark inferred for them.
print("Dataset Schema:")
df.printSchema()

# Preview the first five rows.
print("Sample Data:")
df.show(n=5)

# Report the overall dimensions of the dataset.
print("\nDataset Overview:")
row_total = df.count()  # an action: Spark scans the data to count rows
column_total = len(df.columns)
print(f"Number of Rows: {row_total}")
print(f"Number of Columns: {column_total}")

# Example Analysis 1: Count null values in each column
# count(<column>) skips nulls while count("*") counts every row, so their
# difference is the number of nulls in that column. Counting the boolean
# isNull() expression directly would not work: that expression is never null
# itself, so the result would just be the total row count for every column.
print("\nNull Value Counts:")
null_counts = df.select(
    [(count("*") - count(col(c))).alias(c) for c in df.columns]
)
null_counts.show()

# Example Analysis 2: Compute averages of numerical columns (if applicable)
# df.dtypes yields (column_name, type_string) pairs. Besides int/float/double,
# Spark also reports integer columns as tinyint/smallint/bigint (schema
# inference uses bigint for values that do not fit in an int) and decimals as
# "decimal(p,s)", so all of those are treated as numeric here.
exact_numeric_types = {"tinyint", "smallint", "int", "bigint", "float", "double"}
numeric_columns = [
    name
    for name, dtype in df.dtypes
    if dtype in exact_numeric_types or dtype.startswith("decimal")
]
if numeric_columns:
    print("\nAverage Values:")
    df.select([avg(col(c)).alias(f"avg_{c}") for c in numeric_columns]).show()

# Example Analysis 3: the ten most common values of one chosen column.
column_to_analyze = "your_column_name"  # Replace with a column from your dataset
if column_to_analyze not in df.columns:
    # Nothing to analyze: the configured column is absent from this dataset.
    print(f"Column '{column_to_analyze}' not found in dataset.")
else:
    print(f"\nTop 10 Most Frequent Values in {column_to_analyze}:")
    # Count occurrences per distinct value, then rank from most to least common.
    value_counts = df.groupBy(column_to_analyze).count()
    ranked_counts = value_counts.orderBy(desc("count"))
    ranked_counts.show(10)

# Write the DataFrame out as CSV with a header row (optional step).
# NOTE(review): no save mode is set, so Spark's default applies; the write is
# expected to fail if output_path already exists — confirm this is intended.
output_path = "path_to_save_processed_data"
csv_writer = df.write.option("header", True)
csv_writer.csv(output_path)

# Shut the Spark session down now that all Spark work is finished.
spark.stop()

# Closing summary: fixed, human-readable takeaways printed after the analysis.
print("\nInsights:")
for insight_line in (
    "1. Analyzed dataset contains large-scale data processed efficiently with PySpark.",
    "2. Null value analysis and basic statistics provide an overview of data quality.",
    "3. Frequency analysis identifies key trends in categorical data.",
):
    print(insight_line)


PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.