In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) a local Spark session; "local[*]" runs Spark inside this
# process using all available cores.
spark = SparkSession.builder.appName("ExampleApp").master("local[*]").getOrCreate()

try:
    banner = "=" * 50
    print(banner)
    print("1. INITIAL DATAFRAME")
    print(banner)

    # Build a small two-column DataFrame from in-memory (Name, Age) rows.
    data = [("Alice", 34), ("Bob", 45), ("Cathy", 29)]
    df = spark.createDataFrame(data, ["Name", "Age"])
    df.show()
finally:
    # Always release the session, even if the cell fails part-way. A leaked
    # session would be picked up by the next getOrCreate() call, so later
    # cells would not get the fresh session they expect.
    spark.stop()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/25 06:18:16 WARN Utils: Your hostname, Somnath, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/25 06:18:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/25 06:18:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


1. INITIAL DATAFRAME


                                                                                

+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
|Cathy| 29|
+-----+---+



In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session for this cell.
spark = SparkSession.builder.appName("ExampleApp").master("local[*]").getOrCreate()

try:
    # 1. Original data
    data = [("Alice", 34), ("Bob", 45), ("Cathy", 29)]
    df = spark.createDataFrame(data, ["Name", "Age"])

    # 2. New rows to append
    new_data = [("David", 22), ("Eve", 31)]
    new_df = spark.createDataFrame(new_data, ["Name", "Age"])

    # 3. Combine them.
    # unionByName() matches columns by name, whereas union() pairs them purely
    # by position. Both frames share the same schema here, so the result is
    # identical, but this stays correct if the column order ever differs.
    df_combined = df.unionByName(new_df)

    banner = "=" * 50
    print(banner)
    print("UPDATED DATAFRAME")
    print(banner)
    df_combined.show()
finally:
    # Stop the session even when something above raises, so no half-used
    # session is left behind for the following cells to inherit.
    spark.stop()

# The file will be saved in the `output_data` folder

In [6]:
import os
import shutil
import sys
from pyspark.sql import SparkSession

# Make Spark's Python workers use the same interpreter that is running this
# kernel. This replaces a hard-coded virtualenv path, which only works on one
# machine and breaks as soon as the environment is moved or renamed.
os.environ['PYSPARK_PYTHON'] = sys.executable

# Local session bound to the loopback address, with the local filesystem as
# the default filesystem.
spark = (
    SparkSession.builder
    .appName("ExampleApp")
    .master("local[*]")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.hadoop.fs.defaultFS", "file:///")
    .getOrCreate()
)

try:
    data = [("Alice", 34), ("Bob", 45)]
    df = spark.createDataFrame(data, ["Name", "Age"])

    # Path must start with file:// for some Spark/Hadoop versions on Linux
    output_path = "file:///home/somnath/my_vscode_project/output_data"

    # Spark writes a directory of part files (not a single CSV) at output_path;
    # "overwrite" replaces whatever an earlier run left there.
    df.write.mode("overwrite").csv(output_path, header=True)
    print("Successfully saved to output_data!")

except Exception as e:
    print(f"Detailed Error: {e}")
    # Re-raise so the cell is marked as failed and "Run All" stops here,
    # instead of continuing as if the data had been written.
    raise

finally:
    # Runs on success and on failure, so the session is never leaked.
    spark.stop()

                                                                                

Successfully saved to output_data!


In [None]:
import os
import shutil
import sys
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# 1. SETUP
# Make Spark's Python workers use the same interpreter that is running this
# kernel, rather than a hard-coded virtualenv path that only exists on one
# machine.
os.environ['PYSPARK_PYTHON'] = sys.executable

spark = (
    SparkSession.builder
    .appName("CustomNaming")
    .master("local[*]")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.hadoop.fs.defaultFS", "file:///")
    .getOrCreate()
)

try:
    # 2. CREATE DATA
    data = [("Alice", "Sales", 4500), ("Bob", "Sales", 3000), ("Cathy", "IT", 6000)]
    df = spark.createDataFrame(data, ["Name", "Dept", "Salary"])

    # 3. PATHS
    base_dir = "/home/somnath/my_vscode_project/output_data"
    temp_dir = os.path.join(base_dir, "temp_spark_out")
    final_file_name = "final_processed_data.csv"
    final_path = os.path.join(base_dir, final_file_name)

    # Clean up old data.
    # WARNING: this deletes EVERYTHING under base_dir, including output written
    # there by other cells, so keep nothing else of value in that folder.
    if os.path.exists(base_dir):
        shutil.rmtree(base_dir)
    os.makedirs(base_dir)

    # 4. SAVE TO TEMP FOLDER
    # coalesce(1) collapses the data to one partition so that only ONE part
    # file is created.
    df.coalesce(1).write.mode("overwrite").csv(temp_dir, header=True)

    # 5. RENAME THE PART FILE TO THE CHOSEN NAME
    # Spark names its output 'part-<id>.csv'; locate it inside the temp folder.
    part_files = sorted(
        name for name in os.listdir(temp_dir)
        if name.startswith("part-") and name.endswith(".csv")
    )
    if not part_files:
        # Fail with a clear message instead of an opaque
        # "list index out of range" from indexing an empty list.
        raise FileNotFoundError(f"No part-*.csv file was written to {temp_dir}")
    part_file = part_files[0]

    # Move and rename it into the main output_data folder.
    shutil.move(os.path.join(temp_dir, part_file), final_path)

    # 6. CLEAN UP
    # Remove whatever else Spark left in the temp folder.
    shutil.rmtree(temp_dir)

    print("-" * 30)
    print("SUCCESS!")
    print(f"File created at: {final_path}")
    print("-" * 30)

except Exception as e:
    print(f"Error: {e}")
    # Re-raise so the failure is visible (with traceback) and later cells do
    # not run against a missing output file.
    raise

finally:
    # Runs on success and on failure, so the session is never leaked.
    spark.stop()

                                                                                

------------------------------
SUCCESS!
File created at: /home/somnath/my_vscode_project/output_data/final_processed_data.csv
------------------------------
