In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Obtain the Spark session for this example (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("AddConstantColumn")
    .getOrCreate()
)

# Column labels, followed by the three (name, age) records they describe.
columns = ["Name", "Age"]
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Cathy", 28),
]

# Build the DataFrame from the records and print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 25|
|  Bob| 30|
|Cathy| 28|
+-----+---+



In [6]:
# lit() wraps a Python value as a column expression, so every row receives
# the same "India" value in the new "Country" column.
df_with_constant = df.withColumn(
    colName="Country",
    col=lit("India"),
)

df_with_constant.show()

+-----+---+-------+
| Name|Age|Country|
+-----+---+-------+
|Alice| 25|  India|
|  Bob| 30|  India|
|Cathy| 28|  India|
+-----+---+-------+



In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Create a Spark session (an already-running session is reused)
spark = SparkSession.builder.appName("AddColumnFromList").getOrCreate()

# Sample DataFrame
data = [("Alice", 25), ("Bob", 30), ("Cathy", 28)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, columns)

# List of values to add as a new column; one entry per DataFrame row, in row order
country_list = ["India", "China", "UK"]

# Fail fast on a length mismatch. Without this check a list that is too short
# raises IndexError inside a Spark task (buried in a long executor traceback),
# and a list that is too long silently drops its extra values.
row_count = df.count()
if len(country_list) != row_count:
    raise ValueError(
        f"country_list has {len(country_list)} values but the DataFrame has {row_count} rows"
    )

# Align the list values with rows in the DataFrame using zipWithIndex:
# each element becomes (Row, index), and the index selects the matching list entry.
rdd_with_values = df.rdd.zipWithIndex().map(
    lambda row_index: Row(**row_index[0].asDict(), Country=country_list[row_index[1]])
)

# Create a new DataFrame with the additional column
df_with_country = spark.createDataFrame(rdd_with_values)

df_with_country.show()

+-----+---+-------+
| Name|Age|Country|
+-----+---+-------+
|Alice| 25|  India|
|  Bob| 30|  China|
|Cathy| 28|     UK|
+-----+---+-------+



df.rdd: This converts the DataFrame df into an RDD (Resilient Distributed Dataset). An RDD is a low-level distributed collection of objects in PySpark.
zipWithIndex(): This RDD operation returns a new RDD where each element is a tuple containing the original element and its index. For example, if the original RDD contains [(1, 2), (3, 4)], applying zipWithIndex() will result in [((1, 2), 0), ((3, 4), 1)].
map(lambda row_index: Row(**row_index[0].asDict(), Country=country_list[row_index[1]])): This operation processes each row from the zipWithIndex result:

    row_index[0]: This refers to the original Row object for that record (e.g., Row(Name="Alice", Age=25)).
    asDict(): Converts the Row object into a dictionary (e.g., {"Name": "Alice", "Age": 25}).
    Country=country_list[row_index[1]]: This passes an additional keyword argument (Country=country_name) alongside the unpacked dictionary. The value is taken from country_list, using the index row_index[1] to match the row with the corresponding country in the list; country_list must therefore have at least as many entries as the DataFrame has rows.
    Row(**...): This constructs a new Row object with the combined fields from the original data and the new country value.

The result is an RDD of Row objects, where each row now includes the original columns (Name, Age) and the new column (Country).

In [7]:
from pyspark.sql import SparkSession

# Get the Spark session for this example (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("DataFrameExample")
    .getOrCreate()
)

# Source records as plain tuples; the column names are supplied separately.
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Cathy", 28),
]
columns = ["Name", "Age"]

# Build the DataFrame from the tuples and print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 25|
|  Bob| 30|
|Cathy| 28|
+-----+---+



In [8]:
# Create a DataFrame from a list of dictionaries: the keys supply the column names.
# In the output recorded below, the columns are listed as Age, Name rather than
# in the order the keys were written.
data = [
    {"Name": name, "Age": age}
    for name, age in [("Alice", 25), ("Bob", 30), ("Cathy", 28)]
]
df = spark.createDataFrame(data)

df.show()

+---+-----+
|Age| Name|
+---+-----+
| 25|Alice|
| 30|  Bob|
| 28|Cathy|
+---+-----+



In [9]:
# Distribute three (name, age) tuples as an RDD.
rdd = spark.sparkContext.parallelize(
    [
        ("Alice", 25),
        ("Bob", 30),
        ("Cathy", 28),
    ]
)

# Turn the RDD into a DataFrame by naming its two fields, then print it.
df = rdd.toDF(schema=["Name", "Age"])
df.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 25|
|  Bob| 30|
|Cathy| 28|
+-----+---+



In [10]:
import pandas as pd

# Build a small pandas DataFrame, one list of values per column.
pandas_df = pd.DataFrame(
    data={
        "Name": ["Alice", "Bob", "Cathy"],
        "Age": [25, 30, 28],
    }
)

# Hand the pandas DataFrame to Spark to obtain a PySpark DataFrame, then print it.
df = spark.createDataFrame(pandas_df)
df.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 25|
|  Bob| 30|
|Cathy| 28|
+-----+---+



### RDD (Resilient Distributed Dataset)

In [11]:
from pyspark.sql import SparkSession

# Get the SparkSession for this example; the config entry is set as a
# placeholder key/value pair on the builder.
spark = (
    SparkSession.builder
    .appName("Python Spark create RDD example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Distribute three four-field tuples as an RDD.
rdd = spark.sparkContext.parallelize(
    [(1, 2, 3, 'a b c'), (4, 5, 6, 'd e f'), (7, 8, 9, 'g h i')]
)

# Name the four fields to obtain a DataFrame, then print it.
df = rdd.toDF(schema=['col1', 'col2', 'col3', 'col4'])
df.show()


+----+----+----+-----+
|col1|col2|col3| col4|
+----+----+----+-----+
|   1|   2|   3|a b c|
|   4|   5|   6|d e f|
|   7|   8|   9|g h i|
+----+----+----+-----+



In [14]:
# Distribute the pairs (1, 2), (3, 4), ..., (9, 10) as an RDD.
myData = spark.sparkContext.parallelize(
    [(odd, odd + 1) for odd in range(1, 10, 2)]
)

# collect() returns all elements to the driver as a Python list.
myData.collect()

[(1, 2), (3, 4), (5, 6), (7, 8), (9, 10)]