In [0]:
# Sample (name, age) records used to build the first DataFrame.
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 35),
]

In [0]:
# No schema is passed here; the following cells refer to the resulting
# columns by the auto-generated names "_1" and "_2".
df = spark.createDataFrame(data=data)
df.show()

In [0]:
# Project only the first auto-named column, then render it.
first_column = df.select("_1")
display(first_column)

In [0]:
# Replace the auto-generated column names with meaningful ones,
# one rename per call.
df_renamed = (
    df
    .withColumnRenamed("_1", "name")
    .withColumnRenamed("_2", "age")
)

In [0]:

# Render the DataFrame produced by the two column renames.
display(df_renamed)


In [0]:
from pyspark.sql import Row
# Build the same three people as Row objects with keyword fields, then
# create the DataFrame from them without passing a separate schema.
people = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
data2 = [Row(name=person_name, age=person_age) for person_name, person_age in people]
df2 = spark.createDataFrame(data2)

In [0]:
# Render the DataFrame that was built from Row objects.
df2.display()

In [0]:
# Plain tuples again, but this time the column names are supplied
# explicitly through the schema argument.
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
column_names = ["name", "age"]
df3 = spark.createDataFrame(data, schema=column_names)
df3.show()

In [0]:
# Column referenced by its name given as a plain string.
names_by_string = df3.select("name")
names_by_string.show()

In [0]:
# Column referenced through attribute access on the DataFrame.
names_by_attribute = df3.select(df3.name)
names_by_attribute.show()

In [0]:
# Column referenced through item access (square brackets) on the DataFrame.
names_by_item = df3.select(df3["name"])
names_by_item.show()

In [0]:
from pyspark.sql.functions import col
# Column referenced through the standalone col() function.
name_column = col("name")
df3.select(name_column).show()

In [0]:
# Select the "name" column under the output name "first_name".
first_name_column = col("name").alias("first_name")
df3.select(first_name_column).show()

In [0]:
# like() is used inside select() here, so the output is the result of the
# pattern test "A%" for each row rather than a filtered set of rows.
matches_a_pattern = col("name").like("A%")
df3.select(matches_a_pattern).show()

In [0]:
# Column names for the sample people table.
columns = ["id", "name", "age", "city"]

# Ten sample rows; each tuple lists its values in the same order as `columns`.
data = [
    (1, "Alice", 25, "Seattle"),
    (2, "Bob", 30, "Portland"),
    (3, "Charlie", 35, "San Francisco"),
    (4, "David", 28, "Los Angeles"),
    (5, "Eva", 22, "New York"),
    (6, "Frank", 33, "Chicago"),
    (7, "Grace", 27, "Houston"),
    (8, "Hannah", 31, "Boston"),
    (9, "Ian", 29, "Denver"),
    (10, "Jane", 26, "Miami"),
]

# Build the DataFrame from the rows and names, then print it.
df = spark.createDataFrame(data, columns)
df.show()

In [0]:
# Pick the columns at positions 1, 2 and 3 of df.columns, wrapping each
# name in col() before unpacking the list into select().
positional_columns = [col(df.columns[position]) for position in (1, 2, 3)]
df.select(*positional_columns).show()

In [0]:
# Slice the column-name list (positions 1 and 2; the end index is exclusive),
# select those columns and show only the first three rows.
sliced_names = df.columns[1:3]
df.select(sliced_names).limit(3).show()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, FloatType, StringType

# Import SparkFiles to access downloaded file
from pyspark import SparkFiles

# Source of the Iris dataset: a comma-separated file without a header row.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# DBFS location the dataset is staged at before it is read.
dbfs_path = "dbfs:/tmp/iris.data"
dbutils.fs.rm(dbfs_path, True)  # Remove if already exists to avoid conflicts

# dbutils.fs.cp is documented for DBFS, cloud-storage and file:/ paths, not
# for HTTP(S) URLs, so the download itself is done through SparkContext.
spark.sparkContext.addFile(url)

# Get the local (driver-side) path of the downloaded file
local_path = SparkFiles.get("iris.data")

# Stage the downloaded file in DBFS. The explicit file:// scheme is required:
# a path without a scheme is resolved against the default filesystem, which
# on Databricks is DBFS rather than the driver's local disk.
dbutils.fs.cp("file://" + local_path, dbfs_path)

# Define schema since the file has no header
schema = StructType([
    StructField("sepal_length", FloatType(), True),
    StructField("sepal_width", FloatType(), True),
    StructField("petal_length", FloatType(), True),
    StructField("petal_width", FloatType(), True),
    StructField("class", StringType(), True)
])

# Read the staged CSV from DBFS (reachable from every node) with no header
df = spark.read.csv(dbfs_path, schema=schema, header=False)

# Show the first five rows
df.show(5)

In [0]:
# Glob pattern: every CSV under /FileStore/tables whose name starts with "weather".
file1_path = "/FileStore/tables/weather*.csv"

# Treat the first line as a header and let Spark infer the column types.
weather_reader = spark.read.options(header=True, inferSchema=True)
df = weather_reader.csv(file1_path)
df.display()

In [0]:
# Three sample (id, street address, state code) records.
address = [
    (1, "14851 Jeffrey Rd", "DE"),
    (2, "43421 Margarita St", "NY"),
    (3, "13111 Siemon Ave", "CA"),
]
address_columns = ["id", "address", "state"]
df = spark.createDataFrame(address, address_columns)
display(df)

In [0]:
from pyspark.sql.functions import regexp_replace
# Replace matches of the pattern 'Rd' in the address column with 'Road'.
# The result is only shown; df itself is not reassigned.
expanded_address = regexp_replace('address', 'Rd', 'Road')
df.withColumn('address', expanded_address).show(truncate=False)

In [0]:
from pyspark.sql.functions import lit
# Add a "Country" column holding the same literal value on every row.
country_literal = lit("USA")
new_df = df.withColumn("Country", country_literal)

new_df.display()


In [0]:
# 1. Equal / Not equal
#df.filter(col("gender") == "M").show()
#df.filter(col("gender") != "M").show()
 
 
# 2. Greater than / Less than
#df.filter(col("age") > 30).show()
#df.filter(col("salary") < 50000).show()
 
 
# 3. Greater than or equal / Less than or equal
#df.filter(col("age") >= 25).show()
#df.filter(col("salary") <= 40000).show()
 
# 4. IN / NOT IN
#df.filter(col("name").isin("Alice", "Bob")).show()
#df.filter(~col("name").isin("Charlie", "Diana")).show()
 
# 5. BETWEEN
#df.filter(col("age").between(25, 30)).show()
 
# 6. NULL / NOT NULL
#df.filter(col("age").isNull()).show()
#df.filter(col("salary").isNotNull()).show()
 
 
# 7. Startswith / Endswith / Contains
#df.filter(col("name").startswith("A")).show()
#df.filter(col("name").endswith("a")).show()
#df.filter(col("name").contains("ar")).show()
 
# 8. Multiple conditions (AND / OR)
#df.filter((col("gender") == "F") & (col("age") > 25)).show()
#df.filter((col("age") < 25) | (col("salary") < 40000)).show()

In [0]:
# Location of the uploaded test CSV.
path = "/FileStore/tables/testdata_sunil.csv"

# NOTE(review): header=True turns the first line of the file into column
# names. A later cell renames columns called "5.1", "3.5", "1.4", "0.2" and
# "Iris-setosa", which suggests that first line is a data record rather than
# a real header - confirm against the file.
csv_reader = spark.read.options(header=True, inferSchema=True)
test_df = csv_reader.csv(path)
test_df.display()

In [0]:
# Inspect the schema Spark inferred for the loaded CSV (column names and types).
test_df.schema

In [0]:
# Intended names for the five CSV fields, in file order.
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]

# test_df was read with header=True, yet its column names ("5.1", "3.5",
# "1.4", "0.2", "Iris-setosa") are really the first data record: the file has
# no header line. Merely renaming those columns would leave that record
# missing from every downstream result. Re-read the file without a header so
# all records are kept, then apply the names positionally.
# toDF raises if the file does not have exactly len(columns) fields, instead
# of silently skipping a rename the way withColumnRenamed does.
test_df_renamed = (
    spark.read.csv(path, header=False, inferSchema=True)
    .toDF(*columns)
)

test_df_renamed.display()


In [0]:
# Derived column: sepal_area is sepal_length multiplied by sepal_width.
sepal_area = col("sepal_length") * col("sepal_width")
new_df = test_df_renamed.withColumn("sepal_area", sepal_area)

new_df.display()

In [0]:
# Keep only the rows whose class is 'Iris-setosa' and whose sepal_length exceeds 5.
is_setosa = col("class") == "Iris-setosa"
has_long_sepal = col("sepal_length") > 5

filter_df = new_df.filter(is_setosa & has_long_sepal)
filter_df.display()

In [0]:
# Combine all: keep 'Iris-virginica' rows with petal_width > 2, add a
# petal_area column, and show only class and petal_area.
is_wide_virginica = (col("class") == "Iris-virginica") & (col("petal_width") > 2)
filtered_df2 = new_df.filter(is_wide_virginica)

# petal_area is petal_length multiplied by petal_width.
petal_area = col("petal_length") * col("petal_width")
filtered_df23 = filtered_df2.withColumn("petal_area", petal_area)

filtered_df23.select("class", "petal_area").display()