In [2]:


# Sample records: one (name, age, country) tuple per person.
data = [
    ("John", 25, "USA"),
    ("Emma", 28, "Canada"),
    ("Mike", 22, "UK"),
]

# Build the DataFrame; column types are inferred from the tuple values,
# only the column names are supplied explicitly.
df = spark.createDataFrame(data, schema=["Name", "Age", "Country"])

# Display the rows as a text table.
df.show()

# Display the inferred column names, types and nullability.
df.printSchema()


+----+---+-------+
|Name|Age|Country|
+----+---+-------+
|John| 25|    USA|
|Emma| 28| Canada|
|Mike| 22|     UK|
+----+---+-------+

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Country: string (nullable = true)



In [3]:

# Project the frame down to the Name and Country columns only.
selected_df = df.select(df["Name"], df["Country"])
selected_df.show()


+----+-------+
|Name|Country|
+----+-------+
|John|    USA|
|Emma| Canada|
|Mike|     UK|
+----+-------+



In [4]:

# Keep only the rows whose Age is strictly greater than 25.
filtered_df = df.where(df["Age"] > 25)
filtered_df.show()


+----+---+-------+
|Name|Age|Country|
+----+---+-------+
|Emma| 28| Canada|
+----+---+-------+



In [5]:

# Average Age per Country; the aggregated column is named "avg(Age)".
grouped_df = df.groupBy("Country").avg("Age")
grouped_df.show()


+-------+--------+
|Country|avg(Age)|
+-------+--------+
|    USA|    25.0|
| Canada|    28.0|
|     UK|    22.0|
+-------+--------+



In [6]:

# Join two DataFrames on their shared "Name" column.
data2 = [("John", "Sales"),
         ("Emma", "Marketing"),
         ("Mike", "Finance")]

df2 = spark.createDataFrame(data2, ["Name", "Department"])

# Inner join: only names present in both frames are kept. Joining with
# on="Name" (a column name rather than an expression) yields a single
# Name column in the result instead of one per side.
joined_df = df.join(df2, on="Name", how="inner")
joined_df.show()

# NOTE: the SparkSession is intentionally NOT stopped in this cell.
# Calling spark.stop() here shuts down the session that later cells still
# rely on (they call spark.createDataFrame / spark.sql), so a top-to-bottom
# "Run All" would fail right after this point. If the session must be
# released explicitly, do so in the very last cell of the notebook.


+----+---+-------+----------+
|Name|Age|Country|Department|
+----+---+-------+----------+
|Emma| 28| Canada| Marketing|
|John| 25|    USA|     Sales|
|Mike| 22|     UK|   Finance|
+----+---+-------+----------+



In [1]:
# Three two-field string records plus the column names to attach to them.
data = [("1", "a"), ("2", "b"), ("3", "c")]
columns = ["_1", "_2"]

# Build the DataFrame from the records, naming the columns explicitly.
df = spark.createDataFrame(data, schema=columns)

# Persist the frame as a managed table, replacing any existing table
# of the same name.
df.write.saveAsTable("sample_hive_table", mode="overwrite")

# Read the table back through Spark SQL and display the rows.
df3 = spark.sql("SELECT _1, _2 FROM sample_hive_table")
df3.show()



StatementMeta(, e717a139-02e0-44e8-9770-02c85defc0e5, 3, Finished, Available, Finished)

+---+---+
| _1| _2|
+---+---+
|  1|  a|
|  2|  b|
|  3|  c|
+---+---+

