In [65]:
import os

from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [66]:
# Get (or create) the Spark session for this notebook, running against the
# "local[*]" master under the application name "Practice".
spark = (
    SparkSession.builder
    .appName("Practice")
    .master("local[*]")
    .getOrCreate()
)

DATAFRAME MODE

In [67]:
# Sample records, one (id, name) tuple per row.
data = [
    (1, "Luca"),
    (2, "Aiko"),
    (3, "Carlos"),
    (4, "Sofia"),
    (5, "Ethan"),
]


In [68]:
# Column names, listed in the same order as the fields of each tuple in `data`.
schema = ["id", "name"]

In [69]:
# Build the base DataFrame; only column names are supplied, so the column
# types are inferred from the values in `data`.
df = spark.createDataFrame(data=data, schema=schema)

In [70]:
# "*" projects every column of the DataFrame.
(
    df
    .select("*")
    .show()
)

                                                                                

+---+------+
| id|  name|
+---+------+
|  1|  Luca|
|  2|  Aiko|
|  3|Carlos|
|  4| Sofia|
|  5| Ethan|
+---+------+



In [71]:
# Project a single column, referenced by its name as a string.
(
    df
    .select("name")
    .show()
)

+------+
|  name|
+------+
|  Luca|
|  Aiko|
|Carlos|
| Sofia|
| Ethan|
+------+



In [72]:
# Same single-column projection, this time using attribute-style column access.
(
    df
    .select(df.name)
    .show()
)

+------+
|  name|
+------+
|  Luca|
|  Aiko|
|Carlos|
| Sofia|
| Ethan|
+------+



In [73]:
# Project only "id" and rename it to "emp_id" in the result.
(
    df
    .select(col("id").alias("emp_id"))
    .show()
)

+------+
|emp_id|
+------+
|     1|
|     2|
|     3|
|     4|
|     5|
+------+



In [74]:
# One extra record to append to the data, built as a Row so that the field
# names (id, name) travel together with the values.
new_row = Row(id=6, name="James")


In [75]:
# 1. Wrap the extra Row in a one-row DataFrame (column names come from the Row's fields)
new_df = spark.createDataFrame([new_row])

# 2. Append it to the original 'df'. unionByName matches columns by name,
#    whereas union() pairs them purely by position and would silently mix up
#    values if the two frames ever listed their columns in a different order.
df_combined = df.unionByName(new_df)

# 3. Show the result
df_combined.show()

+---+------+
| id|  name|
+---+------+
|  1|  Luca|
|  2|  Aiko|
|  3|Carlos|
|  4| Sofia|
|  5| Ethan|
|  6| James|
+---+------+



                                                                                

In [76]:
# Project the "name" column of the combined frame via a col() expression.
(
    df_combined
    .select(col("name"))
    .show()
)

+------+
|  name|
+------+
|  Luca|
|  Aiko|
|Carlos|
| Sofia|
| Ethan|
| James|
+------+



FILTER AND WHERE METHODS

In [77]:
# Keep only the rows whose id is greater than 3.
(
    df
    .filter(col("id") > 3)
    .show()
)

+---+-----+
| id| name|
+---+-----+
|  4|Sofia|
|  5|Ethan|
+---+-----+



In [78]:
# Keep only the rows whose id is less than 3, this time via where().
(
    df
    .where(col("id") < 3)
    .show()
)

+---+----+
| id|name|
+---+----+
|  1|Luca|
|  2|Aiko|
+---+----+



In [79]:
# Every existing column plus a constant column "test_col" holding the literal "test".
(
    df
    .select("*", lit("test").alias("test_col"))
    .show()
)

+---+------+--------+
| id|  name|test_col|
+---+------+--------+
|  1|  Luca|    test|
|  2|  Aiko|    test|
|  3|Carlos|    test|
|  4| Sofia|    test|
|  5| Ethan|    test|
+---+------+--------+



In [80]:
# Append a constant column "new_col" using withColumn instead of select.
(
    df
    .withColumn("new_col", lit("new_value"))
    .show()
)

+---+------+---------+
| id|  name|  new_col|
+---+------+---------+
|  1|  Luca|new_value|
|  2|  Aiko|new_value|
|  3|Carlos|new_value|
|  4| Sofia|new_value|
|  5| Ethan|new_value|
+---+------+---------+



In [81]:
# Replace "id" with a string-cast version and inspect the resulting schema.
# The result is not assigned anywhere, so `df` itself is left as it was.
(
    df
    .withColumn("id", col("id").cast("string"))
    .printSchema()
)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



In [82]:
# Display `df` again: none of the transformations above were assigned back to
# it, so it still holds the original rows and columns.
df.show()

+---+------+
| id|  name|
+---+------+
|  1|  Luca|
|  2|  Aiko|
|  3|Carlos|
|  4| Sofia|
|  5| Ethan|
+---+------+



In [83]:
# Display the one-row DataFrame that was built from `new_row`.
new_df.show()

+---+-----+
| id| name|
+---+-----+
|  6|James|
+---+-----+



SQL USAGE

In [84]:
# Register `df` as the temporary view "people" so it can be queried through spark.sql.
df.createOrReplaceTempView(name="people")

In [85]:
# Run a SQL query against the "people" temp view and display the result.
(
    spark
    .sql(""" select * from people """)
    .show()
)

+---+------+
| id|  name|
+---+------+
|  1|  Luca|
|  2|  Aiko|
|  3|Carlos|
|  4| Sofia|
|  5| Ethan|
+---+------+



In [86]:
# Print the schema of `df`. The earlier string cast of "id" was applied to a
# derived frame only, so "id" is not a string here.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [87]:
# Column names of `df` as a plain Python list (shown as the cell's output).
df.columns

['id', 'name']

In [88]:
# Stop the Spark session held in `spark` now that the notebook is finished.
spark.stop()