<a href="https://colab.research.google.com/github/VictoriaUsman/Big-Data/blob/main/DataFrame_Operations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (an already-active one is reused,
# otherwise a new one is started) under the app name "DataFrame Ops".
spark = (
    SparkSession.builder
    .appName("DataFrame Ops")
    .getOrCreate()
)

In [10]:
# Sample rows in (id, name, age) order; the matching schema is built separately.
data = [(1, "Ian", 25), (2, "Tristan", 35), (3, "Cultura", 42)]

In [6]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType, BooleanType, FloatType


In [11]:
# Explicit schema for the people rows: three nullable columns
# (integer id, string name, integer age), declared in row order.
schema = (
    StructType()
    .add("id", IntegerType(), nullable=True)
    .add("name", StringType(), nullable=True)
    .add("age", IntegerType(), nullable=True)
)

In [12]:
# Build the people DataFrame from the in-memory rows using the explicit schema.
df = spark.createDataFrame(data, schema)

In [13]:
# Print the DataFrame as a text table; with only three rows, all of them are displayed.
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|    Ian| 25|
|  2|Tristan| 35|
|  3|Cultura| 42|
+---+-------+---+



Showing only the first 2 rows

In [14]:
# Limit the printed table to the first two rows.
df.show(n=2)

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|    Ian| 25|
|  2|Tristan| 35|
+---+-------+---+
only showing top 2 rows


In [15]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [17]:
# Column names as a plain Python list, in schema order (displayed as the cell's output).
df.columns

['id', 'name', 'age']

In [18]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# The string column `name` has no mean/stddev (shown as NULL), and its
# min/max are the lexicographically smallest/largest names.
df.describe().show()

+-------+---+-------+----------------+
|summary| id|   name|             age|
+-------+---+-------+----------------+
|  count|  3|      3|               3|
|   mean|2.0|   NULL|            34.0|
| stddev|1.0|   NULL|8.54400374531753|
|    min|  1|Cultura|              25|
|    max|  3|Tristan|              42|
+-------+---+-------+----------------+



In [23]:
# Project the DataFrame down to the single `id` column and print it.
df.select(df["id"]).show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
+---+



Filtering Data

In [24]:
# Keep only the rows whose age is strictly greater than 30, then print them.
df.filter(df["age"] > 30).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|Tristan| 35|
|  3|Cultura| 42|
+---+-------+---+



In [26]:
# `where` is an alias of `filter`: keep the rows whose name is exactly "Ian".
df.where(df["name"] == "Ian").show()

+---+----+---+
| id|name|age|
+---+----+---+
|  1| Ian| 25|
+---+----+---+



Sorting and Ordering

In [28]:
# Sort the rows by age, youngest first, and print the result.
df.orderBy("age", ascending=True).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|    Ian| 25|
|  2|Tristan| 35|
|  3|Cultura| 42|
+---+-------+---+



In [38]:
# Lookup rows in (id, Province) order; ids line up with the people rows.
data2 = [(1, "Victoria"), (2, "San_pedro"), (3, "Lagos")]



In [52]:
# Explicit schema for the province rows: nullable integer id and
# nullable string Province.
schema2 = (
    StructType()
    .add("id", IntegerType(), nullable=True)
    .add("Province", StringType(), nullable=True)
)

In [53]:
# Build the province DataFrame from the in-memory rows using its explicit schema.
df2 = spark.createDataFrame(data2, schema2)

In [54]:
# Print the province DataFrame to check the rows before joining.
df2.show()

+---+---------+
| id| Province|
+---+---------+
|  1| Victoria|
|  2|San_pedro|
|  3|    Lagos|
+---+---------+



In [55]:
# Inner equi-join on the shared `id` column; joining by column name keeps
# a single `id` column in the result instead of one from each side.
joined_df = df.join(df2, on="id", how="inner")

In [56]:
# Print the joined result: each person's row extended with the Province for the same id.
joined_df.show()

+---+-------+---+---------+
| id|   name|age| Province|
+---+-------+---+---------+
|  1|    Ian| 25| Victoria|
|  2|Tristan| 35|San_pedro|
|  3|Cultura| 42|    Lagos|
+---+-------+---+---------+



In [57]:
# Shut down the Spark session now that the walkthrough is finished.
# NOTE(review): cells using `spark` or the DataFrames will presumably fail after this
# until the session cell at the top is re-run.
spark.stop()