<a href="https://colab.research.google.com/github/PendlimarriSivasankar/PS/blob/main/Pyspark_Scenario_11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("colab pyspark").getOrCreate()


In [2]:
from pyspark.sql.functions import lead, col, expr
from pyspark.sql.window import Window

# Sample sensor readings: (sensor id, reading date, reading value).
data = [(1111, "2021-01-15", 10),
        (1111, "2021-01-16", 15),
        (1111, "2021-01-17", 30),
        (1112, "2021-01-15", 10),
        (1112, "2021-01-15", 20),
        (1112, "2021-01-15", 30)]

myschema = ["sensorid", "timestamp", "values"]

df = spark.createDataFrame(data, schema=myschema)
df.show()

# Per-sensor window ordered by reading value, so lead() looks at the next
# larger reading of the same sensor.
d1 = Window.partitionBy("sensorid").orderBy("values")

# Replace every reading with the gap to the following reading. The last
# reading of each sensor has no successor (lead() yields null) and is dropped.
finaldf = (
    df.withColumn("nextvalues", lead("values", 1).over(d1))
    .filter(col("nextvalues").isNotNull())
    .withColumn("values", expr("nextvalues-values"))
    .drop("nextvalues")
    .orderBy(col("sensorid"))
)

# show() only prints and returns None, so it is called on its own line;
# chaining it into the assignment would leave finaldf bound to None.
finaldf.show()

+--------+----------+------+
|sensorid| timestamp|values|
+--------+----------+------+
|    1111|2021-01-15|    10|
|    1111|2021-01-16|    15|
|    1111|2021-01-17|    30|
|    1112|2021-01-15|    10|
|    1112|2021-01-15|    20|
|    1112|2021-01-15|    30|
+--------+----------+------+

+--------+----------+------+
|sensorid| timestamp|values|
+--------+----------+------+
|    1111|2021-01-15|     5|
|    1111|2021-01-16|    15|
|    1112|2021-01-15|    10|
|    1112|2021-01-15|    10|
+--------+----------+------+



In [3]:
## WITHCOLUMN 1 -- flatten a one-level struct, first with select(), then with withColumn()

# Single JSON document whose "zeyoAddress" field is a nested object.
data="""{
    "id": 1,
    "institute": "zeyo",
    "trainer": "Sai",
    "zeyoAddress" :{
                        "permanentAddress" : "Hyderabad",
                        "temporaryAddress" : "chennai"
    }
}"""

# One RDD element holding the whole document; the JSON reader builds the schema from it.
json_rdd = spark.sparkContext.parallelize([data])
df = spark.read.json(json_rdd)

df.show()
df.printSchema()

# Leaf fields of the zeyoAddress struct, in the order they should appear.
address_fields = ["permanentAddress", "temporaryAddress"]

# Variant 1: project the nested leaves directly through their dotted paths.
flatdf = df.select("id", "institute", "trainer", *[f"zeyoAddress.{field}" for field in address_fields])

flatdf.show()
flatdf.printSchema()

# Variant 2: copy each leaf into a top-level column, then discard the struct.
withflatdf = df
for field in address_fields:
    withflatdf = withflatdf.withColumn(field, expr(f"zeyoAddress.{field}"))
withflatdf = withflatdf.drop("zeyoAddress")

withflatdf.show()
withflatdf.printSchema()


+---+---------+-------+--------------------+
| id|institute|trainer|         zeyoAddress|
+---+---------+-------+--------------------+
|  1|     zeyo|    Sai|{Hyderabad, chennai}|
+---+---------+-------+--------------------+

root
 |-- id: long (nullable = true)
 |-- institute: string (nullable = true)
 |-- trainer: string (nullable = true)
 |-- zeyoAddress: struct (nullable = true)
 |    |-- permanentAddress: string (nullable = true)
 |    |-- temporaryAddress: string (nullable = true)

+---+---------+-------+----------------+----------------+
| id|institute|trainer|permanentAddress|temporaryAddress|
+---+---------+-------+----------------+----------------+
|  1|     zeyo|    Sai|       Hyderabad|         chennai|
+---+---------+-------+----------------+----------------+

root
 |-- id: long (nullable = true)
 |-- institute: string (nullable = true)
 |-- trainer: string (nullable = true)
 |-- permanentAddress: string (nullable = true)
 |-- temporaryAddress: string (nullable = true)

+-

In [4]:
# WITH COLUMN 2 FULL CODE -- flatten a two-level struct (zeyoAddress -> user)

# JSON document with a struct nested inside another struct plus a sibling scalar.
data="""{
    "id": 1,
    "institute": "zeyo",
    "trainer": "Sai",
    "zeyoAddress" :{
            "user":{
                        "permanentAddress" : "Hyderabad",
                        "temporaryAddress" : "chennai"
                        },
            "doorno" : 4
    }
}"""

nested_rdd = spark.sparkContext.parallelize([data])
df = spark.read.json(nested_rdd)

df.show()
df.printSchema()

# Target column name -> dotted path of the nested leaf it is taken from.
# Insertion order fixes the order of the resulting columns.
leaf_paths = {
    "permanentAddress": "zeyoAddress.user.permanentAddress",
    "temporaryAddress": "zeyoAddress.user.temporaryAddress",
    "doorno": "zeyoAddress.doorno",
}

# Variant 1: select the top-level columns and every nested leaf by path.
flatdf = df.select("id", "institute", "trainer", *leaf_paths.values())

flatdf.show()
flatdf.printSchema()

# Variant 2: add one top-level column per leaf, then remove the original struct.
withflatdf = df
for column_name, path in leaf_paths.items():
    withflatdf = withflatdf.withColumn(column_name, expr(path))
withflatdf = withflatdf.drop("zeyoAddress")

withflatdf.show()
withflatdf.printSchema()

+---+---------+-------+--------------------+
| id|institute|trainer|         zeyoAddress|
+---+---------+-------+--------------------+
|  1|     zeyo|    Sai|{4, {Hyderabad, c...|
+---+---------+-------+--------------------+

root
 |-- id: long (nullable = true)
 |-- institute: string (nullable = true)
 |-- trainer: string (nullable = true)
 |-- zeyoAddress: struct (nullable = true)
 |    |-- doorno: long (nullable = true)
 |    |-- user: struct (nullable = true)
 |    |    |-- permanentAddress: string (nullable = true)
 |    |    |-- temporaryAddress: string (nullable = true)

+---+---------+-------+----------------+----------------+------+
| id|institute|trainer|permanentAddress|temporaryAddress|doorno|
+---+---------+-------+----------------+----------------+------+
|  1|     zeyo|    Sai|       Hyderabad|         chennai|     4|
+---+---------+-------+----------------+----------------+------+

root
 |-- id: long (nullable = true)
 |-- institute: string (nullable = true)
 |-- traine

In [5]:
####PANEL 1

!pip install -q pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Colab PySpark").getOrCreate()
sc = spark.sparkContext
from pyspark.sql.functions import *

###### PANEL 2 CODE

## WITHCOLUMN 1 -- same one-level flattening exercise, reading through `sc`

# JSON document with a nested "zeyoAddress" object (blank lines are part of the literal).
data="""{

    "id": 1,

    "institute": "zeyo",

    "trainer": "Sai",

    "zeyoAddress" :{

          "permanentAddress" : "Hyderabad",

          "temporaryAddress" : "chennai"

    }

}"""

df = spark.read.json(sc.parallelize([data]))

df.show()
df.printSchema()

# Dotted paths of the struct members that get promoted to top-level columns.
nested_paths = ["zeyoAddress.permanentAddress", "zeyoAddress.temporaryAddress"]

# Variant 1: plain projection of the scalar columns plus the nested paths.
flatdf = df.select("id", "institute", "trainer", *nested_paths)

flatdf.show()
flatdf.printSchema()

# Variant 2: one withColumn per nested path (named after the last path
# segment), followed by dropping the now-redundant struct column.
withflatdf = df
for nested_path in nested_paths:
    leaf_name = nested_path.rsplit(".", 1)[-1]
    withflatdf = withflatdf.withColumn(leaf_name, expr(nested_path))
withflatdf = withflatdf.drop("zeyoAddress")

withflatdf.show()
withflatdf.printSchema()


+---+---------+-------+--------------------+
| id|institute|trainer|         zeyoAddress|
+---+---------+-------+--------------------+
|  1|     zeyo|    Sai|{Hyderabad, chennai}|
+---+---------+-------+--------------------+

root
 |-- id: long (nullable = true)
 |-- institute: string (nullable = true)
 |-- trainer: string (nullable = true)
 |-- zeyoAddress: struct (nullable = true)
 |    |-- permanentAddress: string (nullable = true)
 |    |-- temporaryAddress: string (nullable = true)

+---+---------+-------+----------------+----------------+
| id|institute|trainer|permanentAddress|temporaryAddress|
+---+---------+-------+----------------+----------------+
|  1|     zeyo|    Sai|       Hyderabad|         chennai|
+---+---------+-------+----------------+----------------+

root
 |-- id: long (nullable = true)
 |-- institute: string (nullable = true)
 |-- trainer: string (nullable = true)
 |-- permanentAddress: string (nullable = true)
 |-- temporaryAddress: string (nullable = true)

+-

In [6]:
## COMPLEX ARRAY EXAMPLE 1 -- array of structs ("Actors"), each holding a nested "picture" struct

data="""{
  "country" : "US",
  "version" : "0.6",
  "Actors": [
    {
      "name": "Tom Cruise",
      "age": 56,
      "BornAt": "Syracuse, NY",
      "Birthdate": "July 3, 1962",
      "photo": "https://jsonformatter.org/img/tom-cruise.jpg",
      "wife": null,
      "weight": 67.5,
      "hasChildren": true,
      "hasGreyHair": false,
      "picture": {
                    "large": "https://randomuser.me/api/portraits/men/73.jpg",
                    "medium": "https://randomuser.me/api/portraits/med/men/73.jpg",
                    "thumbnail": "https://randomuser.me/api/portraits/thumb/men/73.jpg"
                }
    },
    {
      "name": "Robert Downey Jr.",
      "age": 53,
      "BornAt": "New York City, NY",
      "Birthdate": "April 4, 1965",
      "photo": "https://jsonformatter.org/img/Robert-Downey-Jr.jpg",
      "wife": "Susan Downey",
      "weight": 77.1,
      "hasChildren": true,
      "hasGreyHair": false,
      "picture": {
                    "large": "https://randomuser.me/api/portraits/men/78.jpg",
                    "medium": "https://randomuser.me/api/portraits/med/men/78.jpg",
                    "thumbnail": "https://randomuser.me/api/portraits/thumb/men/78.jpg"
                }
    }
  ]
}"""

actors_rdd = sc.parallelize([data])
df = spark.read.json(actors_rdd)

df.show()
df.printSchema()

# Step 1: explode the Actors array so each actor struct gets a row of its own.
flat1 = df.selectExpr("explode(Actors) as Actors", "country", "version")

flat1.show()
flat1.printSchema()

# Step 2: promote every leaf of the actor struct (including the nested
# picture.* fields) to a top-level column, in this order.
actor_leaves = [
    "Birthdate",
    "BornAt",
    "age",
    "hasChildren",
    "hasGreyHair",
    "name",
    "photo",
    "picture.large",
    "picture.medium",
    "picture.thumbnail",
    "weight",
    "wife",
]
flat2 = flat1.selectExpr(*[f"Actors.{leaf}" for leaf in actor_leaves], "country", "version")

flat2.show()
flat2.printSchema()


+--------------------+-------+-------+
|              Actors|country|version|
+--------------------+-------+-------+
|[{July 3, 1962, S...|     US|    0.6|
+--------------------+-------+-------+

root
 |-- Actors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Birthdate: string (nullable = true)
 |    |    |-- BornAt: string (nullable = true)
 |    |    |-- age: long (nullable = true)
 |    |    |-- hasChildren: boolean (nullable = true)
 |    |    |-- hasGreyHair: boolean (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- photo: string (nullable = true)
 |    |    |-- picture: struct (nullable = true)
 |    |    |    |-- large: string (nullable = true)
 |    |    |    |-- medium: string (nullable = true)
 |    |    |    |-- thumbnail: string (nullable = true)
 |    |    |-- weight: double (nullable = true)
 |    |    |-- wife: string (nullable = true)
 |-- country: string (nullable = true)
 |-- version: string (nullab

In [8]:
## COMPLEX ARRAY EXAMPLE 2 -- array of structs ("cars") that each contain another array ("models")

data="""{
   "name":"John",
   "age":30,
   "cars":[
      {
         "name":"Ford",
         "models":[
            "Fiesta",
            "Focus",
            "Mustang"
         ]
      },
      {
         "name":"BMW",
         "models":[
            "320",
            "X3",
            "X5"
         ]
      },
      {
         "name":"Fiat",
         "models":[
            "500",
            "Panda"
         ]
      }
   ]
}"""

cars_rdd = sc.parallelize([data])
df = spark.read.json(cars_rdd)

df.show()
df.printSchema()

# Step 1: one row per element of the outer "cars" array.
explode_cars = ["age", "explode(cars) as cars", "name"]
flat1 = df.selectExpr(*explode_cars)

flat1.show()
flat1.printSchema()

# Step 2: pull the struct members up; the car's name is aliased so it does
# not collide with the person's "name" column.
unpack_car = ["age", "cars.models", "cars.name as cars_name", "name"]
flat2 = flat1.selectExpr(*unpack_car)

flat2.show()
flat2.printSchema()

# Step 3: one row per element of the inner "models" array.
explode_models = ["age", "explode(models) as models", "cars_name", "name"]
finalflat = flat2.selectExpr(*explode_models)

finalflat.show()
finalflat.printSchema()


+---+--------------------+----+
|age|                cars|name|
+---+--------------------+----+
| 30|[{[Fiesta, Focus,...|John|
+---+--------------------+----+

root
 |-- age: long (nullable = true)
 |-- cars: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- models: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |-- name: string (nullable = true)

+---+--------------------+----+
|age|                cars|name|
+---+--------------------+----+
| 30|{[Fiesta, Focus, ...|John|
| 30|{[320, X3, X5], BMW}|John|
| 30|{[500, Panda], Fiat}|John|
+---+--------------------+----+

root
 |-- age: long (nullable = true)
 |-- cars: struct (nullable = true)
 |    |-- models: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- name: string (nullable = true)
 |-- name: string (nullable = true)

+---+--------------------+---------+----+
|age|           

In [9]:
# Flatten a document that mixes a struct ("address") with an array of structs ("contacts").
data="""{
  "id": 1,
  "name": "Sai",
  "address": {
    "street": "123 Main St",
    "city": "Hyderabad",
    "zip": "500081"
  },
  "contacts": [
    {
      "type": "email",
      "value": "sai@example.com"
    },
    {
      "type": "phone",
      "value": "+91-9876543210"
    }
  ]
}"""

contacts_rdd = sc.parallelize([data])
df = spark.read.json(contacts_rdd)

df.show()
df.printSchema()

# Overwrite the "contacts" array column with its exploded elements: one row per contact.
arrayflat = df.withColumn("contacts", expr("explode(contacts)"))

arrayflat.show()
arrayflat.printSchema()

# Dotted paths of the struct leaves to promote, grouped by parent column.
address_cols = [f"address.{leaf}" for leaf in ("city", "street", "zip")]
contact_cols = [f"contacts.{leaf}" for leaf in ("type", "value")]

finalflat = arrayflat.select(*address_cols, *contact_cols, "id", "name")

finalflat.show()
finalflat.printSchema()


+--------------------+--------------------+---+----+
|             address|            contacts| id|name|
+--------------------+--------------------+---+----+
|{Hyderabad, 123 M...|[{email, sai@exam...|  1| Sai|
+--------------------+--------------------+---+----+

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- zip: string (nullable = true)
 |-- contacts: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+--------------------+--------------------+---+----+
|             address|            contacts| id|name|
+--------------------+--------------------+---+----+
|{Hyderabad, 123 M...|{email, sai@examp...|  1| Sai|
|{Hyderabad, 123 M...|{phone, +91-98765...|  1| Sai|
+--------------------+--------------------+---+----+

In [10]:
import ssl
# Import the submodule explicitly: a bare `import urllib` does not load
# `urllib.request`, so the attribute lookup only works by accident when some
# other library has already imported it.
import urllib.request

# Download ten random users as a single JSON document.
# - The default SSL context verifies the server certificate (the private
#   ssl._create_unverified_context() helper switched verification off).
# - The `with` block closes the HTTP response once the body has been read.
# - The timeout keeps the cell from hanging forever on a stalled connection.
with urllib.request.urlopen(
    "https://randomuser.me/api/0.8/?results=10",
    context=ssl.create_default_context(),
    timeout=30,
) as response:
    urldata = response.read().decode('utf-8')

# Show the raw payload before it is parsed.
print(urldata)

# Parse the payload with Spark: one RDD element holding the whole document.
df   = spark.read.json(sc.parallelize([urldata]))

df.show()
df.printSchema()

# Explode the "results" array so every user gets a row of its own.
explodedf = df.withColumn("results" , expr("explode(results)"))

explodedf.show()
explodedf.printSchema()

# Promote every nested leaf under results.user to a top-level column,
# alongside the document-level nationality / seed / version fields.
finalflatten = explodedf.select(
    "nationality",
    "results.user.cell",
    "results.user.dob",
    "results.user.email",
    "results.user.gender",
    "results.user.location.city",
    "results.user.location.state",
    "results.user.location.street",
    "results.user.location.zip",
    "results.user.md5",
    "results.user.name.first",
    "results.user.name.last",
    "results.user.name.title",
    "results.user.password",
    "results.user.phone",
    "results.user.picture.large",
    "results.user.picture.medium",
    "results.user.picture.thumbnail",
    "results.user.registered",
    "results.user.salt",
    "results.user.sha1",
    "results.user.sha256",
    "results.user.username",
    "seed",
    "version"
)

finalflatten.show()
finalflatten.printSchema()

{
    "results": [
        {
            "user": {
                "gender": "male",
                "name": {
                    "title": "mr",
                    "first": "estéban",
                    "last": "gerard"
                },
                "location": {
                    "street": "2363 avenue debrousse",
                    "city": "villeurbanne",
                    "state": "pyrénées-orientales",
                    "zip": 50659
                },
                "email": "estéban.gerard@example.com",
                "username": "organicgorilla630",
                "password": "beast",
                "salt": "pzj2PTJF",
                "md5": "4d93f9deb941452b28ba52c624c41f3b",
                "sha1": "fc2192131ecd654a51ba44c98a7cc0f63983134e",
                "sha256": "3a0bbdc7fbc370c63cd0ff54d142a32044745e81cbf209bfade8e874ccfcb3b3",
                "registered": 1058442709,
                "dob": 415258686,
                "phone": "02-02-32-08-95",
        