In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName('Practice').getOrCreate()

In [3]:
#StructField & StructType usage
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Department", StringType(), False)
])

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("StructType Example").getOrCreate()

data = [("Alice", 30, "HR"),
        ("Bob", 35, "Finance"),
        ("Charlie", 25, "IT")]  

df = spark.createDataFrame(data, schema=schema)

df.show()

+-------+---+----------+
|   Name|Age|Department|
+-------+---+----------+
|  Alice| 30|        HR|
|    Bob| 35|   Finance|
|Charlie| 25|        IT|
+-------+---+----------+



In [5]:
#nested structures 
StructName=StructType(
[
    StructField('First Name',StringType()),
    StructField('Last Name',StringType())
])
schema = StructType([
    StructField("Name", StructName, True),
    StructField("Age", IntegerType(), True),
    StructField("Department", StringType(), False)
])

In [6]:
data = [(("Varun","Kumar"), 30, "HR"),
        (("Agastya","Ram"), 35, "Finance"),
        (("Charlie",'Shenon'), 25, "IT")]  

df = spark.createDataFrame(data, schema=schema)

In [7]:
df.show()

+-----------------+---+----------+
|             Name|Age|Department|
+-----------------+---+----------+
|   {Varun, Kumar}| 30|        HR|
|   {Agastya, Ram}| 35|   Finance|
|{Charlie, Shenon}| 25|        IT|
+-----------------+---+----------+



In [8]:
df.printSchema()

root
 |-- Name: struct (nullable = true)
 |    |-- First Name: string (nullable = true)
 |    |-- Last Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Department: string (nullable = false)



In [25]:
#pass array to the column
data = [("Alice", [80,70] ),
        ("Bob", [100,90]),
        ("Charlie", [65,50])]
#ArrayType() function can also be used ,while creating schema
schema=['Name','Score']

df = spark.createDataFrame(data, schema=schema)
df.show()

+-------+---------+
|   Name|    Score|
+-------+---------+
|  Alice| [80, 70]|
|    Bob|[100, 90]|
|Charlie| [65, 50]|
+-------+---------+



In [26]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Score: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [29]:
import  pyspark.sql.functions as F
df.withColumn('Maths',F.col('Score')[0]).withColumn('Computer Science',F.col('Score')[1]).show()
#explode function and array_contains
df.withColumn('splitmarks',F.explode(F.col('Score'))).show()
df.withColumn('SCORED 100',F.array_contains(F.col('Score'),100)).show()

+-------+---------+-----+----------------+
|   Name|    Score|Maths|Computer Science|
+-------+---------+-----+----------------+
|  Alice| [80, 70]|   80|              70|
|    Bob|[100, 90]|  100|              90|
|Charlie| [65, 50]|   65|              50|
+-------+---------+-----+----------------+

+-------+---------+----------+
|   Name|    Score|splitmarks|
+-------+---------+----------+
|  Alice| [80, 70]|        80|
|  Alice| [80, 70]|        70|
|    Bob|[100, 90]|       100|
|    Bob|[100, 90]|        90|
|Charlie| [65, 50]|        65|
|Charlie| [65, 50]|        50|
+-------+---------+----------+

+-------+---------+----------+
|   Name|    Score|SCORED 100|
+-------+---------+----------+
|  Alice| [80, 70]|     false|
|    Bob|[100, 90]|      true|
|Charlie| [65, 50]|     false|
+-------+---------+----------+



In [14]:
data=[('Varun',40,60,50),('Karna',30,40,50)]
df2=spark.createDataFrame(data,schema=['name','m1','m2','m3'])
df2.show()

+-----+---+---+---+
| name| m1| m2| m3|
+-----+---+---+---+
|Varun| 40| 60| 50|
|Karna| 30| 40| 50|
+-----+---+---+---+



In [19]:
#combining columns into array
df2.withColumn('All',F.array(df2.m1,df2.m2,df2.m3)).withColumn('Total',df2.m1+df2.m2+df2.m3).show()

+-----+---+---+---+------------+-----+
| name| m1| m2| m3|         All|Total|
+-----+---+---+---+------------+-----+
|Varun| 40| 60| 50|[40, 60, 50]|  150|
|Karna| 30| 40| 50|[30, 40, 50]|  120|
+-----+---+---+---+------------+-----+



In [21]:
#maptype implementation 
from pyspark.sql.types import MapType,BooleanType,IntegerType
data=[('Varun',{'Age':23,'Weight':67,'Eligible':True}),('Naveen',{'Age':24,'Weight':55,'Eligible':True})]
schema_details = StructType([
    StructField("Age", IntegerType(), True),
    StructField("Weight", IntegerType(), True),
    StructField("Eligible", BooleanType(), False)
])
schema=StructType([
    StructField("Name", StringType(), True),
    StructField("Details", schema_details, True)
])

df=spark.createDataFrame(data,schema=schema)

In [22]:
df.show()

+------+--------------+
|  Name|       Details|
+------+--------------+
| Varun|{23, 67, true}|
|Naveen|{24, 55, true}|
+------+--------------+



In [23]:
df.show(truncate=False)

+------+--------------+
|Name  |Details       |
+------+--------------+
|Varun |{23, 67, true}|
|Naveen|{24, 55, true}|
+------+--------------+



In [26]:
#accessing key values using various method
df.withColumn('Age',df.Details['Age']).show()
df.select(df.Details.Weight).show()
df.select(df.Details.getItem('Age')).show()
#df.withColumn('Weight',df.Details.getItem('Weight')).show()
#df.withColumn('height',df.Details.getItem('height')).show()

+------+--------------+---+
|  Name|       Details|Age|
+------+--------------+---+
| Varun|{23, 67, true}| 23|
|Naveen|{24, 55, true}| 24|
+------+--------------+---+

+--------------+
|Details.Weight|
+--------------+
|            67|
|            55|
+--------------+

+-----------+
|Details.Age|
+-----------+
|         23|
|         24|
+-----------+



In [75]:
df.select(df.Details.Weight).show()

+---------------+
|Details[Weight]|
+---------------+
|             67|
|             55|
+---------------+



In [55]:
df.select('name',F.explode(df.Details)).show()

+------+------+-----+
|  name|   key|value|
+------+------+-----+
| Varun|Weight|   67|
| Varun|   Age|   23|
|Naveen|Weight|   55|
|Naveen|   Age|   24|
|Naveen|height|  170|
+------+------+-----+



In [70]:

df.select(F.map_keys(df.Details)).show(truncate=False)
df.select(F.map_values(df.Details)).show(truncate=False)

+---------------------+
|map_keys(Details)    |
+---------------------+
|[Weight, Age]        |
|[Weight, Age, height]|
+---------------------+

+-------------------+
|map_values(Details)|
+-------------------+
|[67, 23]           |
|[55, 24, 170]      |
+-------------------+



In [74]:
#column creation with specific value
df.withColumn('Diabetes',F.lit(1)).show()

+------+--------------------+--------+
|  Name|             Details|Diabetes|
+------+--------------------+--------+
| Varun|{Weight -> 67, Ag...|       1|
|Naveen|{Weight -> 55, Ag...|       1|
+------+--------------------+--------+



In [5]:
dfs=spark.read.csv('C:/Users/acer/Downloads/sample_data.csv',header=True,inferSchema=True)

In [6]:
dfs.show()

+---+------+------+---------+
| id|gender|salary|     name|
+---+------+------+---------+
|  1|     F| 56751| Jennifer|
|  2|     F| 60905|  Barbara|
|  3|     M| 90792|     John|
|  4|     M| 81428|   Robert|
|  5|     F|113324|    Sarah|
|  6|     F| 36599|  Barbara|
|  7|     M|115530|    James|
|  8|     F| 57885|    Susan|
|  9|     F| 54782|    Susan|
| 10|     F| 71905|Elizabeth|
| 11|     M| 62942|  Michael|
| 12|     F| 84231|    Karen|
| 13|     M| 87368|  Matthew|
| 14|     F| 52069|  Jessica|
| 15|     M|106021|   Daniel|
| 16|     F| 86462|    Linda|
| 17|     F| 60773|     Mary|
| 18|     F| 58825| Jennifer|
| 19|     F|116277| Patricia|
| 20|     F|102352|  Jessica|
+---+------+------+---------+
only showing top 20 rows



In [97]:
#when and otherwise function
dfs.select(F.when(dfs.gender == 'M', "Male").when(dfs.gender == 'F', "Female").otherwise("Unknown").alias("Gender")).show()

+------+
|Gender|
+------+
|Female|
|Female|
|  Male|
|  Male|
|Female|
|Female|
|  Male|
|Female|
|Female|
|Female|
|  Male|
|Female|
|  Male|
|Female|
|  Male|
|Female|
|Female|
|Female|
|Female|
|Female|
+------+
only showing top 20 rows



In [93]:
#replace compared to when function
dfs.replace({'M':'Male','F':'Female'}).show()

+---+------+------+---------+
| id|gender|salary|     name|
+---+------+------+---------+
|  1|Female| 56751| Jennifer|
|  2|Female| 60905|  Barbara|
|  3|  Male| 90792|     John|
|  4|  Male| 81428|   Robert|
|  5|Female|113324|    Sarah|
|  6|Female| 36599|  Barbara|
|  7|  Male|115530|    James|
|  8|Female| 57885|    Susan|
|  9|Female| 54782|    Susan|
| 10|Female| 71905|Elizabeth|
| 11|  Male| 62942|  Michael|
| 12|Female| 84231|    Karen|
| 13|  Male| 87368|  Matthew|
| 14|Female| 52069|  Jessica|
| 15|  Male|106021|   Daniel|
| 16|Female| 86462|    Linda|
| 17|Female| 60773|     Mary|
| 18|Female| 58825| Jennifer|
| 19|Female|116277| Patricia|
| 20|Female|102352|  Jessica|
+---+------+------+---------+
only showing top 20 rows



In [100]:
#casting
dfs.select(dfs.salary.cast('float').alias('new_salary')).show()


+----------+
|new_salary|
+----------+
|   56751.0|
|   60905.0|
|   90792.0|
|   81428.0|
|  113324.0|
|   36599.0|
|  115530.0|
|   57885.0|
|   54782.0|
|   71905.0|
|   62942.0|
|   84231.0|
|   87368.0|
|   52069.0|
|  106021.0|
|   86462.0|
|   60773.0|
|   58825.0|
|  116277.0|
|  102352.0|
+----------+
only showing top 20 rows



In [101]:
#print in descending order
dfs.orderBy('salary',ascending=False).show()

+---+------+------+---------+
| id|gender|salary|     name|
+---+------+------+---------+
| 50|     F|119995| Jennifer|
| 49|     M|117603|  William|
| 19|     F|116277| Patricia|
|  7|     M|115530|    James|
| 51|     F|113890|Elizabeth|
|  5|     F|113324|    Sarah|
| 57|     M|112184|   Joseph|
| 48|     F|110979| Jennifer|
| 34|     M|110711|  Matthew|
| 58|     M|106981|   Joseph|
| 55|     F|106328|    Linda|
| 15|     M|106021|   Daniel|
| 79|     F|105940|     Mary|
| 40|     M|105647|  Matthew|
| 98|     M|104359|    James|
| 67|     F|103663|  Barbara|
| 61|     M|103096|  William|
| 20|     F|102352|  Jessica|
| 74|     F|100488|  Barbara|
| 70|     F| 97623|    Susan|
+---+------+------+---------+
only showing top 20 rows



In [14]:
dfs.sort(dfs.salary.desc()).show()

+---+------+------+---------+
| id|gender|salary|     name|
+---+------+------+---------+
| 50|     F|119995| Jennifer|
| 49|     M|117603|  William|
| 19|     F|116277| Patricia|
|  7|     M|115530|    James|
| 51|     F|113890|Elizabeth|
|  5|     F|113324|    Sarah|
| 57|     M|112184|   Joseph|
| 48|     F|110979| Jennifer|
| 34|     M|110711|  Matthew|
| 58|     M|106981|   Joseph|
| 55|     F|106328|    Linda|
| 15|     M|106021|   Daniel|
| 79|     F|105940|     Mary|
| 40|     M|105647|  Matthew|
| 98|     M|104359|    James|
| 67|     F|103663|  Barbara|
| 61|     M|103096|  William|
| 20|     F|102352|  Jessica|
| 74|     F|100488|  Barbara|
| 70|     F| 97623|    Susan|
+---+------+------+---------+
only showing top 20 rows



In [102]:
#salary >100000
dfs[dfs.salary>100000].show()

+---+------+------+---------+
| id|gender|salary|     name|
+---+------+------+---------+
|  5|     F|113324|    Sarah|
|  7|     M|115530|    James|
| 15|     M|106021|   Daniel|
| 19|     F|116277| Patricia|
| 20|     F|102352|  Jessica|
| 34|     M|110711|  Matthew|
| 40|     M|105647|  Matthew|
| 48|     F|110979| Jennifer|
| 49|     M|117603|  William|
| 50|     F|119995| Jennifer|
| 51|     F|113890|Elizabeth|
| 55|     F|106328|    Linda|
| 57|     M|112184|   Joseph|
| 58|     M|106981|   Joseph|
| 61|     M|103096|  William|
| 67|     F|103663|  Barbara|
| 74|     F|100488|  Barbara|
| 79|     F|105940|     Mary|
| 98|     M|104359|    James|
+---+------+------+---------+



In [104]:
#count of gender
dfs.groupBy('gender').count().show()

+------+-----+
|gender|count|
+------+-----+
|     F|   57|
|     M|   43|
+------+-----+



In [7]:
#case sensitivity 
dfs[dfs.name.isin('michael')].show()
dfs[dfs.name.isin('Michael')].show()

+---+------+------+----+
| id|gender|salary|name|
+---+------+------+----+
+---+------+------+----+

+---+------+------+-------+
| id|gender|salary|   name|
+---+------+------+-------+
| 11|     M| 62942|Michael|
+---+------+------+-------+



In [12]:
#sunstring function
dfs.select(dfs.name.substr(1,5).alias('name_with_5_letters')).show()

+-------------------+
|name_with_5_letters|
+-------------------+
|              Jenni|
|              Barba|
|               John|
|              Rober|
|              Sarah|
|              Barba|
|              James|
|              Susan|
|              Susan|
|              Eliza|
|              Micha|
|              Karen|
|              Matth|
|              Jessi|
|              Danie|
|              Linda|
|               Mary|
|              Jenni|
|              Patri|
|              Jessi|
+-------------------+
only showing top 20 rows



In [13]:
#length function
import  pyspark.sql.functions as F
dfs.select(F.length('salary')).show()

+--------------+
|length(salary)|
+--------------+
|             5|
|             5|
|             5|
|             5|
|             6|
|             5|
|             6|
|             5|
|             5|
|             5|
|             5|
|             5|
|             5|
|             5|
|             6|
|             5|
|             5|
|             5|
|             6|
|             6|
+--------------+
only showing top 20 rows

