In [1]:
# findspark has to run before pyspark is imported: findspark.init() is the call
# that makes the Spark installation's Python packages importable.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) a SparkSession on the "local" master under the app name "MyEmpApp".
spark = (
    SparkSession.builder
    .master("local")
    .appName("MyEmpApp")
    .getOrCreate()
)

In [3]:
# Display the SparkSession object created in the previous cell.
spark

In [4]:
# Display the SparkContext attached to this session.
spark.sparkContext

In [5]:
# Load the school CSV, treating its first line as the header row.
# No schema is supplied or inferred, so every column is read as a string.
DF1 = spark.read.option("header", True).csv("MySchool_v1.csv")

In [6]:
# Preview the first 5 rows of the raw data.
DF1.show(5)

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|
+------+-----+-----+------+--------+-------------+-----------+---------------+
only showing top 5 rows



In [7]:
# Print the column names and types of DF1 as a tree.
DF1.printSchema()

root
 |-- RollNo: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Class: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Phone No: string (nullable = true)
 |-- English Marks: string (nullable = true)
 |-- Maths Marks: string (nullable = true)
 |-- Computers Marks: string (nullable = true)



In [8]:
# Add a "Total" column holding the sum of the three subject marks.
# The mark columns are still strings here (see the schema printed above); the
# resulting "Total" column comes out as a double, which is why it shows 285.0.

from pyspark.sql.functions import col

DF1 = DF1.withColumn("Total",col("English Marks")+col("Maths Marks")+col("Computers Marks"))
DF1.show(2)

+------+-----+-----+------+--------+-------------+-----------+---------------+-----+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|Total|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|285.0|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|281.0|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----+
only showing top 2 rows



In [9]:
# Total number of rows (students) in DF1.
DF1.count()

12

In [10]:
# Number of students per gender. groupBy() and groupby() are aliases of each other.
DF1.groupBy("Gender").count().show()

+------+-----+
|Gender|count|
+------+-----+
|     F|    6|
|     M|    6|
+------+-----+



In [11]:
# Per-gender student count, sorted by Gender in ascending order.
(
    DF1.groupBy("Gender")
    .count()
    .orderBy(col("Gender").asc())
    .show()
)

+------+-----+
|Gender|count|
+------+-----+
|     F|    6|
|     M|    6|
+------+-----+



In [12]:
# Number of students in each class.
DF1.groupBy("Class").count().show()

+-----+-----+
|Class|count|
+-----+-----+
|    8|    4|
|    9|    3|
|   10|    5|
+-----+-----+



In [13]:
# Students per class, with the generated "count" column given a readable name.
(
    DF1.groupBy("Class")
    .count()
    .withColumnRenamed("count", "Number Of Students")
    .show()
)

+-----+------------------+
|Class|Number Of Students|
+-----+------------------+
|    8|                 4|
|    9|                 3|
|   10|                 5|
+-----+------------------+



In [14]:
# Students per class, sorted by Class in ascending order.
# Class is a string column in DF1, so the sort is textual: "10" comes before "8".
DF1.groupBy("Class").count().orderBy(col("Class"),ascending=True).show()

+-----+-----+
|Class|count|
+-----+-----+
|   10|    5|
|    8|    4|
|    9|    3|
+-----+-----+



#### In the above output, the Class values are not in proper ascending order

#### Can you say why the Class column values are not in proper ascending order?

#### Ans: Check the schema once and look at the data type of the Class column — it is a string, so "10" sorts before "8"

In [15]:
# List (column name, type) pairs for DF1.
DF1.dtypes

[('RollNo', 'string'),
 ('Name', 'string'),
 ('Class', 'string'),
 ('Gender', 'string'),
 ('Phone No', 'string'),
 ('English Marks', 'string'),
 ('Maths Marks', 'string'),
 ('Computers Marks', 'string'),
 ('Total', 'double')]

In [16]:
# Re-read the same CSV, this time asking Spark to infer each column's type,
# then list the resulting (column name, type) pairs.
DF2 = spark.read.csv(
    path="MySchool_v1.csv",
    header=True,
    inferSchema=True,
)
DF2.dtypes

[('RollNo', 'string'),
 ('Name', 'string'),
 ('Class', 'int'),
 ('Gender', 'string'),
 ('Phone No', 'int'),
 ('English Marks', 'int'),
 ('Maths Marks', 'int'),
 ('Computers Marks', 'int')]

In [17]:
# Students per class, sorted by Class. Class is an int in DF2, so the order is numeric (8, 9, 10).
DF2.groupBy("Class").count().orderBy(col("Class"),ascending=True).show() # pass ascending=False for descending order

+-----+-----+
|Class|count|
+-----+-----+
|    8|    4|
|    9|    3|
|   10|    5|
+-----+-----+



In [18]:
# Students per class under a readable column name, smallest class first.
(
    DF2.groupBy("Class")
    .count()
    .withColumnRenamed("count", "Number Of Students")
    .orderBy(col("Number Of Students").asc())
    .show()
)

+-----+------------------+
|Class|Number Of Students|
+-----+------------------+
|    9|                 3|
|    8|                 4|
|   10|                 5|
+-----+------------------+



In [19]:
# All students, lowest English mark first.
DF2.orderBy(col("English Marks"),ascending=True).show()

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|
|  10_3| Taki|   10|     F|  912354|           81|         88|             76|
|  10_1| Paki|   10|     M|  912352|           82|         94|             84|
|  10_2|Bakhi|   10|     M|  912353|           87|         86|             90|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|
|   9_2| Jaki|    9|     M|  912350|           92|         82|             94|
|   9_3| Maki|    9|     F|  912351|           93|         83|             88|
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|
|  10_5|Gakhi|   10|     F|  912356|           95|  

In [22]:
# Sort by Class, then by English Marks within each class.
DF2.orderBy(col("Class"),col("English Marks"),ascending=False).show() # a single ascending=False applies to both columns, so both are descending

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|  10_4| Laki|   10|     F|  912355|           98|         96|             96|
|  10_5|Gakhi|   10|     F|  912356|           95|         95|             95|
|  10_2|Bakhi|   10|     M|  912353|           87|         86|             90|
|  10_1| Paki|   10|     M|  912352|           82|         94|             84|
|  10_3| Taki|   10|     F|  912354|           81|         88|             76|
|   9_3| Maki|    9|     F|  912351|           93|         83|             88|
|   9_2| Jaki|    9|     M|  912350|           92|         82|             94|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|
|   8_1|Nikhi|    8|     M|  912345|           95|  

In [23]:
# Show DF2 in its original row order for comparison with the sorted views.
DF2.show()

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|
|   9_2| Jaki|    9|     M|  912350|           92|         82|             94|
|   9_3| Maki|    9|     F|  912351|           93|         83|             88|
|  10_1| Paki|   10|     M|  912352|           82|         94|             84|
|  10_2|Bakhi|   10|     M|  912353|           87|         86|             90|
|  10_3| Taki|   10|     F|  912354|           81|  

In [24]:
# Mixed sort directions: Class descending, English Marks ascending within each class.
DF2.orderBy("Class", "English Marks", ascending=[False, True]).show()

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|  10_3| Taki|   10|     F|  912354|           81|         88|             76|
|  10_1| Paki|   10|     M|  912352|           82|         94|             84|
|  10_2|Bakhi|   10|     M|  912353|           87|         86|             90|
|  10_5|Gakhi|   10|     F|  912356|           95|         95|             95|
|  10_4| Laki|   10|     F|  912355|           98|         96|             96|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|
|   9_2| Jaki|    9|     M|  912350|           92|         82|             94|
|   9_3| Maki|    9|     F|  912351|           93|         83|             88|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|
|   8_3|Sakhi|    8|     F|  912347|           79|  

In [26]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def totalFunc(a,b,c):
    """Return the sum of three marks.

    Only ``+`` is applied to the arguments, so the function works on plain
    Python numbers (how a Spark UDF calls it, once per row) as well as on
    Column objects.

    If any argument is None the result is None instead of a TypeError, so a
    missing mark produces a null total rather than failing the job.

    Example: totalFunc(10, 20, 30) returns 60.
    """
    # Identity checks only: `==` or `in` would build a Column expression
    # instead of a bool when the arguments are Columns.
    if a is None or b is None or c is None:
        return None
    return a+b+c

# Wrap the plain Python function as a Spark UDF whose result is an integer column.
UDFtotal = udf(totalFunc, IntegerType())

# Apply the UDF itself (not the bare Python function) so that the wrapper
# defined above is what actually computes "Total Marks".
DF3 = DF2.withColumn(
    "Total Marks",
    UDFtotal(col("English Marks"), col("Maths Marks"), col("Computers Marks")),
)
DF3.show(2)

+------+-----+-----+------+--------+-------------+-----------+---------------+-----------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|Total Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|        285|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|        281|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----------+
only showing top 2 rows



In [28]:
from pyspark.sql.functions import udf

def genderFunc(genderType):
    """Expand a gender code into its display label.

    "M" -> "Male", "F" -> "FeMale", anything else (including None) -> "N/A".

    Labels that are already expanded ("Male", "FeMale") are returned unchanged.
    The cell that applies this function writes its result back into the same
    "Gender" column of DF2, so without this pass-through a second run of that
    cell would turn every row into "N/A".
    """
    labels = {"M": "Male", "F": "FeMale"}
    if genderType in labels.values():
        return genderType
    return labels.get(genderType, "N/A")
    
    
# No return type is passed, so udf() uses its default, StringType().
UDFgender = udf(genderFunc)
# Overwrite the "Gender" column of DF2 with the expanded labels, then preview 6 rows.
DF2 = DF2.withColumn("Gender",UDFgender(col("Gender")))
DF2.show(6)

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|  Male|  912345|           95|        100|             90|
|   8_2| Akhi|    8|  Male|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|FeMale|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|  Male|  912348|           66|         95|             98|
|   9_1| Kaki|    9|FeMale|  912349|           88|         85|             93|
|   9_2| Jaki|    9|  Male|  912350|           92|         82|             94|
+------+-----+-----+------+--------+-------------+-----------+---------------+
only showing top 6 rows



In [29]:
# Only the class 10 students. where() is an alias of filter().
DF2.where(col("Class") == 10).show()

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|  10_1| Paki|   10|  Male|  912352|           82|         94|             84|
|  10_2|Bakhi|   10|  Male|  912353|           87|         86|             90|
|  10_3| Taki|   10|FeMale|  912354|           81|         88|             76|
|  10_4| Laki|   10|FeMale|  912355|           98|         96|             96|
|  10_5|Gakhi|   10|FeMale|  912356|           95|         95|             95|
+------+-----+-----+------+--------+-------------+-----------+---------------+



In [30]:
DF2.filter(col("Class")<10).show() # students in classes below 10

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|  Male|  912345|           95|        100|             90|
|   8_2| Akhi|    8|  Male|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|FeMale|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|  Male|  912348|           66|         95|             98|
|   9_1| Kaki|    9|FeMale|  912349|           88|         85|             93|
|   9_2| Jaki|    9|  Male|  912350|           92|         82|             94|
|   9_3| Maki|    9|FeMale|  912351|           93|         83|             88|
+------+-----+-----+------+--------+-------------+-----------+---------------+



In [32]:
# Students who are not in class 10, written as a negated equality
# (equivalent to col("Class") != 10).
DF2.filter(~(col("Class") == 10)).show()

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|  Male|  912345|           95|        100|             90|
|   8_2| Akhi|    8|  Male|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|FeMale|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|  Male|  912348|           66|         95|             98|
|   9_1| Kaki|    9|FeMale|  912349|           88|         85|             93|
|   9_2| Jaki|    9|  Male|  912350|           92|         82|             94|
|   9_3| Maki|    9|FeMale|  912351|           93|         83|             88|
+------+-----+-----+------+--------+-------------+-----------+---------------+



In [76]:
# Each comparison is parenthesised because | binds more tightly than == in Python.
DF2.filter((col("Class")==10) | (col("Class")==9)).show()  # students from class 9 or class 10

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   9_1| Kaki|    9|FeMale|  912349|           88|         85|             93|
|   9_2| Jaki|    9|  Male|  912350|           92|         82|             94|
|   9_3| Maki|    9|FeMale|  912351|           93|         83|             88|
|  10_1| Paki|   10|  Male|  912352|           82|         94|             84|
|  10_2|Bakhi|   10|  Male|  912353|           87|         86|             90|
|  10_3| Taki|   10|FeMale|  912354|           81|         88|             76|
|  10_4| Laki|   10|FeMale|  912355|           98|         96|             96|
|  10_5|Gakhi|   10|FeMale|  912356|           95|         95|             95|
+------+-----+-----+------+--------+-------------+-----------+---------------+



In [33]:
# Students whose English mark is exactly 88 or 92.
DF2.filter(col("English Marks").isin(88,92)).show() 

+------+----+-----+------+--------+-------------+-----------+---------------+
|RollNo|Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+----+-----+------+--------+-------------+-----------+---------------+
|   9_1|Kaki|    9|FeMale|  912349|           88|         85|             93|
|   9_2|Jaki|    9|  Male|  912350|           92|         82|             94|
+------+----+-----+------+--------+-------------+-----------+---------------+



In [34]:
# Class, name and English mark of the students who scored exactly 88 or 92 in English.
(
    DF2.select("Class", "Name", "English Marks")
    .filter(col("English Marks").isin(88, 92))
    .show()
)

+-----+----+-------------+
|Class|Name|English Marks|
+-----+----+-------------+
|    9|Kaki|           88|
|    9|Jaki|           92|
+-----+----+-------------+



In [35]:
# The complement of the previous query: everyone whose English mark is neither 88 nor 92.
(
    DF2.select("Class", "Name", "English Marks")
    .filter(~col("English Marks").isin(88, 92))
    .show()
)

+-----+-----+-------------+
|Class| Name|English Marks|
+-----+-----+-------------+
|    8|Nikhi|           95|
|    8| Akhi|           96|
|    8|Sakhi|           79|
|    8|Rakhi|           66|
|    9| Maki|           93|
|   10| Paki|           82|
|   10|Bakhi|           87|
|   10| Taki|           81|
|   10| Laki|           98|
|   10|Gakhi|           95|
+-----+-----+-------------+



In [36]:
# Rows whose Class value contains 8; on this data that is exactly the class 8 students.
# NOTE(review): contains() is documented as a string match, so on other data it would
# presumably also keep classes such as 18 or 80 — prefer col("Class") == 8 for an exact match.
DF2.filter(col("Class").contains(8)).show()

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|  Male|  912345|           95|        100|             90|
|   8_2| Akhi|    8|  Male|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|FeMale|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|  Male|  912348|           66|         95|             98|
+------+-----+-----+------+--------+-------------+-----------+---------------+



In [37]:
# Class 8 students whose name contains "ak".
# The class is compared with == rather than contains(): contains() is a substring
# match, which on the integer Class column would also accept values such as 18 or 80.
DF2.filter(
    (col("Class") == 8)
    &
    (col("Name").contains("ak"))
).show()

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_3|Sakhi|    8|FeMale|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|  Male|  912348|           66|         95|             98|
+------+-----+-----+------+--------+-------------+-----------+---------------+



In [38]:
# Class 8 students with a Maths mark of at most 100.
# The class is compared with == rather than contains(): contains() is a substring
# match, which on the integer Class column would also accept values such as 18 or 80.
DF2.filter(
    (col("Class") == 8)
    &
    (col("Maths Marks") <= 100)
).show()

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|  Male|  912345|           95|        100|             90|
|   8_2| Akhi|    8|  Male|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|FeMale|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|  Male|  912348|           66|         95|             98|
+------+-----+-----+------+--------+-------------+-----------+---------------+



In [39]:
# Students whose name begins with "N".
DF2.filter(col("Name").startswith("N")).show()

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|  Male|  912345|           95|        100|             90|
+------+-----+-----+------+--------+-------------+-----------+---------------+



In [40]:
# Students whose name ends with "aki".
DF2.filter(col("Name").endswith("aki")).show()

+------+----+-----+------+--------+-------------+-----------+---------------+
|RollNo|Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+----+-----+------+--------+-------------+-----------+---------------+
|   9_1|Kaki|    9|FeMale|  912349|           88|         85|             93|
|   9_2|Jaki|    9|  Male|  912350|           92|         82|             94|
|   9_3|Maki|    9|FeMale|  912351|           93|         83|             88|
|  10_1|Paki|   10|  Male|  912352|           82|         94|             84|
|  10_3|Taki|   10|FeMale|  912354|           81|         88|             76|
|  10_4|Laki|   10|FeMale|  912355|           98|         96|             96|
+------+----+-----+------+--------+-------------+-----------+---------------+



In [43]:
DF3.filter(col("Name").like("%")).show() # returns every row, because % matches zero, one, or many characters

+------+-----+-----+------+--------+-------------+-----------+---------------+-----------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|Total Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|        285|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|        281|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|        259|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|        259|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|        266|
|   9_2| Jaki|    9|     M|  912350|           92|         82|             94|        268|
|   9_3| Maki|    9|     F|  912351|           93|         83|             88|        264|
|  10_1| Paki|   10|     M|  912352|           82|         94|             84|        260|

In [44]:
DF3.filter(col("Name").like("_k%")).show() # exactly one character, then k, then anything

+------+----+-----+------+--------+-------------+-----------+---------------+-----------+
|RollNo|Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|Total Marks|
+------+----+-----+------+--------+-------------+-----------+---------------+-----------+
|   8_2|Akhi|    8|     M|  912346|           96|        100|             85|        281|
+------+----+-----+------+--------+-------------+-----------+---------------+-----------+



In [45]:
DF3.filter(col("Name").like("__k%")).show() # exactly two characters, then k, then anything

+------+-----+-----+------+--------+-------------+-----------+---------------+-----------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|Total Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|        285|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|        259|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|        259|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|        266|
|   9_2| Jaki|    9|     M|  912350|           92|         82|             94|        268|
|   9_3| Maki|    9|     F|  912351|           93|         83|             88|        264|
|  10_1| Paki|   10|     M|  912352|           82|         94|             84|        260|
|  10_2|Bakhi|   10|     M|  912353|           87|         86|             90|        263|

In [46]:
DF3.filter(col("Name").like("B_k%")).show() # B, then any single character, then k, then anything

+------+-----+-----+------+--------+-------------+-----------+---------------+-----------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|Total Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----------+
|  10_2|Bakhi|   10|     M|  912353|           87|         86|             90|        263|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----------+



In [47]:
# Number of students in each class.
DF3.groupBy("Class").count().show()

+-----+-----+
|Class|count|
+-----+-----+
|    9|    3|
|    8|    4|
|   10|    5|
+-----+-----+



In [48]:
# Per-class averages. avg() is called without column names, so every numeric column is
# averaged — the output contains avg(Class) and avg(Phone No) alongside the marks.
DF3.groupby(col("Class")).avg().show()
# DF3.groupby(col("Class")).mean().show() # mean() gives the same result as avg()

+-----+----------+-------------+------------------+-----------------+--------------------+----------------+
|Class|avg(Class)|avg(Phone No)|avg(English Marks)| avg(Maths Marks)|avg(Computers Marks)|avg(Total Marks)|
+-----+----------+-------------+------------------+-----------------+--------------------+----------------+
|    9|       9.0|     912350.0|              91.0|83.33333333333333|   91.66666666666667|           266.0|
|    8|       8.0|     912346.5|              84.0|            98.75|               88.25|           271.0|
|   10|      10.0|     912354.0|              88.6|             91.8|                88.2|           268.6|
+-----+----------+-------------+------------------+-----------------+--------------------+----------------+



In [49]:
# Per-class minimum of every numeric column (min() is called without column names).
DF3.groupby(col("Class")).min().show()

+-----+----------+-------------+------------------+----------------+--------------------+----------------+
|Class|min(Class)|min(Phone No)|min(English Marks)|min(Maths Marks)|min(Computers Marks)|min(Total Marks)|
+-----+----------+-------------+------------------+----------------+--------------------+----------------+
|    9|         9|       912349|                88|              82|                  88|             264|
|    8|         8|       912345|                66|              95|                  80|             259|
|   10|        10|       912352|                81|              86|                  76|             245|
+-----+----------+-------------+------------------+----------------+--------------------+----------------+



In [50]:
# Per-class maximum of every numeric column (max() is called without column names).
DF3.groupby(col("Class")).max().show()

+-----+----------+-------------+------------------+----------------+--------------------+----------------+
|Class|max(Class)|max(Phone No)|max(English Marks)|max(Maths Marks)|max(Computers Marks)|max(Total Marks)|
+-----+----------+-------------+------------------+----------------+--------------------+----------------+
|    9|         9|       912351|                93|              85|                  94|             268|
|    8|         8|       912348|                96|             100|                  98|             285|
|   10|        10|       912356|                98|              96|                  96|             290|
+-----+----------+-------------+------------------+----------------+--------------------+----------------+



In [51]:
# we have sum() also to find the sum of column values
# DF3.groupby(col("Class")).sum().show()

In [52]:
# List (column name, type) pairs for DF3; "Total Marks" is an int column.
DF3.dtypes

[('RollNo', 'string'),
 ('Name', 'string'),
 ('Class', 'int'),
 ('Gender', 'string'),
 ('Phone No', 'int'),
 ('English Marks', 'int'),
 ('Maths Marks', 'int'),
 ('Computers Marks', 'int'),
 ('Total Marks', 'int')]

In [53]:
# Show DF3, i.e. the student rows plus the "Total Marks" column.
DF3.show()

+------+-----+-----+------+--------+-------------+-----------+---------------+-----------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|Total Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|        285|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|        281|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|        259|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|        259|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|        266|
|   9_2| Jaki|    9|     M|  912350|           92|         82|             94|        268|
|   9_3| Maki|    9|     F|  912351|           93|         83|             88|        264|
|  10_1| Paki|   10|     M|  912352|           82|         94|             84|        260|

In [54]:
# Aggregate over the whole DataFrame (no grouping): the smallest Class value.
DF3.agg({"Class": "min"}).show()

+----------+
|min(Class)|
+----------+
|         8|
+----------+



In [55]:

# Dictionary form of agg(): each key is a column name, each value an aggregate function name.
# A Python dict keeps only one value per key, so this form allows one aggregate per column.
DF3.groupby("Class").agg(
    {
            "Maths Marks":"max",
            "English Marks":"max",
            "Computers Marks":"max",            
            "Total Marks": "min", # not in the output: overwritten by the duplicate key below
            "Total Marks": "max" # in the output: when a dict literal repeats a key, the last key/value pair wins
        
    }

).show()

+-----+--------------------+----------------+------------------+----------------+
|Class|max(Computers Marks)|max(Total Marks)|max(English Marks)|max(Maths Marks)|
+-----+--------------------+----------------+------------------+----------------+
|    9|                  94|             268|                93|              85|
|    8|                  98|             285|                96|             100|
|   10|                  96|             290|                98|              96|
+-----+--------------------+----------------+------------------+----------------+



In [56]:
# NOTE: sum, max and min imported here replace Python's built-in functions of the
# same names for the rest of the notebook.
from pyspark.sql.functions import sum,avg,max,min,mean,count

In [57]:
# Per-class minimum and maximum of the total and the English marks.
# min/max are the pyspark.sql.functions versions imported in the previous cell.
DF3.groupby("Class").agg(
    min("Total Marks"),
    max("Total Marks"),
    min("English Marks"),
    max("English Marks"),
).show()

+-----+----------------+----------------+------------------+------------------+
|Class|min(Total Marks)|max(Total Marks)|min(English Marks)|max(English Marks)|
+-----+----------------+----------------+------------------+------------------+
|    9|             264|             268|                88|                93|
|    8|             259|             285|                66|                96|
|   10|             245|             290|                81|                98|
+-----+----------------+----------------+------------------+------------------+



In [58]:
# Per-class average, count and sum of the English marks.
# avg/count/sum are the pyspark.sql.functions versions; mean("English Marks")
# would give the same result as avg.
DF3.groupby("Class").agg(
    avg("English Marks"),
    count("English Marks"),
    sum("English Marks"),
).show()

+-----+------------------+--------------------+------------------+
|Class|avg(English Marks)|count(English Marks)|sum(English Marks)|
+-----+------------------+--------------------+------------------+
|    9|              91.0|                   3|               273|
|    8|              84.0|                   4|               336|
|   10|              88.6|                   5|               443|
+-----+------------------+--------------------+------------------+

