Install spark dependencies

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark

Set the JAVA_HOME and SPARK_HOME environment variables

In [2]:
import os

# Expected install location of the openjdk-8 package installed during setup.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# The Spark tarball is unpacked into the current working directory, so resolve
# SPARK_HOME against it instead of hard-coding "/content". When the notebook runs
# from /content this produces exactly the original path.
os.environ["SPARK_HOME"] = os.path.abspath("spark-3.1.2-bin-hadoop3.2")

Run a local spark session

In [3]:
import findspark
# Must run before pyspark is imported: findspark.init() makes the pyspark package of
# the Spark installation importable (presumably located via SPARK_HOME — confirm).
findspark.init()
from pyspark.sql import SparkSession
# "local[*]" runs Spark locally inside this machine; getOrCreate() returns an
# already-running session if there is one instead of starting another.
spark = SparkSession.builder.master("local[*]").getOrCreate()

Unzip the dataset

In [4]:
!unzip archive.zip

Archive:  archive.zip
replace collegePlace.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: collegePlace.csv        


In [42]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, round

Load the data into Pyspark

In [43]:
# Read the CSV, treating the first line as the header and letting Spark infer
# each column's data type.
college_place_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("collegePlace.csv")
)

Extract first 5 rows from the spark dataframe

In [44]:
# Preview the first 5 rows as a text table.
college_place_df.show(5)

+---+------+--------------------+-----------+----+------+-----------------+-----------+
|Age|Gender|              Stream|Internships|CGPA|Hostel|HistoryOfBacklogs|PlacedOrNot|
+---+------+--------------------+-----------+----+------+-----------------+-----------+
| 22|  Male|Electronics And C...|          1|   8|     1|                1|          1|
| 21|Female|    Computer Science|          0|   7|     1|                1|          1|
| 22|Female|Information Techn...|          1|   6|     0|                0|          1|
| 21|  Male|Information Techn...|          0|   8|     0|                1|          1|
| 22|  Male|          Mechanical|          0|   8|     1|                0|          1|
+---+------+--------------------+-----------+----+------+-----------------+-----------+
only showing top 5 rows



Display the column names in college_place_df

In [45]:
# List the column names of the loaded DataFrame.
print(college_place_df.columns)

['Age', 'Gender', 'Stream', 'Internships', 'CGPA', 'Hostel', 'HistoryOfBacklogs', 'PlacedOrNot']


Filter the dataframe by Age

In [46]:
# One DataFrame per age group of interest: students aged 21 and students aged 22.
college_place_21_df = college_place_df.where(col("Age") == 21)
college_place_22_df = college_place_df.where(col("Age") == 22)

Find the dimensions (row count, column count) of each filtered Spark DataFrame

In [47]:
# Print (row count, column count) for the age-21 frame, then the age-22 frame.
for age_group_df in (college_place_21_df, college_place_22_df):
    print((age_group_df.count(), len(age_group_df.columns)))

(1084, 8)
(941, 8)


Find the average cgpa, number of students placed and total number of students in each stream

In [48]:
# Per-stream aggregates. The result columns are aliased at creation, so the schema
# printed right after this cell already carries the final names on a fresh
# top-to-bottom run, and the column order is stated explicitly. The later
# withColumnRenamed cell does nothing when its source column name is absent, so it
# stays safe to run.
stream_groups = college_place_df.groupBy("Stream")

# Average CGPA and number of placed students per stream (PlacedOrNot appears to be
# a 0/1 flag, so its sum is the number of placed students).
college_place_by_stream = stream_groups.agg(
    F.avg("CGPA").alias("Average_CGPA"),
    F.sum("PlacedOrNot").alias("Number_of_Students_Placed"),
)

# Number of students per stream (counts rows whose PlacedOrNot is not null).
college_student_by_stream = stream_groups.agg(
    F.count("PlacedOrNot").alias("Number_of_Students"),
)

In [62]:
# Inspect the column names, types and nullability of both per-stream aggregates.
college_place_by_stream.printSchema()
college_student_by_stream.printSchema()

root
 |-- Stream: string (nullable = true)
 |-- Average_CGPA: double (nullable = true)
 |-- Number_of_Students_Placed: long (nullable = true)

root
 |-- Stream: string (nullable = true)
 |-- Number_of_Students: long (nullable = false)



Rename columns in spark dataframe

In [50]:
# Replace Spark's auto-generated aggregate column names with readable ones.
placed_renames = {
    "avg(CGPA)": "Average_CGPA",
    "sum(PlacedOrNot)": "Number_of_Students_Placed",
}
for old_name, new_name in placed_renames.items():
    college_place_by_stream = college_place_by_stream.withColumnRenamed(old_name, new_name)

college_student_by_stream = college_student_by_stream.withColumnRenamed(
    "count(PlacedOrNot)", "Number_of_Students"
)

In [51]:
#college_place_by_stream.printSchema()
#college_student_by_stream.printSchema()

In [52]:
#college_place_by_stream.show()
#college_student_by_stream.show()

Join college_place_by_stream and college_student_by_stream

In [53]:
# Combine both per-stream aggregates into one row per stream, matching on "Stream".
college_placement_join = college_place_by_stream.join(
    college_student_by_stream,
    on="Stream",
    how="inner",
)

In [54]:
# Display the joined per-stream summary.
college_placement_join.show()

+--------------------+-----------------+-------------------------+------------------+
|              Stream|     Average_CGPA|Number_of_Students_Placed|Number_of_Students|
+--------------------+-----------------+-------------------------+------------------+
|          Mechanical|7.063679245283019|                      200|               424|
|Information Techn...|7.073806078147612|                      409|               691|
|Electronics And C...|            7.125|                      251|               424|
|               Civil|7.094637223974764|                      146|               317|
|    Computer Science|7.039948453608248|                      452|               776|
|          Electrical|7.080838323353293|                      181|               334|
+--------------------+-----------------+-------------------------+------------------+



Change the datatype of columns

In [55]:
# Cast both count columns to int, replacing each column in place under its own name.
for count_column in ("Number_of_Students_Placed", "Number_of_Students"):
    college_placement_join = college_placement_join.withColumn(
        count_column, col(count_column).cast("int")
    )

Create derived column - Percentage of students placed

In [56]:
# Share of students placed per stream, as a percentage rounded to 2 decimals.
# NOTE: `round` here is pyspark.sql.functions.round (imported earlier in the
# notebook), which shadows Python's built-in round().
placed_ratio = col("Number_of_Students_Placed") / col("Number_of_Students")
college_placement_join = college_placement_join.withColumn(
    "percent_placed", round(placed_ratio * 100, 2)
)

In [57]:
# Display the summary including the new percent_placed column.
college_placement_join.show()

+--------------------+-----------------+-------------------------+------------------+--------------+
|              Stream|     Average_CGPA|Number_of_Students_Placed|Number_of_Students|percent_placed|
+--------------------+-----------------+-------------------------+------------------+--------------+
|          Mechanical|7.063679245283019|                      200|               424|         47.17|
|Information Techn...|7.073806078147612|                      409|               691|         59.19|
|Electronics And C...|            7.125|                      251|               424|          59.2|
|               Civil|7.094637223974764|                      146|               317|         46.06|
|    Computer Science|7.039948453608248|                      452|               776|         58.25|
|          Electrical|7.080838323353293|                      181|               334|         54.19|
+--------------------+-----------------+-------------------------+------------------+--------------+

Finding which Stream has highest number of placed students

In [58]:
# Order streams by the absolute number of placed students, largest first.
college_placement_join = college_placement_join.orderBy(
    col("Number_of_Students_Placed").desc()
)

In [59]:
# Display the summary ordered by number of placed students (descending).
college_placement_join.show()

+--------------------+-----------------+-------------------------+------------------+--------------+
|              Stream|     Average_CGPA|Number_of_Students_Placed|Number_of_Students|percent_placed|
+--------------------+-----------------+-------------------------+------------------+--------------+
|    Computer Science|7.039948453608248|                      452|               776|         58.25|
|Information Techn...|7.073806078147612|                      409|               691|         59.19|
|Electronics And C...|            7.125|                      251|               424|          59.2|
|          Mechanical|7.063679245283019|                      200|               424|         47.17|
|          Electrical|7.080838323353293|                      181|               334|         54.19|
|               Civil|7.094637223974764|                      146|               317|         46.06|
+--------------------+-----------------+-------------------------+------------------+--------------+

Sort by percentage of students placed

In [60]:
# Order streams by placement percentage, highest first.
college_placement_join = college_placement_join.orderBy(
    col("percent_placed").desc()
)

In [61]:
# Display the summary ordered by percent_placed (descending).
college_placement_join.show()

+--------------------+-----------------+-------------------------+------------------+--------------+
|              Stream|     Average_CGPA|Number_of_Students_Placed|Number_of_Students|percent_placed|
+--------------------+-----------------+-------------------------+------------------+--------------+
|Electronics And C...|            7.125|                      251|               424|          59.2|
|Information Techn...|7.073806078147612|                      409|               691|         59.19|
|    Computer Science|7.039948453608248|                      452|               776|         58.25|
|          Electrical|7.080838323353293|                      181|               334|         54.19|
|          Mechanical|7.063679245283019|                      200|               424|         47.17|
|               Civil|7.094637223974764|                      146|               317|         46.06|
+--------------------+-----------------+-------------------------+------------------+--------------+