In [1]:
# Install PySpark
!pip install pyspark



In [2]:
# Import PySpark module
import pyspark

In [3]:
from pyspark.sql import SparkSession

# Entry point for all DataFrame work in this notebook: obtain a session with
# the application name "pyspark" (getOrCreate reuses a running session if any).
spark = (
    SparkSession.builder
    .appName("pyspark")
    .getOrCreate()
)

In [4]:
# Plain read with default options: the header row is NOT recognised, so Spark
# names the columns _c0, _c1, ... and the header line appears as a data row.
df = (
    spark.read
    .csv("/content/College_Students.csv")
)
df.show()

+----------+--------------+-----------------+--------+--------------+---------+------+---+---------+---------------+-----------+-----------------+
|       _c0|           _c1|              _c2|     _c3|           _c4|      _c5|   _c6|_c7|      _c8|            _c9|       _c10|             _c11|
+----------+--------------+-----------------+--------+--------------+---------+------+---+---------+---------------+-----------+-----------------+
|Student_ID|          Name|           Course|Semester|Attendance (%)|Marks (%)|Gender|Age|     City|Hostel Resident|Club Member|Internship Status|
|      S001|   Aditi Verma|B.Sc Data Science|       2|            92|       85|  Male| 20|   Mumbai|            Yes|         No|        Completed|
|      S002|   Rahul Mehta|            B.Com|       4|            78|       73|  Male| 24|  Kolkata|             No|        Yes|      Not Started|
|      S003|   Sneha Gupta|       B.Tech CSE|       6|            88|       81|Female| 24|   Jaipur|             No|  

In [5]:
# Same file again, this time with header=True so the first line of the CSV
# supplies the column names instead of being treated as data.
df = spark.read.csv("/content/College_Students.csv", header=True)
df.show()

+----------+--------------+-----------------+--------+--------------+---------+------+---+---------+---------------+-----------+-----------------+
|Student_ID|          Name|           Course|Semester|Attendance (%)|Marks (%)|Gender|Age|     City|Hostel Resident|Club Member|Internship Status|
+----------+--------------+-----------------+--------+--------------+---------+------+---+---------+---------------+-----------+-----------------+
|      S001|   Aditi Verma|B.Sc Data Science|       2|            92|       85|  Male| 20|   Mumbai|            Yes|         No|        Completed|
|      S002|   Rahul Mehta|            B.Com|       4|            78|       73|  Male| 24|  Kolkata|             No|        Yes|      Not Started|
|      S003|   Sneha Gupta|       B.Tech CSE|       6|            88|       81|Female| 24|   Jaipur|             No|        Yes|        Completed|
|      S004|    Aman Singh|              BBA|       2|            65|       62|  Male| 21|Hyderabad|             No|  

In [6]:
# Inspect the column data types of the DataFrame currently bound to `df`.
# With only the header option set (no schema inference), every column,
# including numeric-looking ones such as Semester and Age, is read as string.
df.printSchema()

root
 |-- Student_ID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Course: string (nullable = true)
 |-- Semester: string (nullable = true)
 |-- Attendance (%): string (nullable = true)
 |-- Marks (%): string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Hostel Resident: string (nullable = true)
 |-- Club Member: string (nullable = true)
 |-- Internship Status: string (nullable = true)



In [7]:
# Re-read the file using the header row for column names and letting Spark
# infer each column's type from the data, then print the resulting schema
# (numeric columns such as Semester, Marks (%) and Age now come back as integer).
df = (
    spark.read
    .options(header=True, inferschema=True)
    .csv("/content/College_Students.csv")
)
df.printSchema()

root
 |-- Student_ID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Course: string (nullable = true)
 |-- Semester: integer (nullable = true)
 |-- Attendance (%): integer (nullable = true)
 |-- Marks (%): integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- Hostel Resident: string (nullable = true)
 |-- Club Member: string (nullable = true)
 |-- Internship Status: string (nullable = true)



In [8]:
# Basic summary statistics for every column: count, mean, stddev, min and max.
# String columns have no meaningful mean/stddev, so those cells show NULL.
df.describe().show()

+-------+----------+-----------+------+------------------+-----------------+-----------------+------+------------------+---------+---------------+-----------+-----------------+
|summary|Student_ID|       Name|Course|          Semester|   Attendance (%)|        Marks (%)|Gender|               Age|     City|Hostel Resident|Club Member|Internship Status|
+-------+----------+-----------+------+------------------+-----------------+-----------------+------+------------------+---------+---------------+-----------+-----------------+
|  count|        20|         20|    20|                20|               20|               20|    20|                20|       20|             20|         20|               20|
|   mean|      NULL|       NULL|  NULL|               3.3|             82.8|             77.4|  NULL|              21.3|     NULL|           NULL|       NULL|             NULL|
| stddev|      NULL|       NULL|  NULL|1.8093325317714033|9.242920477174321|9.051897155031268|  NULL|2.002629849919

In [9]:
# More detailed summary statistics: called with no arguments, summary() reports
# the same count/mean/stddev/min/max as describe() and additionally the
# 25%, 50% and 75% approximate percentiles.
df.summary().show()

+-------+----------+-----------+------+------------------+-----------------+-----------------+------+------------------+---------+---------------+-----------+-----------------+
|summary|Student_ID|       Name|Course|          Semester|   Attendance (%)|        Marks (%)|Gender|               Age|     City|Hostel Resident|Club Member|Internship Status|
+-------+----------+-----------+------+------------------+-----------------+-----------------+------+------------------+---------+---------------+-----------+-----------------+
|  count|        20|         20|    20|                20|               20|               20|    20|                20|       20|             20|         20|               20|
|   mean|      NULL|       NULL|  NULL|               3.3|             82.8|             77.4|  NULL|              21.3|     NULL|           NULL|       NULL|             NULL|
| stddev|      NULL|       NULL|  NULL|1.8093325317714033|9.242920477174321|9.051897155031268|  NULL|2.002629849919