<a href="https://colab.research.google.com/github/PavanBJ/PySpark/blob/main/Core_Concepts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#setting up a spark session
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

## Bringing data into DataFrames

In [2]:
# Generic reader entry point: the source format and the header option are
# passed to load() as keyword arguments instead of being chained.
df = spark.read.load("original.csv", format="csv", header="true")
df.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52|39.9947462|116.3397725|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.6489954|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145| -6.1644997|
|  7|     Masha|    Divers|Female|         Dachun|     

In [3]:
# Same file read through the csv() shortcut; the header flag is set as a
# reader option before the path is given.
df2 = spark.read.option("header", True).csv("original.csv")
df2.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52|39.9947462|116.3397725|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.6489954|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145| -6.1644997|
|  7|     Masha|    Divers|Female|         Dachun|     

In [5]:
# Column types of the read done without a schema: every column, including id
# and the coordinates, is reported as string (see the output below).
df2.dtypes

[('id', 'string'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('gender', 'string'),
 ('City', 'string'),
 ('JobTitle', 'string'),
 ('Salary', 'string'),
 ('Latitude', 'string'),
 ('Longitude', 'string')]

In [6]:
# Evaluating a DataFrame on its own displays only its column names and types,
# not any rows.
df2

DataFrame[id: string, first_name: string, last_name: string, gender: string, City: string, JobTitle: string, Salary: string, Latitude: string, Longitude: string]

In [11]:
# Import only the type classes this schema uses (instead of `import *`), so it
# is clear where each name comes from.
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema for original.csv: id as int, longitude as float, and every
# other column as string. The field names here are the ones the resulting
# DataFrame will expose.
# NOTE(review): latitude is declared as StringType while longitude is
# FloatType — confirm whether that asymmetry is intentional.
schema = StructType([
    StructField('id',IntegerType()),
    StructField('first_name',StringType()),
    StructField('last_name',StringType()),
    StructField('gender',StringType()),
    StructField('city',StringType()),
    StructField('job_title',StringType()),
    StructField('salary',StringType()),
    StructField('latitude',StringType()),
    StructField('longitude',FloatType())
])

In [12]:
# Read the CSV with the explicit `schema`; the schema and header flag are set
# on the reader before the path is supplied.
df3 = spark.read.schema(schema).option("header", True).csv('original.csv')
df3.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
| id|first_name| last_name|gender|           city|           job_title|   salary|  latitude| longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.496717|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572| 103.52182|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52|39.9947462|116.339775|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.130016|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.648994|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145|-6.1644998|
|  7|     Masha|    Divers|Female|         Dachun|              

In [13]:
# Types after applying `schema`: id is int and longitude is float, while
# latitude is still string because the schema declares it as StringType.
df3.dtypes

[('id', 'int'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('gender', 'string'),
 ('city', 'string'),
 ('job_title', 'string'),
 ('salary', 'string'),
 ('latitude', 'string'),
 ('longitude', 'float')]

# Inspecting a DataFrame

In [14]:
# Type classes for the schema (already imported by an earlier cell; repeated here).
from pyspark.sql.types import *

# Schema for original.csv built field by field: id as int, both coordinates
# as float, all remaining columns as string.
myschema = (
    StructType()
    .add('id', IntegerType())
    .add('first_name', StringType())
    .add('last_name', StringType())
    .add('gender', StringType())
    .add('city', StringType())
    .add('job_title', StringType())
    .add('salary', StringType())
    .add('latitude', FloatType())
    .add('longitude', FloatType())
)

In [15]:
# Re-read the file with `myschema`. This rebinds `df`, replacing the
# all-string DataFrame loaded at the top of the notebook.
df = spark.read.schema(myschema).csv('original.csv', header=True)
df.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
| id|first_name| last_name|gender|           city|           job_title|   salary|  latitude| longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52| 39.994747|116.339775|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.648994|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16| 53.426613|-6.1644998|
|  7|     Masha|    Divers|Female|         Dachun|              

In [16]:
# With `myschema`, latitude and longitude are both float and id is int.
df.dtypes

[('id', 'int'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('gender', 'string'),
 ('city', 'string'),
 ('job_title', 'string'),
 ('salary', 'string'),
 ('latitude', 'float'),
 ('longitude', 'float')]

In [17]:
# head() with no argument returns the first record as a single Row.
df.head()

Row(id=1, first_name='Melinde', last_name='Shilburne', gender='Female', city='Nowa Ruda', job_title='Assistant Professor', salary='$57438.18', latitude=50.57740783691406, longitude=16.49671745300293)

In [18]:
# head(n) returns a Python list holding the first n Row objects.
df.head(10)

[Row(id=1, first_name='Melinde', last_name='Shilburne', gender='Female', city='Nowa Ruda', job_title='Assistant Professor', salary='$57438.18', latitude=50.57740783691406, longitude=16.49671745300293),
 Row(id=2, first_name='Kimberly', last_name='Von Welden', gender='Female', city='Bulgan', job_title='Programmer II', salary='$62846.60', latitude=48.823158264160156, longitude=103.52182006835938),
 Row(id=3, first_name='Alvera', last_name='Di Boldi', gender='Female', city=None, job_title=None, salary='$57576.52', latitude=39.994747161865234, longitude=116.33977508544922),
 Row(id=4, first_name='Shannon', last_name="O'Griffin", gender='Male', city='Divnomorskoye', job_title='Budget/Accounting Analyst II', salary='$61489.23', latitude=44.504722595214844, longitude=38.1300163269043),
 Row(id=5, first_name='Sherwood', last_name='Macieja', gender='Male', city='Mytishchi', job_title='VP Sales', salary='$63863.09', latitude=None, longitude=37.64899444580078),
 Row(id=6, first_name='Maris', last

In [19]:
# describe() returns a new DataFrame of summary statistics whose columns are
# all strings; evaluating it displays only that schema, not the numbers.
df.describe()

DataFrame[summary: string, id: string, first_name: string, last_name: string, gender: string, city: string, job_title: string, salary: string, latitude: string, longitude: string]

In [21]:
# Render the summary table. mean and stddev are null for the string columns
# (salary included, since it is read as a string such as '$57438.18').
df.describe().show()

+-------+-----------------+----------+---------+------+-------------------+-------------------+---------+------------------+-----------------+
|summary|               id|first_name|last_name|gender|               city|          job_title|   salary|          latitude|        longitude|
+-------+-----------------+----------+---------+------+-------------------+-------------------+---------+------------------+-----------------+
|  count|             1000|      1000|     1000|  1000|                999|                998|     1000|               999|             1000|
|   mean|            500.5|      null|     null|  null|               null|               null|     null| 25.43151724702484|43.33756460386515|
| stddev|288.8194360957494|      null|     null|  null|               null|               null|     null|24.579082550156635| 69.4206453674681|
|    min|                1|   Abagail|    Abbay|Female|             Abéché|Account Coordinator|$10101.92|         -54.28115|       -123.04196|

In [22]:
# first() returns the first record as a single Row.
df.first()

Row(id=1, first_name='Melinde', last_name='Shilburne', gender='Female', city='Nowa Ruda', job_title='Assistant Professor', salary='$57438.18', latitude=50.57740783691406, longitude=16.49671745300293)

In [27]:
# Takeaway: first() always returns a single Row, whereas head(n) returns the first n rows as a list (head() with no argument returns a single Row, like first()).

In [30]:
# tail(n) returns the last n records as a Python list of Row objects.
df.tail(10)

[Row(id=991, first_name='Mic', last_name='Benbough', gender='Male', city='Arbeláez', job_title='GIS Technical Architect', salary='$89017.71', latitude=4.272792816162109, longitude=-74.416015625),
 Row(id=992, first_name='Scotti', last_name='Dusey', gender='Male', city='Medveditskiy', job_title='Senior Cost Accountant', salary='$75714.21', latitude=50.7842903137207, longitude=44.712886810302734),
 Row(id=993, first_name='Levi', last_name='Ramsell', gender='Male', city='Lluchubamba', job_title='Account Coordinator', salary='$84527.06', latitude=-7.523183345794678, longitude=-77.97100830078125),
 Row(id=994, first_name='Nilson', last_name='Dupree', gender='Male', city='Öjebyn', job_title='Legal Assistant', salary='$14880.62', latitude=65.40260314941406, longitude=21.188669204711914),
 Row(id=995, first_name='Niki', last_name='Ashbrook', gender='Male', city='Novozavidovskiy', job_title='Media Manager II', salary='$67437.88', latitude=56.55147933959961, longitude=36.43471908569336),
 Row(id

In [31]:
# Column names as a plain Python list of strings.
df.columns

['id',
 'first_name',
 'last_name',
 'gender',
 'city',
 'job_title',
 'salary',
 'latitude',
 'longitude']

In [32]:
# Number of rows in the DataFrame (1000 for this file).
df.count()

1000

In [33]:
# distinct() returns another DataFrame with the same columns and types;
# evaluating it displays only the schema.
df.distinct()

DataFrame[id: int, first_name: string, last_name: string, gender: string, city: string, job_title: string, salary: string, latitude: float, longitude: float]

In [34]:
# Number of distinct rows. It equals df.count() (1000), so the file contains
# no fully duplicated rows.
df.distinct().count()

1000