In [1]:
import os
import sys
# Point PySpark at the local Spark 2.4.4 install and its bundled Python libs.
# PYSPARK_PYTHON must name a Python *executable* (Spark launches it for Python
# workers); the Spark install's `python` directory is not one, so use the
# interpreter that is running this kernel.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["JAVA_HOME"] = "/usr/java/jdk1.8.0_161/jre"
os.environ["SPARK_HOME"] = "/home/ec2-user/spark-2.4.4-bin-hadoop2.7"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# Put the bundled py4j and pyspark archives at the front of sys.path so they
# are importable; pyspark.zip is inserted last and therefore ends up first.
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

# Spark DataFrame Basics


Import SparkSession

In [2]:
from pyspark.sql import SparkSession

Then start the SparkSession

In [3]:
# Build a session under the application name "Basics", or reuse an existing one.
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook displays the session object.
spark

## Creating a DataFrame

#### Reading data
You will first need to get the data from a file.

`.read` is accessed on a SparkSession.

It can read from a variety of file types, such as .csv and .json.

In [15]:
# Load the employee CSV: the first row is the header and column types are inferred.
df = spark.read.csv(
    'employee_data.csv',
    header=True,
    inferSchema=True,
)

In [16]:
# Bare expression: displays the DataFrame's repr (column names and types), not its rows.
df

DataFrame[name: string, salary: double, age: double]

#### Showing the data

The `.show()` method is called on a DataFrame and is an action: it prints the rows.

In [17]:
# Print the DataFrame's rows as a text table.
df.show()

+-------+--------+----+
|   name|  salary| age|
+-------+--------+----+
|Michael| 40000.0|29.0|
|   Andy| 50000.0|30.0|
| Justin| 30000.0|19.0|
|Michael| 45000.0|45.0|
|  Sandy| 57000.0|37.0|
|  David| 31000.0|30.0|
|   Bill|210000.0|35.0|
|   Elon|150000.0|37.0|
| Donald|910000.0|49.0|
|  Kumar|720000.0|41.0|
| Russel| 30800.0|40.0|
|  Peter| 29000.0|39.0|
|   John| 18000.0|32.0|
|  Kevin| 99000.0|55.0|
|  Rocky| 88000.0|60.0|
+-------+--------+----+



#### Schema of data

`.printSchema()` is called on a DataFrame.

It shows the column names and the data type of each column.

In [18]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- age: double (nullable = true)



#### Column names 

`.columns` is an attribute of a DataFrame.

It returns the column names of the DataFrame.

In [19]:
# Column names of the DataFrame, as a Python list.
df.columns

['name', 'salary', 'age']

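The `.describe()` method calculates summary statistics (count, mean, stddev, min, max) for the columns; for string columns such as `name`, the mean and stddev are null. It returns a new DataFrame, so chain `.show()` to see the actual statistics.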

In [21]:
# describe() builds a summary DataFrame (count, mean, stddev, min, max); show() prints it.
df.describe().show()

+-------+-----+------------------+------------------+
|summary| name|            salary|               age|
+-------+-----+------------------+------------------+
|  count|   15|                15|                15|
|   mean| null|167186.66666666666| 38.53333333333333|
| stddev| null|  270544.809773239|10.588853078771539|
|    min| Andy|           18000.0|              19.0|
|    max|Sandy|          910000.0|              60.0|
+-------+-----+------------------+------------------+



### Grabbing the data

#### Columns

In this section we will look at commands which can help us extract data from DataFrames.

Indexing with a column name (e.g. `df['age']`) returns a Column object. To get a DataFrame containing just that column, which can then be shown, use `.select()`:


In [22]:
# Select only the 'age' column and print the resulting single-column DataFrame.
df.select('age').show()

+----+
| age|
+----+
|29.0|
|30.0|
|19.0|
|45.0|
|37.0|
|30.0|
|35.0|
|37.0|
|49.0|
|41.0|
|40.0|
|39.0|
|32.0|
|55.0|
|60.0|
+----+



#### Multiple Columns:

In [24]:
# Select two columns by name and print them.
df.select('name', 'salary').show()

+-------+--------+
|   name|  salary|
+-------+--------+
|Michael| 40000.0|
|   Andy| 50000.0|
| Justin| 30000.0|
|Michael| 45000.0|
|  Sandy| 57000.0|
|  David| 31000.0|
|   Bill|210000.0|
|   Elon|150000.0|
| Donald|910000.0|
|  Kumar|720000.0|
| Russel| 30800.0|
|  Peter| 29000.0|
|   John| 18000.0|
|  Kevin| 99000.0|
|  Rocky| 88000.0|
+-------+--------+




#### Calling Rows

In [26]:
# Return the first 2 rows as a list of Row objects.
df.head(2)

[Row(name='Michael', salary=40000.0, age=29.0),
 Row(name='Andy', salary=50000.0, age=30.0)]

In [27]:
# Keep the first 5 rows as a Python list of Row objects.
row_list = df.head(5)

In [28]:
# Display the stored list of Row objects.
row_list

[Row(name='Michael', salary=40000.0, age=29.0),
 Row(name='Andy', salary=50000.0, age=30.0),
 Row(name='Justin', salary=30000.0, age=19.0),
 Row(name='Michael', salary=45000.0, age=45.0),
 Row(name='Sandy', salary=57000.0, age=37.0)]

In [29]:
# Index into the list to get the first Row.
row_list[0]

Row(name='Michael', salary=40000.0, age=29.0)

`.head(n)` returns a list of Row objects. The number of rows to return is given as the argument to `.head()`.

### Creating new columns

In [30]:
# Derive a monthly figure from the yearly salary and print the result;
# `df` itself is left unchanged.
(
    df
    .withColumn('monthly_salary', df['salary'] / 12)
    .show()
)

+-------+--------+----+------------------+
|   name|  salary| age|    monthly_salary|
+-------+--------+----+------------------+
|Michael| 40000.0|29.0|3333.3333333333335|
|   Andy| 50000.0|30.0| 4166.666666666667|
| Justin| 30000.0|19.0|            2500.0|
|Michael| 45000.0|45.0|            3750.0|
|  Sandy| 57000.0|37.0|            4750.0|
|  David| 31000.0|30.0|2583.3333333333335|
|   Bill|210000.0|35.0|           17500.0|
|   Elon|150000.0|37.0|           12500.0|
| Donald|910000.0|49.0| 75833.33333333333|
|  Kumar|720000.0|41.0|           60000.0|
| Russel| 30800.0|40.0|2566.6666666666665|
|  Peter| 29000.0|39.0|2416.6666666666665|
|   John| 18000.0|32.0|            1500.0|
|  Kevin| 99000.0|55.0|            8250.0|
|  Rocky| 88000.0|60.0| 7333.333333333333|
+-------+--------+----+------------------+



In [31]:
# Keep the DataFrame that carries the derived monthly_salary column.
df2 = df.withColumn(
    'monthly_salary',
    df['salary'] / 12,
)

In [32]:
# Print the DataFrame that includes the monthly_salary column.
df2.show()

+-------+--------+----+------------------+
|   name|  salary| age|    monthly_salary|
+-------+--------+----+------------------+
|Michael| 40000.0|29.0|3333.3333333333335|
|   Andy| 50000.0|30.0| 4166.666666666667|
| Justin| 30000.0|19.0|            2500.0|
|Michael| 45000.0|45.0|            3750.0|
|  Sandy| 57000.0|37.0|            4750.0|
|  David| 31000.0|30.0|2583.3333333333335|
|   Bill|210000.0|35.0|           17500.0|
|   Elon|150000.0|37.0|           12500.0|
| Donald|910000.0|49.0| 75833.33333333333|
|  Kumar|720000.0|41.0|           60000.0|
| Russel| 30800.0|40.0|2566.6666666666665|
|  Peter| 29000.0|39.0|2416.6666666666665|
|   John| 18000.0|32.0|            1500.0|
|  Kevin| 99000.0|55.0|            8250.0|
|  Rocky| 88000.0|60.0| 7333.333333333333|
+-------+--------+----+------------------+



The added column does not change the original DataFrame, because DataFrames are immutable: `withColumn` returns a new DataFrame. To keep the new column, assign the result to a new variable.

Renaming the column name:

In [33]:
# Rename 'salary' to 'yearly_salary' and print the result; `df2` is left unchanged.
(
    df2
    .withColumnRenamed('salary', 'yearly_salary')
    .show()
)

+-------+-------------+----+------------------+
|   name|yearly_salary| age|    monthly_salary|
+-------+-------------+----+------------------+
|Michael|      40000.0|29.0|3333.3333333333335|
|   Andy|      50000.0|30.0| 4166.666666666667|
| Justin|      30000.0|19.0|            2500.0|
|Michael|      45000.0|45.0|            3750.0|
|  Sandy|      57000.0|37.0|            4750.0|
|  David|      31000.0|30.0|2583.3333333333335|
|   Bill|     210000.0|35.0|           17500.0|
|   Elon|     150000.0|37.0|           12500.0|
| Donald|     910000.0|49.0| 75833.33333333333|
|  Kumar|     720000.0|41.0|           60000.0|
| Russel|      30800.0|40.0|2566.6666666666665|
|  Peter|      29000.0|39.0|2416.6666666666665|
|   John|      18000.0|32.0|            1500.0|
|  Kevin|      99000.0|55.0|            8250.0|
|  Rocky|      88000.0|60.0| 7333.333333333333|
+-------+-------------+----+------------------+



In [34]:
# Keep the DataFrame whose 'salary' column is renamed to 'yearly_salary'.
df3 = df2.withColumnRenamed(
    'salary',
    'yearly_salary',
)

In [35]:
# Print the DataFrame with 'salary' renamed to 'yearly_salary'.
df3.show()

+-------+-------------+----+------------------+
|   name|yearly_salary| age|    monthly_salary|
+-------+-------------+----+------------------+
|Michael|      40000.0|29.0|3333.3333333333335|
|   Andy|      50000.0|30.0| 4166.666666666667|
| Justin|      30000.0|19.0|            2500.0|
|Michael|      45000.0|45.0|            3750.0|
|  Sandy|      57000.0|37.0|            4750.0|
|  David|      31000.0|30.0|2583.3333333333335|
|   Bill|     210000.0|35.0|           17500.0|
|   Elon|     150000.0|37.0|           12500.0|
| Donald|     910000.0|49.0| 75833.33333333333|
|  Kumar|     720000.0|41.0|           60000.0|
| Russel|      30800.0|40.0|2566.6666666666665|
|  Peter|      29000.0|39.0|2416.6666666666665|
|   John|      18000.0|32.0|            1500.0|
|  Kevin|      99000.0|55.0|            8250.0|
|  Rocky|      88000.0|60.0| 7333.333333333333|
+-------+-------------+----+------------------+



Renaming and creating columns likewise return new DataFrames and leave the original unchanged; assign the result to a variable to keep it.