In [1]:
# Importing the PySpark libraries

import pandas as pd
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Build the Spark session for this notebook (an existing session is reused)

spark = (
    SparkSession.builder
    .appName('practice_spark')
    .getOrCreate()
)

In [4]:
# Display the Spark session object to confirm it was created

spark

- When we run Spark locally we always see just one cluster, but when we work in the cloud we can create multiple clusters and instances.

In [26]:
# Load the same CSV with pandas (imported in the top import cell) so that the
# pandas dataframe type can be compared with the PySpark dataframe type.
df = pd.read_csv('50_Startups.csv')

In [27]:
# Check the type of the pandas dataframe
type(df)

pandas.core.frame.DataFrame

### PySpark Dataframe

In [17]:
# Load the CSV into a PySpark dataframe, treating the first line as the header

df_movie = spark.read.csv(path='50_Startups.csv', header=True)

In [33]:
# Alternative: set the header option on the reader before loading the CSV

df_movie = (
    spark.read
    .option('header', 'true')
    .csv('50_Startups.csv')
)

In [34]:
# Print the dataframe contents as a text table
df_movie.show()

+---------+--------------+---------------+----------+---------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|
+---------+--------------+---------------+----------+---------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|
| 162597.7|     151377.59|      443898.53|California|191792.06|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|
|144372.41|     118671.85|      383199.62|  New York|182901.99|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|
| 131876.9|      99814.71|      362861.36|  New York|156991.12|
|134615.46|     147198.87|      127716.82|California|156122.51|
|130298.13|     145530.06|      323876.68|   Florida| 155752.6|
|120542.52|     148718.95|      311613.29|  New York|152211.77|
|123334.88|     108679.17|      304981.62|California|149759.96|
|101913.08|     110594.11|      229160.95|   Florida|146121.95|
|100671.96|      91790.61|      249744.55|California| 144259.4|
| 93863.75|     127320.38|      249839.4

In [21]:
# Check the type of the PySpark dataframe object

type(df_movie)

pyspark.sql.dataframe.DataFrame

- Here we can see that the type of the dataframe is `pyspark.sql.dataframe.DataFrame`, whereas a pandas dataframe is of type `pandas.core.frame.DataFrame`.

In [18]:
# List the column names; the attribute has the same name as in a pandas dataframe

df_movie.columns

['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit']

In [31]:
# Fetch the first 5 rows (returned as a list of Row objects)

df_movie.head(5)

[Row(R&D Spend='165349.2', Administration='136897.8', Marketing Spend='471784.1', State='New York', Profit='192261.83'),
 Row(R&D Spend='162597.7', Administration='151377.59', Marketing Spend='443898.53', State='California', Profit='191792.06'),
 Row(R&D Spend='153441.51', Administration='101145.55', Marketing Spend='407934.54', State='Florida', Profit='191050.39'),
 Row(R&D Spend='144372.41', Administration='118671.85', Marketing Spend='383199.62', State='New York', Profit='182901.99'),
 Row(R&D Spend='142107.34', Administration='91391.77', Marketing Spend='366168.42', State='Florida', Profit='166187.94')]

In [20]:
# printSchema() prints each column's datatype and nullability, similar to info() in pandas

df_movie.printSchema()

root
 |-- R&D Spend: string (nullable = true)
 |-- Administration: string (nullable = true)
 |-- Marketing Spend: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Profit: string (nullable = true)



- By default, every column is read as a string.
- To get the correct datatypes we can pass the `inferSchema=True` parameter, which makes Spark infer each column's datatype from the data.

In [35]:
# Re-read the CSV, this time letting Spark infer each column's datatype
df_movie = spark.read.csv(
    '50_Startups.csv',
    header=True,
    inferSchema=True,
)
df_movie.printSchema()

root
 |-- R&D Spend: double (nullable = true)
 |-- Administration: double (nullable = true)
 |-- Marketing Spend: double (nullable = true)
 |-- State: string (nullable = true)
 |-- Profit: double (nullable = true)



In [39]:
# Select specific columns and show the first 10 rows

df_movie.select('R&D Spend', 'State', 'Profit').show(10)

+---------+----------+---------+
|R&D Spend|     State|   Profit|
+---------+----------+---------+
| 165349.2|  New York|192261.83|
| 162597.7|California|191792.06|
|153441.51|   Florida|191050.39|
|144372.41|  New York|182901.99|
|142107.34|   Florida|166187.94|
| 131876.9|  New York|156991.12|
|134615.46|California|156122.51|
|130298.13|   Florida| 155752.6|
|120542.52|  New York|152211.77|
|123334.88|California|149759.96|
+---------+----------+---------+
only showing top 10 rows



In [42]:
# List (column name, datatype) pairs for the dataframe

df_movie.dtypes

[('R&D Spend', 'double'),
 ('Administration', 'double'),
 ('Marketing Spend', 'double'),
 ('State', 'string'),
 ('Profit', 'double')]

In [44]:
# Summary statistics (count, mean, stddev, min, max) for every column
df_movie.describe().show()

+-------+-----------------+------------------+------------------+----------+------------------+
|summary|        R&D Spend|    Administration|   Marketing Spend|     State|            Profit|
+-------+-----------------+------------------+------------------+----------+------------------+
|  count|               50|                50|                50|        50|                50|
|   mean|73721.61559999999|121344.63959999995|211025.09780000005|      null|112012.63920000002|
| stddev|45902.25648230754|28017.802755488683|122290.31072584528|      null|40306.180337650534|
|    min|              0.0|          51283.14|               0.0|California|           14681.4|
|    max|         165349.2|         182645.56|          471784.1|  New York|         192261.83|
+-------+-----------------+------------------+------------------+----------+------------------+



### Adding columns in PySpark dataframe

In [53]:
# Add a derived column holding Profit increased by 10000
profit_plus_10k = df_movie['Profit'] + 10000
df_movie = df_movie.withColumn('Profit+10000', profit_plus_10k)
df_movie.show(10)

+---------+--------------+---------------+----------+---------+------------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|Profit+10000|
+---------+--------------+---------------+----------+---------+------------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|   202261.83|
| 162597.7|     151377.59|      443898.53|California|191792.06|   201792.06|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|   201050.39|
|144372.41|     118671.85|      383199.62|  New York|182901.99|   192901.99|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|   176187.94|
| 131876.9|      99814.71|      362861.36|  New York|156991.12|   166991.12|
|134615.46|     147198.87|      127716.82|California|156122.51|   166122.51|
|130298.13|     145530.06|      323876.68|   Florida| 155752.6|    165752.6|
|120542.52|     148718.95|      311613.29|  New York|152211.77|   162211.77|
|123334.88|     108679.17|      304981.62|California|149759.96|   159759.96|

### Dropping columns in PySpark dataframe

In [56]:
# Drop the 'Profit+10000' column again; the result is assigned back to df_movie
df_movie = df_movie.drop('Profit+10000')

df_movie.show(10)

+---------+--------------+---------------+----------+---------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|
+---------+--------------+---------------+----------+---------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|
| 162597.7|     151377.59|      443898.53|California|191792.06|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|
|144372.41|     118671.85|      383199.62|  New York|182901.99|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|
| 131876.9|      99814.71|      362861.36|  New York|156991.12|
|134615.46|     147198.87|      127716.82|California|156122.51|
|130298.13|     145530.06|      323876.68|   Florida| 155752.6|
|120542.52|     148718.95|      311613.29|  New York|152211.77|
|123334.88|     108679.17|      304981.62|California|149759.96|
+---------+--------------+---------------+----------+---------+
only showing top 10 rows



### Renaming column in PySpark dataframe

In [57]:
# Rename the 'State' column to 'State_N'
df_movie = df_movie.withColumnRenamed('State', 'State_N')

df_movie.show(10)

+---------+--------------+---------------+----------+---------+
|R&D Spend|Administration|Marketing Spend|   State_N|   Profit|
+---------+--------------+---------------+----------+---------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|
| 162597.7|     151377.59|      443898.53|California|191792.06|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|
|144372.41|     118671.85|      383199.62|  New York|182901.99|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|
| 131876.9|      99814.71|      362861.36|  New York|156991.12|
|134615.46|     147198.87|      127716.82|California|156122.51|
|130298.13|     145530.06|      323876.68|   Florida| 155752.6|
|120542.52|     148718.95|      311613.29|  New York|152211.77|
|123334.88|     108679.17|      304981.62|California|149759.96|
+---------+--------------+---------------+----------+---------+
only showing top 10 rows

