## Selecting Columns

In [1]:
from pyspark.sql import SparkSession
# Build the session for this notebook; getOrCreate() returns an already
# running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName('Filtering')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/08 16:46:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/08 16:46:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
import os

# Location of the Season dataset. Set the SEASON_CSV_PATH environment
# variable before starting the kernel to point at your own copy; the
# fallback is the original author's local path, which will not exist on
# other machines.
SEASON_CSV_PATH = os.environ.get(
    'SEASON_CSV_PATH',
    '/Users/kvenkateshrao/Downloads/Tableau Full course/Season.csv',
)

# header=True takes the column names from the first row of the file;
# inferSchema=True lets Spark derive column types from the data instead of
# loading every column as a string.
df_pyspark = spark.read.csv(SEASON_CSV_PATH, header=True, inferSchema=True)

In [3]:
# Preview the loaded data; 20 rows is show()'s default limit, spelled out here.
df_pyspark.show(n=20)

+---------+-----------+-------------+-------------+--------------------+
|Season_Id|Season_Year|Orange_Cap_Id|Purple_Cap_Id|Man_of_the_Series_Id|
+---------+-----------+-------------+-------------+--------------------+
|        1|       2008|          100|          102|                  32|
|        2|       2009|           18|           61|                  53|
|        3|       2010|          133|          131|                 133|
|        4|       2011|          162|          194|                 162|
|        5|       2012|          162|          190|                 315|
|        6|       2013|           19|           71|                  32|
|        7|       2014|           46|          364|                 305|
|        8|       2015|          187|           71|                 334|
|        9|       2016|            8|          299|                   8|
+---------+-----------+-------------+-------------+--------------------+



### Single column selection

In [4]:
# select() accepts Column objects as well as column-name strings.
df_pyspark.select(df_pyspark['Season_Id']).show()

+---------+
|Season_Id|
+---------+
|        1|
|        2|
|        3|
|        4|
|        5|
|        6|
|        7|
|        8|
|        9|
+---------+



### Multiple column selection

In [5]:
# Column names can be passed as separate arguments instead of a list.
df_pyspark.select('Season_Id', 'Season_Year').show()

+---------+-----------+
|Season_Id|Season_Year|
+---------+-----------+
|        1|       2008|
|        2|       2009|
|        3|       2010|
|        4|       2011|
|        5|       2012|
|        6|       2013|
|        7|       2014|
|        8|       2015|
|        9|       2016|
+---------+-----------+



## Filtering Rows

In [6]:
# where() is an alias of filter(); keep only seasons played after 2010.
df_pyspark.where(df_pyspark['Season_Year'] > 2010).show()

+---------+-----------+-------------+-------------+--------------------+
|Season_Id|Season_Year|Orange_Cap_Id|Purple_Cap_Id|Man_of_the_Series_Id|
+---------+-----------+-------------+-------------+--------------------+
|        4|       2011|          162|          194|                 162|
|        5|       2012|          162|          190|                 315|
|        6|       2013|           19|           71|                  32|
|        7|       2014|           46|          364|                 305|
|        8|       2015|          187|           71|                 334|
|        9|       2016|            8|          299|                   8|
+---------+-----------+-------------+-------------+--------------------+



25/01/08 16:47:05 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


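We can filter using comparison operators (`>`, `<`, `>=`, etc.) and string functions like `startswith()`, `contains()`, and `endswith()`. Several conditions can be combined with `&` (and) or `|` (or); wrap each condition in parentheses, because `&` and `|` bind more tightly than comparison operators in Python.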

In [7]:
# Both conditions must hold. Each comparison is parenthesised because `&`
# binds more tightly than `>` and `<=` in Python.
df_pyspark.filter(
    (df_pyspark['Season_Year'] > 2010)
    & (df_pyspark['Season_Id'] <= 7)
).show()

+---------+-----------+-------------+-------------+--------------------+
|Season_Id|Season_Year|Orange_Cap_Id|Purple_Cap_Id|Man_of_the_Series_Id|
+---------+-----------+-------------+-------------+--------------------+
|        4|       2011|          162|          194|                 162|
|        5|       2012|          162|          190|                 315|
|        6|       2013|           19|           71|                  32|
|        7|       2014|           46|          364|                 305|
+---------+-----------+-------------+-------------+--------------------+



## Renaming and Dropping Columns

In [8]:
# Rename 'Season_Year' to 'Year' and remove 'Man_of_the_Series_Id' in one chain.
# NOTE: this rebinds df_pyspark, so the earlier cells that reference
# Season_Year will no longer run unless the CSV is loaded again first.
df_pyspark = (
    df_pyspark
    .withColumnRenamed('Season_Year', 'Year')
    .drop('Man_of_the_Series_Id')
)
df_pyspark.show()

+---------+----+-------------+-------------+
|Season_Id|Year|Orange_Cap_Id|Purple_Cap_Id|
+---------+----+-------------+-------------+
|        1|2008|          100|          102|
|        2|2009|           18|           61|
|        3|2010|          133|          131|
|        4|2011|          162|          194|
|        5|2012|          162|          190|
|        6|2013|           19|           71|
|        7|2014|           46|          364|
|        8|2015|          187|           71|
|        9|2016|            8|          299|
+---------+----+-------------+-------------+



## Column Operations

### Adding a New Column With a Constant Value


In [9]:
from pyspark.sql.functions import lit

# lit() wraps the Python string as a constant Column, so every row
# receives the same "India" value in the new Country column.
df_pyspark = df_pyspark.withColumn(
    "Country",
    lit("India"),
)
df_pyspark.show()


+---------+----+-------------+-------------+-------+
|Season_Id|Year|Orange_Cap_Id|Purple_Cap_Id|Country|
+---------+----+-------------+-------------+-------+
|        1|2008|          100|          102|  India|
|        2|2009|           18|           61|  India|
|        3|2010|          133|          131|  India|
|        4|2011|          162|          194|  India|
|        5|2012|          162|          190|  India|
|        6|2013|           19|           71|  India|
|        7|2014|           46|          364|  India|
|        8|2015|          187|           71|  India|
|        9|2016|            8|          299|  India|
+---------+----+-------------+-------------+-------+



### Using Expressions and Functions

In [10]:
# Derive a new column from an existing one: arithmetic on a Column yields
# another Column expression that is evaluated for each row.
df_pyspark = df_pyspark.withColumn(
    'Year after 2 years',
    df_pyspark['Year'] + 2,
)
df_pyspark.show()

+---------+----+-------------+-------------+-------+------------------+
|Season_Id|Year|Orange_Cap_Id|Purple_Cap_Id|Country|Year after 2 years|
+---------+----+-------------+-------------+-------+------------------+
|        1|2008|          100|          102|  India|              2010|
|        2|2009|           18|           61|  India|              2011|
|        3|2010|          133|          131|  India|              2012|
|        4|2011|          162|          194|  India|              2013|
|        5|2012|          162|          190|  India|              2014|
|        6|2013|           19|           71|  India|              2015|
|        7|2014|           46|          364|  India|              2016|
|        8|2015|          187|           71|  India|              2017|
|        9|2016|            8|          299|  India|              2018|
+---------+----+-------------+-------------+-------+------------------+

