# Select() Function
select() is a transformation that returns a new DataFrame containing only the requested columns. The columns can be given as column-name strings, as Column objects, as a list of either, as the string "*" (all columns), or as a list built with a list comprehension.

In [2]:
# findspark locates the local Spark installation and makes pyspark importable,
# so init() is called before the pyspark import below.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Reuse the active session if one exists; otherwise start a local session
# named "DD" with master "local[*]".
spark = (
    SparkSession.builder
    .appName("DD")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Build a one-row DataFrame; the schema is given as a plain list of column names.
data = [(1, "Shreyas", 24)]
schema = ["Id", "Name", "Age"]
df = spark.createDataFrame(data=data, schema=schema)
df.show()

# Three equivalent ways to pick the Id and Age columns:
df.select("Id", "Age").show()          # column names as quoted strings
df.select(df.Id, df.Age).show()        # attribute access: dataframe.colname
df.select(df["Id"], df["Age"]).show()  # item access: dataframe["colname"]

+---+-------+---+
| Id|   Name|Age|
+---+-------+---+
|  1|Shreyas| 24|
+---+-------+---+

+---+---+
| Id|Age|
+---+---+
|  1| 24|
+---+---+

+---+---+
| Id|Age|
+---+---+
|  1| 24|
+---+---+

+---+---+
| Id|Age|
+---+---+
|  1| 24|
+---+---+



In [4]:
# select() also accepts Column objects built with col() from pyspark.sql.functions.
from pyspark.sql.functions import col

# NOTE: "ID" differs in case from the schema's "Id". It still resolves (the
# recorded output shows it), and the result column takes the spelling used here.
df.select(col("ID"), col("Name")).show()

# Column names can also be passed as a single list.
df.select(["Id", "Name"]).show()

# Two ways to select every column:
df.select("*").show()
# df.columns returns the list of all column names in the DataFrame. The loop
# variable is deliberately not named `col`, so it cannot be confused with the
# col() function imported above.
df.select([column_name for column_name in df.columns]).show()

+---+-------+
| ID|   Name|
+---+-------+
|  1|Shreyas|
+---+-------+

+---+-------+
| Id|   Name|
+---+-------+
|  1|Shreyas|
+---+-------+

+---+-------+---+
| Id|   Name|Age|
+---+-------+---+
|  1|Shreyas| 24|
+---+-------+---+

+---+-------+---+
| Id|   Name|Age|
+---+-------+---+
|  1|Shreyas| 24|
+---+-------+---+



In [None]:
# selectExpr()
# selectExpr() is similar to select(); the difference is that it takes SQL
# expressions as strings. This makes it possible to run SQL-like expressions
# without creating a temporary table or view.

# Suppose a column needs an alias while it is being selected. With select(),
# this is done by wrapping the SQL snippet in expr().
from pyspark.sql.functions import expr
df.select(expr("Id as Identity"), "Name").show()

# selectExpr() accepts the same SQL snippet directly as a string.
df.selectExpr("Id as Identity", "Name").show()

# selectExpr() is not limited to aliases; it can also evaluate more complex
# SQL-like expressions on the DataFrame. Here 100 is added to Age and the
# result is exposed as New_Age.
df.selectExpr("Age+100 as New_Age").show()

# Anything that is valid in a SQL SELECT list can be used the same way,
# for example aggregates such as count, sum and max.
df.selectExpr("Sum(Age)", "Max(Age)").show()

In [None]:
Basically selectExpr will take string values as input and these strings can be SQL