In [1]:
import pyspark


In [2]:
from pyspark.sql import SparkSession, Row

spark = SparkSession.builder.appName("RowExample").getOrCreate()


your 131072x1 screen size is bogus. expect trouble
25/08/18 14:07:12 WARN Utils: Your hostname, KLZPC0015 resolves to a loopback address: 127.0.1.1; using 172.25.17.96 instead (on interface eth0)
25/08/18 14:07:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/18 14:07:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Creating DataFrame from a List of Tuples (Static Values)

In [3]:
data = [
    Row(name="Souvik", age=26, id=1),
    Row(name="Soukajya", age=26, id=2),
    Row(age=27, id=3, name="Akash")
]


In [4]:
df = spark.createDataFrame(data)
df.show()


                                                                                

+--------+---+----+
|    name|age|  id|
+--------+---+----+
|  Souvik| 26|   1|
|Soukajya| 26|   2|
|      27|  3|NULL|
+--------+---+----+



#### To fix this column order issue - 

##### Option 1: Use Consistant Row defination

In [5]:
data = [
    Row(name="Souvik", age=26, id=1),
    Row(name="Soukajya", age=26, id=2),
    Row(name="Akash", age=27, id=3)   # fixed order
]

df = spark.createDataFrame(data)
df.show()


                                                                                

+--------+---+---+
|    name|age| id|
+--------+---+---+
|  Souvik| 26|  1|
|Soukajya| 26|  2|
|   Akash| 27|  3|
+--------+---+---+



                                                                                

##### Option 2: Define a schema explicitly

In [6]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("id", IntegerType(), True),
])


df = spark.createDataFrame(data, schema=schema)
df.show()


[Stage 5:>                                                          (0 + 3) / 3]

+--------+---+---+
|    name|age| id|
+--------+---+---+
|  Souvik| 26|  1|
|Soukajya| 26|  2|
|   Akash| 27|  3|
+--------+---+---+



                                                                                

In [7]:
df.printSchema()


root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- id: integer (nullable = true)



### Creating a DataFrame using spark.createDataFrame() with a List of Dict

In [8]:
data = [
    {"id": 1, "name": "Souvik", "age": 26},
    {"id": 2, "name": "Soukajya", "age": 26},
    {"id": 3, "name": "Akash", "age": 27}
]

df = spark.createDataFrame(data)
df.show()


[Stage 7:>                                                          (0 + 3) / 3]

+---+---+--------+
|age| id|    name|
+---+---+--------+
| 26|  1|  Souvik|
| 26|  2|Soukajya|
| 27|  3|   Akash|
+---+---+--------+



                                                                                

#### Add Column names to DataFrame and Check Schema

In [9]:
data = [
    (1, "Dipankar", 27),
    (2, "Shankha", 26),
    (3, "Tuhin", 27)
]

df = spark.createDataFrame(data)
df.show()

df.printSchema()


[Stage 9:>                                                          (0 + 3) / 3]

+---+--------+---+
| _1|      _2| _3|
+---+--------+---+
|  1|Dipankar| 27|
|  2| Shankha| 26|
|  3|   Tuhin| 27|
+---+--------+---+

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)



                                                                                

In [10]:
df = df.toDF("id", "name", "age")
df.show()


[Stage 11:>                                                         (0 + 3) / 3]

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|Dipankar| 27|
|  2| Shankha| 26|
|  3|   Tuhin| 27|
+---+--------+---+



                                                                                

In [11]:
data = [
    (1, "Dipankar", 27),
    (2, "Shankha", 26),
    (3, "Tuhin", 27)
]

df = spark.createDataFrame(data).toDF("id", "name", "age")
df.show()


[Stage 13:>                                                         (0 + 3) / 3]

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|Dipankar| 27|
|  2| Shankha| 26|
|  3|   Tuhin| 27|
+---+--------+---+



                                                                                

In [12]:
df.printSchema()


root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



#### Use createDataFrame() with Schema in PySpark

In [13]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

myschema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True)
    ]
)

df.printSchema()

data = [
    (1, "Dipankar", 27),
    (2, None, 26),
    (3, "Tuhin", None)
]

df = spark.createDataFrame(data, schema=myschema)
df.show()


root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



[Stage 15:>                                                         (0 + 3) / 3]

+---+--------+----+
| id|    name| age|
+---+--------+----+
|  1|Dipankar|  27|
|  2|    NULL|  26|
|  3|   Tuhin|NULL|
+---+--------+----+



                                                                                

#### show() func in PySpark

In [14]:
data = [
    (1, "Dipankar", 27),
    (2, "Shankha", 26),
    (3, "Tuhin, one of my highter secondary school friend, who is a cousin brother of my another secondary school friend", 27)
]

df = spark.createDataFrame(data).toDF("id", "name", "age")
df.show()


[Stage 17:>                                                         (0 + 3) / 3]

+---+--------------------+---+
| id|                name|age|
+---+--------------------+---+
|  1|            Dipankar| 27|
|  2|             Shankha| 26|
|  3|Tuhin, one of my ...| 27|
+---+--------------------+---+



                                                                                

In [15]:
df.show(2)


                                                                                

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|Dipankar| 27|
|  2| Shankha| 26|
+---+--------+---+
only showing top 2 rows



In [16]:
df.show(truncate=True)




+---+--------------------+---+
| id|                name|age|
+---+--------------------+---+
|  1|            Dipankar| 27|
|  2|             Shankha| 26|
|  3|Tuhin, one of my ...| 27|
+---+--------------------+---+



                                                                                

In [17]:
df.show(truncate=False)


[Stage 23:>                                                         (0 + 3) / 3]

+---+---------------------------------------------------------------------------------------------------------------+---+
|id |name                                                                                                           |age|
+---+---------------------------------------------------------------------------------------------------------------+---+
|1  |Dipankar                                                                                                       |27 |
|2  |Shankha                                                                                                        |26 |
|3  |Tuhin, one of my highter secondary school friend, who is a cousin brother of my another secondary school friend|27 |
+---+---------------------------------------------------------------------------------------------------------------+---+



                                                                                

In [18]:
df.show(vertical=True)


                                                                                

-RECORD 0--------------------
 id   | 1                    
 name | Dipankar             
 age  | 27                   
-RECORD 1--------------------
 id   | 2                    
 name | Shankha              
 age  | 26                   
-RECORD 2--------------------
 id   | 3                    
 name | Tuhin, one of my ... 
 age  | 27                   



                                                                                

In [19]:
df.show(truncate=10)


                                                                                

+---+----------+---+
| id|      name|age|
+---+----------+---+
|  1|  Dipankar| 27|
|  2|   Shankha| 26|
|  3|Tuhin, ...| 27|
+---+----------+---+



#### PySpark select() example

In [20]:
from pyspark.sql.functions import col, expr, when, lit


In [22]:
data = [
    (1, "Manta", 75000, "IT", 24),
    (2, "Dipankar", 30000, "Post Master", 27),
    (3, "Souvik", 60000, "Army Officer", 27),
    (4, "Soukarjya", 45000, "BDO", 26),
    (5, "Arvind", 35000, "Business Data Analyst", 28)
]


In [24]:
df = spark.createDataFrame(data, ["id", "name", "salary", "department", "age"])
df.show()




+---+---------+------+--------------------+---+
| id|     name|salary|          department|age|
+---+---------+------+--------------------+---+
|  1|    Manta| 75000|                  IT| 24|
|  2| Dipankar| 30000|         Post Master| 27|
|  3|   Souvik| 60000|        Army Officer| 27|
|  4|Soukarjya| 45000|                 BDO| 26|
|  5|   Arvind| 35000|Business Data Ana...| 28|
+---+---------+------+--------------------+---+



                                                                                

In [25]:
df.select("name", "salary").show()


[Stage 31:>                                                         (0 + 3) / 3]

+---------+------+
|     name|salary|
+---------+------+
|    Manta| 75000|
| Dipankar| 30000|
|   Souvik| 60000|
|Soukarjya| 45000|
|   Arvind| 35000|
+---------+------+



                                                                                

In [26]:
df.select(col("name"), col("department")).show()


[Stage 33:>                                                         (0 + 3) / 3]

+---------+--------------------+
|     name|          department|
+---------+--------------------+
|    Manta|                  IT|
| Dipankar|         Post Master|
|   Souvik|        Army Officer|
|Soukarjya|                 BDO|
|   Arvind|Business Data Ana...|
+---------+--------------------+



                                                                                

In [27]:
df.select(col("name").alias("Employee_Name"), col("salary").alias("Employee_Salary")).show()




+-------------+---------------+
|Employee_Name|Employee_Salary|
+-------------+---------------+
|        Manta|          75000|
|     Dipankar|          30000|
|       Souvik|          60000|
|    Soukarjya|          45000|
|       Arvind|          35000|
+-------------+---------------+



                                                                                

In [28]:
df.select(expr("name"), expr("salary * 1.10 as increased_salary")).show()


[Stage 37:>                                                         (0 + 3) / 3]

+---------+----------------+
|     name|increased_salary|
+---------+----------------+
|    Manta|        82500.00|
| Dipankar|        33000.00|
|   Souvik|        66000.00|
|Soukarjya|        49500.00|
|   Arvind|        38500.00|
+---------+----------------+



                                                                                

In [29]:
df.select("name", "salary", when(col("salary") > 50000, "high").otherwise("low").alias("Salary_Category")).show()




+---------+------+---------------+
|     name|salary|Salary_Category|
+---------+------+---------------+
|    Manta| 75000|           high|
| Dipankar| 30000|            low|
|   Souvik| 60000|           high|
|Soukarjya| 45000|            low|
|   Arvind| 35000|            low|
+---------+------+---------------+



                                                                                

In [31]:
df.selectExpr("name", "salary * 2 as double_salary").show()




+---------+-------------+
|     name|double_salary|
+---------+-------------+
|    Manta|       150000|
| Dipankar|        60000|
|   Souvik|       120000|
|Soukarjya|        90000|
|   Arvind|        70000|
+---------+-------------+



                                                                                

In [32]:
df.select("name", "department", lit("Active").alias("status")).show()




+---------+--------------------+------+
|     name|          department|status|
+---------+--------------------+------+
|    Manta|                  IT|Active|
| Dipankar|         Post Master|Active|
|   Souvik|        Army Officer|Active|
|Soukarjya|                 BDO|Active|
|   Arvind|Business Data Ana...|Active|
+---------+--------------------+------+



                                                                                

In [33]:
columns_to_select = ["name", "salary", "department", "age"]
df.select(*columns_to_select).show()


                                                                                

+---------+------+--------------------+---+
|     name|salary|          department|age|
+---------+------+--------------------+---+
|    Manta| 75000|                  IT| 24|
| Dipankar| 30000|         Post Master| 27|
|   Souvik| 60000|        Army Officer| 27|
|Soukarjya| 45000|                 BDO| 26|
|   Arvind| 35000|Business Data Ana...| 28|
+---------+------+--------------------+---+



In [34]:
df.select([col for col in df.columns if col != "age"]).show()


[Stage 47:>                                                         (0 + 3) / 3]

+---+---------+------+--------------------+
| id|     name|salary|          department|
+---+---------+------+--------------------+
|  1|    Manta| 75000|                  IT|
|  2| Dipankar| 30000|         Post Master|
|  3|   Souvik| 60000|        Army Officer|
|  4|Soukarjya| 45000|                 BDO|
|  5|   Arvind| 35000|Business Data Ana...|
+---+---------+------+--------------------+



                                                                                