In [1]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session used by every cell below, registered under the
# application name 'Test'.
spark = (
    SparkSession.builder
    .appName('Test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/15 11:15:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# In-memory sample rows, one tuple per employee. Field order:
# (id, first name, last name, salary, nationality, phone number, ssn).
# Note that the nationality values are stored with mixed letter casing.
employees = [
    (1, "Scott", "Tiger", 1000.0, "united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, "united KINGDOM", "+44 111 111 1111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, "AUSTRALIA", "+61 987 654 3210", "789 12 6118"),
]

In [4]:
# Turn the `employees` rows into a DataFrame, naming and typing the seven
# columns through a DDL-style schema string.
employeesDF = spark.createDataFrame(
    employees,
    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, nationality STRING,
                    phone_number STRING, ssn STRING""",
)

In [5]:
# Project only the id and name columns and print them in ascending id order.
(
    employeesDF
    .select("employee_id", "first_name", "last_name")
    .orderBy('employee_id')
    .show()
)

[Stage 0:>                                                          (0 + 4) / 4]

+-----------+----------+---------+
|employee_id|first_name|last_name|
+-----------+----------+---------+
|          1|     Scott|    Tiger|
|          2|     Henry|     Ford|
|          3|      Nick|   Junior|
|          4|      Bill|    Gomes|
+-----------+----------+---------+



                                                                                

In [6]:
# Count employees per nationality, grouping on the value exactly as stored.
(
    employeesDF
    .groupBy("nationality")
    .count()
    .show()
)

+--------------+-----+
|   nationality|count|
+--------------+-----+
| united states|    1|
|         India|    1|
|united KINGDOM|    1|
|     AUSTRALIA|    1|
+--------------+-----+



In [7]:
# Print every column, sorted by employee_id (orderBy on a column name sorts
# ascending).
(
    employeesDF
    .orderBy("employee_id")
    .show()
)

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [8]:
# Descending sort, referring to the column through attribute access on the
# DataFrame and calling .desc() on it.
(
    employeesDF
    .orderBy(employeesDF.employee_id.desc())
    .show()
)

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [9]:
# Same descending sort, this time building the column reference with col().
(
    employeesDF
    .orderBy(col('employee_id').desc())
    .show()
)

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [10]:
# Upper-case both name columns and expose them under the aliases
# 'FName' and 'LName'.
(
    employeesDF
    .select(
        upper(col('first_name')).alias('FName'),
        upper(col('last_name')).alias('LName'),
    )
    .show()
)

+-----+------+
|FName| LName|
+-----+------+
|SCOTT| TIGER|
|HENRY|  FORD|
| NICK|JUNIOR|
| BILL| GOMES|
+-----+------+



In [11]:
# Print the full employees DataFrame with show()'s default settings.
employeesDF.show()

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [12]:
# Count employees per nationality after upper-casing it, so the grouping key
# no longer depends on how each value was originally cased.
(
    employeesDF
    .groupBy(upper(col('nationality')))
    .count()
    .show()
)

+------------------+-----+
|upper(nationality)|count|
+------------------+-----+
|     UNITED STATES|    1|
|             INDIA|    1|
|    UNITED KINGDOM|    1|
|         AUSTRALIA|    1|
+------------------+-----+



In [17]:
# Derive a new DataFrame with an extra 'Description' column of the form
# "<FIRST NAME> <last name> of <Nationality>": upper-cased first name,
# lower-cased last name, and the nationality passed through initcap().
empDF = employeesDF.withColumn(
    'Description',
    concat(
        upper(col('first_name')),
        lit(' '),
        lower(col('last_name')),
        lit(' of '),
        initcap(col('nationality')),
    ),
)

In [18]:
# Print employeesDF again: the withColumn result was assigned to empDF, and
# employeesDF itself still shows only its original seven columns.
employeesDF.show()

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [19]:
# Print the derived DataFrame. With show()'s defaults, long string cells are
# cut off and end in "...", so most Description values appear truncated here.
empDF.show()

+-----------+----------+---------+------+--------------+----------------+-----------+--------------------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|         Description|
+-----------+----------+---------+------+--------------+----------------+-----------+--------------------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|SCOTT tiger of Un...|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123| HENRY ford of India|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|NICK junior of Un...|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|BILL gomes of Aus...|
+-----------+----------+---------+------+--------------+----------------+-----------+--------------------+



In [27]:
# Print only the Description column with truncation disabled so the complete
# strings are visible.
(
    empDF
    .select('Description')
    .show(truncate=False)
)

+-----------------------------+
|Description                  |
+-----------------------------+
|SCOTT tiger of United States |
|HENRY ford of India          |
|NICK junior of United Kingdom|
|BILL gomes of Australia      |
+-----------------------------+

