In [0]:
employee_list = [(1,"bob","jackson",1500.0,"AUSTRALIA","+61 9890989789","345 23 5645"),
            (2,"hill","martin",750.0,"UK","+44 3465789709","390 45 7598")]

In [0]:
emp = spark.createDataFrame(employee_list, schema = """employee_id INT, first_name STRING, last_name STRING, salary FLOAT, nationality STRING,
                            phone_number STRING, ssn STRING """)

In [0]:
from pyspark.sql.functions import *

In [0]:
emp.show()

+-----------+----------+---------+------+-----------+--------------+-----------+
|employee_id|first_name|last_name|salary|nationality|  phone_number|        ssn|
+-----------+----------+---------+------+-----------+--------------+-----------+
|          1|       bob|  jackson|1500.0|  AUSTRALIA|+61 9890989789|345 23 5645|
|          2|      hill|   martin| 750.0|         UK|+44 3465789709|390 45 7598|
+-----------+----------+---------+------+-----------+--------------+-----------+



In [0]:
emp.withColumn("full_name", concat("first_name","last_name")).show()

+-----------+----------+---------+------+-----------+--------------+-----------+----------+
|employee_id|first_name|last_name|salary|nationality|  phone_number|        ssn| full_name|
+-----------+----------+---------+------+-----------+--------------+-----------+----------+
|          1|       bob|  jackson|1500.0|  AUSTRALIA|+61 9890989789|345 23 5645|bobjackson|
|          2|      hill|   martin| 750.0|         UK|+44 3465789709|390 45 7598|hillmartin|
+-----------+----------+---------+------+-----------+--------------+-----------+----------+



In [0]:
help(concat_ws)

Help on function concat_ws in module pyspark.sql.functions:

concat_ws(sep: str, *cols: 'ColumnOrName') -> pyspark.sql.column.Column
    Concatenates multiple input string columns together into a single string column,
    using the given separator.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
    >>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect()
    [Row(s='abcd-123')]



In [0]:
emp.withColumn("full_name", concat_ws("-","first_name","last_name")).show()

+-----------+----------+---------+------+-----------+--------------+-----------+-----------+
|employee_id|first_name|last_name|salary|nationality|  phone_number|        ssn|  full_name|
+-----------+----------+---------+------+-----------+--------------+-----------+-----------+
|          1|       bob|  jackson|1500.0|  AUSTRALIA|+61 9890989789|345 23 5645|bob-jackson|
|          2|      hill|   martin| 750.0|         UK|+44 3465789709|390 45 7598|hill-martin|
+-----------+----------+---------+------+-----------+--------------+-----------+-----------+



In [0]:
emp.withColumn("full_name", concat_ws(lit(", "),"first_name","last_name")).show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-3025750883095284>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m [0memp[0m[0;34m.[0m[0mwithColumn[0m[0;34m([0m[0;34m"full_name"[0m[0;34m,[0m [0mconcat_ws[0m[0;34m([0m[0mlit[0m[0;34m([0m[0;34m", "[0m[0;34m)[0m[0;34m,[0m[0;34m"first_name"[0m[0;34m,[0m[0;34m"last_name"[0m[0;34m)[0m[0;34m)[0m[0;34m.[0m[0mshow[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m/databricks/spark/python/pyspark/sql/functions.py[0m in [0;36mconcat_ws[0;34m(sep, *cols)[0m
[1;32m   2974[0m     [0msc[0m [0;34m=[0m [0mSparkContext[0m[0;34m.[0m[0m_active_spark_context[0m[0;34m[0m[0;34m[0m[0m
[1;32m   2975[0m     [0;32massert[0m [0msc[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m [0;32mand[0m [0msc[0m[0;34m.[0m[0m_jvm[0m [

In [0]:
address = [
    {
        "id": 10,
        "address": "9 debs parkway",
        "city": "new york city",
        "state": "new york",
        "country": "US",
        "postal_code": "10090"
    },
    {
        "id": 11,
        "address": "3645 dayton hill",
        "city": "newton",
        "state": "massachusetts",
        "country": "US",
        "postal_code": "02162"
    },
    {
        "id": 12,
        "address": "1551 6th plaza",
        "city": "modesto",
        "state": "california",
        "country": "US",
        "postal_code": "95354"
    },
    {
        "id": 13,
        "address": "7849 ohio drive",
        "city": "springfield",
        "state": "missouri",
        "country": "US",
        "postal_code": "10120"
    }
]

In [0]:
addressdf = spark.createDataFrame(address)

In [0]:
addressdf.show()

+----------------+-------------+-------+---+-----------+-------------+
|         address|         city|country| id|postal_code|        state|
+----------------+-------------+-------+---+-----------+-------------+
|  9 debs parkway|new york city|     US| 10|      10090|     new york|
|3645 dayton hill|       newton|     US| 11|      02162|massachusetts|
|  1551 6th plaza|      modesto|     US| 12|      95354|   california|
| 7849 ohio drive|  springfield|     US| 13|      10120|     missouri|
+----------------+-------------+-------+---+-----------+-------------+



In [0]:
addressdf.select('id', concat_ws(', ', 'address','city','state','country','postal_code').alias('full_address')).show(truncate = False)

+---+--------------------------------------------------+
|id |full_address                                      |
+---+--------------------------------------------------+
|10 |9 debs parkway, new york city, new york, US, 10090|
|11 |3645 dayton hill, newton, massachusetts, US, 02162|
|12 |1551 6th plaza, modesto, california, US, 95354    |
|13 |7849 ohio drive, springfield, missouri, US, 10120 |
+---+--------------------------------------------------+



In [0]:
from pyspark.sql.functions import *

In [0]:
help(initcap)

Help on function initcap in module pyspark.sql.functions:

initcap(col: 'ColumnOrName') -> pyspark.sql.column.Column
    Translate the first letter of each word to upper case in the sentence.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> spark.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect()
    [Row(v='Ab Cd')]



In [0]:
help(length)

Help on function length in module pyspark.sql.functions:

length(col: 'ColumnOrName') -> pyspark.sql.column.Column
    Computes the character length of string data or number of bytes of binary data.
    The length of character data includes the trailing spaces. The length of binary data
    includes binary zeros.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> spark.createDataFrame([('ABC ',)], ['a']).select(length('a').alias('length')).collect()
    [Row(length=4)]



In [0]:
addressdf.select('id','state'). \
    withColumn("state_upper", upper('state')).\
    withColumn("state_lower", lower('state')). \
    withColumn("state_initcap", initcap("state")).\
    withColumn("state_length", length("state")).show()

+---+-------------+-------------+-------------+-------------+------------+
| id|        state|  state_upper|  state_lower|state_initcap|state_length|
+---+-------------+-------------+-------------+-------------+------------+
| 10|     new york|     NEW YORK|     new york|     New York|           8|
| 11|massachusetts|MASSACHUSETTS|massachusetts|Massachusetts|          13|
| 12|   california|   CALIFORNIA|   california|   California|          10|
| 13|     missouri|     MISSOURI|     missouri|     Missouri|           8|
+---+-------------+-------------+-------------+-------------+------------+

