# Column Concatenation in PySpark: -
##### Use the concat() function to concatenate two or more columns
### we have at least four possibilities 
##### 1. concatenate two columns without space
##### 2. concatenate columns with single space
##### 3. concatenate columns with ("-")
##### 4. concatenate numerical and categorical columns


In [1]:
# Environment bootstrap. findspark.init() has to run before the pyspark
# imports below; it is what makes the local Spark installation importable.
import findspark
findspark.init()

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# Single-node ("local") Spark application, named after what this notebook
# actually demonstrates so it is identifiable in the Spark UI and logs.
conf = SparkConf().setAppName("Column Concatenation in PySpark").setMaster("local")
sc = SparkContext(conf=conf)

# Both entry points wrap the same SparkContext; the later cells load the
# CSV through `sql_c`.
spark = SparkSession(sc)
sql_c = SQLContext(sc)

In [25]:
# Load the transactions file, taking column names from the first CSV row.
csv_reader = sql_c.read
df = csv_reader.csv('transaction.csv', header=True)

# Preview the first three rows.
df.show(3)

+---------+-----------+--------+----------+---------+
|     Date|Description|Deposits|Withdrawls|  Balance|
+---------+-----------+--------+----------+---------+
|21-Aug-20|   Reversal|       0| 10,612.64|53,063.19|
|21-Aug-20| Commission|       0| 26,531.60|26,531.60|
|21-Aug-20| Debit Card|       0|  8,843.87|17,687.73|
+---------+-----------+--------+----------+---------+
only showing top 3 rows



In [26]:
# Possibility 2: join two columns with a single space between them.
from pyspark.sql.functions import concat, lit, col

# Build the concatenation expression first: Deposits + " " + Balance.
# NOTE(review): concat() is documented to yield null when any input is
# null -- confirm the data has no nulls in these columns.
deposits_and_balance = concat(col("Deposits"), lit(" "), col("Balance"))

# Keep every original column and append the combined one under its alias.
df1 = df.select("*", deposits_and_balance.alias("Deposits verses Balance"))
df1.show(5)

+---------+-----------+---------+----------+---------+-----------------------+
|     Date|Description| Deposits|Withdrawls|  Balance|Deposits verses Balance|
+---------+-----------+---------+----------+---------+-----------------------+
|21-Aug-20|   Reversal|        0| 10,612.64|53,063.19|            0 53,063.19|
|21-Aug-20| Commission|        0| 26,531.60|26,531.60|            0 26,531.60|
|21-Aug-20| Debit Card|        0|  8,843.87|17,687.73|            0 17,687.73|
|21-Aug-20|       Cash|23,475.67|         0|41,163.40|    23,475.67 41,163.40|
|21-Aug-20|   Interest|        0|  5,145.43|36,017.98|            0 36,017.98|
+---------+-----------+---------+----------+---------+-----------------------+
only showing top 5 rows



In [27]:
# Same single-space concatenation, this time via the `functions as F` alias and withColumn()
from pyspark.sql import functions as F

In [28]:
# Description and Deposits separated by one space, e.g. "Cash 23,475.67".
description_and_deposits = F.concat(
    F.col('Description'),
    F.lit(' '),
    F.col('Deposits'),
)

# withColumn() appends the expression as a new column named 'joined_column'.
df2 = df.withColumn('joined_column', description_and_deposits)
df2.show(5)

+---------+-----------+---------+----------+---------+--------------+
|     Date|Description| Deposits|Withdrawls|  Balance| joined_column|
+---------+-----------+---------+----------+---------+--------------+
|21-Aug-20|   Reversal|        0| 10,612.64|53,063.19|    Reversal 0|
|21-Aug-20| Commission|        0| 26,531.60|26,531.60|  Commission 0|
|21-Aug-20| Debit Card|        0|  8,843.87|17,687.73|  Debit Card 0|
|21-Aug-20|       Cash|23,475.67|         0|41,163.40|Cash 23,475.67|
|21-Aug-20|   Interest|        0|  5,145.43|36,017.98|    Interest 0|
+---------+-----------+---------+----------+---------+--------------+
only showing top 5 rows



In [29]:
# Possibility 1: join two columns directly, with no separator between them.
description_balance = concat(col("Description"), col("Balance"))

# Append the combined column to all existing columns and preview five rows.
df3 = df.select("*", description_balance.alias("Description_Balance"))
df3.show(5)


+---------+-----------+---------+----------+---------+-------------------+
|     Date|Description| Deposits|Withdrawls|  Balance|Description_Balance|
+---------+-----------+---------+----------+---------+-------------------+
|21-Aug-20|   Reversal|        0| 10,612.64|53,063.19|  Reversal53,063.19|
|21-Aug-20| Commission|        0| 26,531.60|26,531.60|Commission26,531.60|
|21-Aug-20| Debit Card|        0|  8,843.87|17,687.73|Debit Card17,687.73|
|21-Aug-20|       Cash|23,475.67|         0|41,163.40|      Cash41,163.40|
|21-Aug-20|   Interest|        0|  5,145.43|36,017.98|  Interest36,017.98|
+---------+-----------+---------+----------+---------+-------------------+
only showing top 5 rows



In [31]:
# Possibility 3: join two columns with a literal hyphen as the separator.
hyphen = lit("-")
description_deposits = concat(col("Description"), hyphen, col("Deposits"))

# Append the hyphen-joined column to all existing columns and preview five rows.
df4 = df.select("*", description_deposits.alias("Description-Deposits"))
df4.show(5)

+---------+-----------+---------+----------+---------+--------------------+
|     Date|Description| Deposits|Withdrawls|  Balance|Description-Deposits|
+---------+-----------+---------+----------+---------+--------------------+
|21-Aug-20|   Reversal|        0| 10,612.64|53,063.19|          Reversal-0|
|21-Aug-20| Commission|        0| 26,531.60|26,531.60|        Commission-0|
|21-Aug-20| Debit Card|        0|  8,843.87|17,687.73|        Debit Card-0|
|21-Aug-20|       Cash|23,475.67|         0|41,163.40|      Cash-23,475.67|
|21-Aug-20|   Interest|        0|  5,145.43|36,017.98|          Interest-0|
+---------+-----------+---------+----------+---------+--------------------+
only showing top 5 rows

