Merge 2 dataframes with uneven columns

In [19]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Merge').getOrCreate()

In [20]:
from pyspark.sql.functions import lit

input1DF = spark.read.format('csv').option('header',True).load('sample_data\input1.csv')
input1DF.show()
input2DF = spark.read.format('csv').option('header',True).load('sample_data\input2.csv')
input2DF.show()

input1AddDF = input1DF.withColumn('Gender',lit(None))
input1AddDF.show()

result = input1AddDF.union(input2DF)
result.show()

+--------+---+
|    Name|Age|
+--------+---+
| Monisha| 23|
|  Arvind| 24|
|Rishitha| 24|
|  Anusha| 24|
| Gayatri| 25|
+--------+---+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|     F|
|  Arvind| 24|     M|
|Rishitha| 24|     F|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|  null|
|  Arvind| 24|  null|
|Rishitha| 24|  null|
|  Anusha| 24|  null|
| Gayatri| 25|  null|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|  null|
|  Arvind| 24|  null|
|Rishitha| 24|  null|
|  Anusha| 24|  null|
| Gayatri| 25|  null|
| Monisha| 23|     F|
|  Arvind| 24|     M|
|Rishitha| 24|     F|
+--------+---+------+



Another method

In [21]:
from pyspark.sql.types import *

schema = StructType(
    [
    StructField('Name',StringType(),True), # 3rd option is nullable is true or not
    StructField('Age',IntegerType(),True),
    StructField('Gender',StringType(),True),
    ]
)

input1DF = spark.read.format('csv').option('header',True).schema(schema).load('sample_data\input1.csv')
input1DF.show()
input2DF = spark.read.option('header',True).csv('sample_data\input2.csv',schema=schema)
input2DF.show()

result = input1DF.union(input2DF)
result.show()



+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|  null|
|  Arvind| 24|  null|
|Rishitha| 24|  null|
|  Anusha| 24|  null|
| Gayatri| 25|  null|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|     F|
|  Arvind| 24|     M|
|Rishitha| 24|     F|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|  null|
|  Arvind| 24|  null|
|Rishitha| 24|  null|
|  Anusha| 24|  null|
| Gayatri| 25|  null|
| Monisha| 23|     F|
|  Arvind| 24|     M|
|Rishitha| 24|     F|
+--------+---+------+



Another method

In [24]:
input1DF = spark.read.format('csv').option('header',True).load('sample_data\input1.csv')
input1DF.show()
input2DF = spark.read.format('csv').option('header',True).load('sample_data\input2.csv')
input2DF.show()

result = input1DF.join(input2DF,on=['Name','Age'],how='outer')
result.show()

result = input1DF.join(input2DF,[input1DF.Name==input2DF.Name,input1DF.Age==input2DF.Age],how='outer').select(input1DF.Name,input1DF.Age,input2DF.Gender)
result.show()


+--------+---+
|    Name|Age|
+--------+---+
| Monisha| 23|
|  Arvind| 24|
|Rishitha| 24|
|  Anusha| 24|
| Gayatri| 25|
+--------+---+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|     F|
|  Arvind| 24|     M|
|Rishitha| 24|     F|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
|  Anusha| 24|  null|
|  Arvind| 24|     M|
| Gayatri| 25|  null|
| Monisha| 23|     F|
|Rishitha| 24|     F|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
|  Anusha| 24|  null|
|  Arvind| 24|     M|
| Gayatri| 25|  null|
| Monisha| 23|     F|
|Rishitha| 24|     F|
+--------+---+------+



Best approach - Automated

In [25]:
input1DF = spark.read.format('csv').option('header',True).load('sample_data\input1.csv')
input1DF.show()
input2DF = spark.read.format('csv').option('header',True).load('sample_data\input2.csv')
input2DF.show()

listA = set(input1DF.columns)-set(input2DF.columns)
listB = set(input2DF.columns)-set(input1DF.columns)

for i in listA:
    input2DF = input2DF.withColumn(i,lit(None))

for i in listB:
    input1DF = input1DF.withColumn(i,lit(None))
    
resut = input1DF.union(input2DF)

result.show()

+--------+---+
|    Name|Age|
+--------+---+
| Monisha| 23|
|  Arvind| 24|
|Rishitha| 24|
|  Anusha| 24|
| Gayatri| 25|
+--------+---+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
| Monisha| 23|     F|
|  Arvind| 24|     M|
|Rishitha| 24|     F|
+--------+---+------+

+--------+---+------+
|    Name|Age|Gender|
+--------+---+------+
|  Anusha| 24|  null|
|  Arvind| 24|     M|
| Gayatri| 25|  null|
| Monisha| 23|     F|
|Rishitha| 24|     F|
+--------+---+------+



Apply line break every 5th occurance from | delimited input.txt

In [49]:
from pyspark.sql.functions import regexp_replace,explode,split

input = spark.read.csv('sample_data\input.txt')

input.show(truncate=False)

input = input.withColumn("chk",regexp_replace("_c0","(.*?\\|){4}","$0-"))

input.show(truncate=False)

input = input.withColumn('col_explode',explode(split('chk','\|-')))

input.select(input.col_explode).show()

result = input.select(input.col_explode)

result.show()




+--------------------------------------------------------------------------------------------------------+
|_c0                                                                                                     |
+--------------------------------------------------------------------------------------------------------+
|A|BE|1|25|B|BSC|2|27|A|BE|1|25|B|BSC|2|27|A|BE|1|25|B|BSC|2|27|A|BE|1|25|B|BSC|2|27|A|BE|1|25|B|BSC|2|27|
+--------------------------------------------------------------------------------------------------------+

+--------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+
|_c0                                                                                                     |chk                                                                                                              |
+---------------------

In [55]:
result.select(split('col_explode','\|')).show()

result.rdd.map(lambda i:i).collect()




+--------------------------+
|split(col_explode, \|, -1)|
+--------------------------+
|            [A, BE, 1, 25]|
|           [B, BSC, 2, 27]|
|            [A, BE, 1, 25]|
|           [B, BSC, 2, 27]|
|            [A, BE, 1, 25]|
|           [B, BSC, 2, 27]|
|            [A, BE, 1, 25]|
|           [B, BSC, 2, 27]|
|            [A, BE, 1, 25]|
|           [B, BSC, 2, 27]|
+--------------------------+



[Row(col_explode='A|BE|1|25'),
 Row(col_explode='B|BSC|2|27'),
 Row(col_explode='A|BE|1|25'),
 Row(col_explode='B|BSC|2|27'),
 Row(col_explode='A|BE|1|25'),
 Row(col_explode='B|BSC|2|27'),
 Row(col_explode='A|BE|1|25'),
 Row(col_explode='B|BSC|2|27'),
 Row(col_explode='A|BE|1|25'),
 Row(col_explode='B|BSC|2|27')]

In [None]:
result.rdd.map(lambda i:len(i)).collect()


In [None]:
result_rdd = result.rdd.map(lambda i:i[0].split('|'))

result = result_rdd.toDF(['Name','Qualification','S.no','Age']).show()