In [0]:
dbutils.fs.put("/scenarios/dynamic_columns.csv","""id,name,loc,emailid,phone
1,ravi
2,ram,bangalore
3,prasad,chennai,sample@gmail.com,989766678""",True)

Wrote 92 bytes.
Out[1]: True

In [0]:
df = spark.read.csv("/scenarios/dynamic_columns.csv",header=True)
display(df)

id,name,loc,emailid,phone
1,ravi,,,
2,ram,bangalore,,
3,prasad,chennai,sample@gmail.com,989766678.0


In [0]:
dbutils.fs.put("/scenarios/dynamic_withoutcol.csv","""
1,ravi
2,ram,bangalore
3,prasad,chennai,sample@gmail.com,989766678""",True)

Wrote 67 bytes.
Out[3]: True

In [0]:
df1 = spark.read.csv("/scenarios/dynamic_withoutcol.csv")
display(df1)

_c0,_c1
1,ravi
2,ram
3,prasad


In [0]:
# in the above df without hearder,the df will read based on the first row,since first row had only 2 columns,df was created with only 2 columns. we need to handle this scenario

In [0]:
df1 = spark.read.text("/scenarios/dynamic_withoutcol.csv")
display(df1)

value
"1,ravi"
"2,ram,bangalore"
"3,prasad,chennai,sample@gmail.com,989766678"


In [0]:
# when you read it as text file, df will treat as single column and entire data is available.
# we can use split function 

In [0]:
from pyspark.sql.functions import split,col
df1 = df1.withColumn("splittable_col",split("value",",").alias("splittable_col")).drop("value")
display(df1)

splittable_col
List()
"List(1, ravi)"
"List(2, ram, bangalore)"
"List(3, prasad, chennai, sample@gmail.com, 989766678)"


In [0]:
# dynamically can create new columns, for that i need to know the lenght of the column. we can use size function


In [0]:
from pyspark.sql.functions import size
df1.select('splittable_col',size('splittable_col')).show(truncate=False)

+-------------------------------------------------+--------------------+
|splittable_col                                   |size(splittable_col)|
+-------------------------------------------------+--------------------+
|[]                                               |1                   |
|[1, ravi]                                        |2                   |
|[2, ram, bangalore]                              |3                   |
|[3, prasad, chennai, sample@gmail.com, 989766678]|5                   |
+-------------------------------------------------+--------------------+



In [0]:
# take max value from the size

from pyspark.sql.functions import max
df1.select(max(size('splittable_col'))).collect()[0][0]

Out[12]: 5

In [0]:
for i in range(df1.select(max(size('splittable_col'))).collect()[0][0]):
    df1 =df1.withColumn("col"+str(i),df1['splittable_col'][i])
    display(df1)

splittable_col,col0
List(),
"List(1, ravi)",1.0
"List(2, ram, bangalore)",2.0
"List(3, prasad, chennai, sample@gmail.com, 989766678)",3.0


splittable_col,col0,col1
List(),,
"List(1, ravi)",1.0,ravi
"List(2, ram, bangalore)",2.0,ram
"List(3, prasad, chennai, sample@gmail.com, 989766678)",3.0,prasad


splittable_col,col0,col1,col2
List(),,,
"List(1, ravi)",1.0,ravi,
"List(2, ram, bangalore)",2.0,ram,bangalore
"List(3, prasad, chennai, sample@gmail.com, 989766678)",3.0,prasad,chennai


splittable_col,col0,col1,col2,col3
List(),,,,
"List(1, ravi)",1.0,ravi,,
"List(2, ram, bangalore)",2.0,ram,bangalore,
"List(3, prasad, chennai, sample@gmail.com, 989766678)",3.0,prasad,chennai,sample@gmail.com


splittable_col,col0,col1,col2,col3,col4
List(),,,,,
"List(1, ravi)",1.0,ravi,,,
"List(2, ram, bangalore)",2.0,ram,bangalore,,
"List(3, prasad, chennai, sample@gmail.com, 989766678)",3.0,prasad,chennai,sample@gmail.com,989766678.0


In [0]:
final_df = df1.drop('splittable_col')
display(final_df)

col0,col1,col2,col3,col4
,,,,
1.0,ravi,,,
2.0,ram,bangalore,,
3.0,prasad,chennai,sample@gmail.com,989766678.0
