In [1]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q htt  https://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os

# Both variables are set before findspark.init() is called so that the JDK and
# the unpacked Spark distribution are the ones in effect when Spark starts.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-2.3.1-bin-hadoop2.7",
)

import findspark

findspark.init()

from pyspark.sql import SparkSession

# Reuse an existing session if one is already running, otherwise start one.
spark = SparkSession.builder.getOrCreate()
# Bare last expression: the notebook displays the session summary.
spark

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Connecting to security.u0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Connecting to security.u0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)                                                                               Hit:4 https://developer.download.nvidia.com/comp

# **Read the 3 different files**

In [2]:
# Load the three input documents. multiLine=True lets each JSON record span
# several lines instead of requiring one record per line.
df1, df2, df3 = (
    spark.read.json(path, multiLine=True)
    for path in ("/content/1.json", "/content/2.json", "/content/3.json")
)

# Show the inferred (still nested) schema of each file.
df1.printSchema()
df2.printSchema()
df3.printSchema()

root
 |-- A: struct (nullable = true)
 |    |-- B: long (nullable = true)

root
 |-- A: struct (nullable = true)
 |    |-- B: long (nullable = true)
 |-- C: long (nullable = true)

root
 |-- A: struct (nullable = true)
 |    |-- B: long (nullable = true)
 |    |-- D: long (nullable = true)
 |-- C: long (nullable = true)



# Importing the functions 

In [3]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [4]:
def read_nested_json(df):
    """Unnest one level of array and struct columns.

    Array columns are replaced by their exploded elements (one output row per
    element). Struct columns are replaced by one column per child field, named
    ``<parent>_<child>``. Every other column is kept as-is.

    Args:
        df: DataFrame whose top-level columns should be unnested by one level.

    Returns:
        A DataFrame containing the unnested columns.
    """
    projections = []
    for field in df.schema.fields:
        name, dtype = field.name, field.dataType
        if isinstance(dtype, ArrayType):
            # Exploding replaces the column in place, so it is selected by name.
            df = df.withColumn(name, explode(name))
            projections.append(name)
        elif isinstance(dtype, StructType):
            # Promote each child field to a top-level column.
            projections.extend(
                col(name + "." + child.name).alias(name + "_" + child.name)
                for child in dtype.fields
            )
        else:
            projections.append(name)
    return df.select(projections)

In [5]:
def flatten(df):
  """Repeatedly unnest ``df`` until no array or struct column remains.

  ``read_nested_json`` is always applied at least once; it is applied again
  for as long as the resulting schema still contains an ArrayType or
  StructType column.

  Args:
      df: DataFrame to flatten.

  Returns:
      The DataFrame produced by the last unnesting pass.
  """
  while True:
    df = read_nested_json(df)
    still_nested = any(
      isinstance(field.dataType, (ArrayType, StructType))
      for field in df.schema.fields
    )
    if not still_nested:
      return df

# **Call the function to flatten the files**

In [6]:
# Flatten each input in turn and display the result right after it is built.
df1 = flatten(df1)
df1.show()

df2 = flatten(df2)
df2.show()

df3 = flatten(df3)
df3.show()

+---+
|A_B|
+---+
|  1|
+---+

+---+---+
|A_B|  C|
+---+---+
|  1|  2|
+---+---+

+---+---+---+
|A_B|A_D|  C|
+---+---+---+
|  1|  3|  2|
+---+---+---+



In [7]:
# Accumulator starting at 0; harmonize_schemas adds 1 per call and uses the
# running value as the file number.
monitor = spark.sparkContext.accumulator(0)

# **Create a directory to store the files that do not follow the standard schema**

In [8]:
# Create the storage directory from Python rather than through the IPython
# `mkdir` automagic; exist_ok=True makes the cell safe to re-run when the
# directory already exists.
os.makedirs('/content/Corrupted Files Storage', exist_ok=True)

# Harmonize the Schema 

In [9]:
def harmonize_schemas(standard_schema, source_file):
    """Reshape ``source_file`` so that its columns match ``standard_schema``.

    Columns missing from the source are added as typed nulls, columns present
    with a different type are cast to the standard type, and the result is
    projected onto the standard column names in standard order (columns that
    are not part of the standard are dropped). Every difference is reported
    with ``print``. When the source schema is not identical to the standard
    one, the harmonized frame is also written as CSV under
    '/content/Corrupted Files Storage/'.

    Args:
        standard_schema: iterable of fields exposing ``name``, ``dataType``
            and ``nullable`` (for example a DataFrame's ``schema``).
        source_file: DataFrame to harmonize.

    Returns:
        The harmonized DataFrame.

    Side effects:
        Adds 1 to the module-level ``monitor`` accumulator on every call and
        uses its value as the file number in messages and in the CSV name.
    """
    standard_fields = list(standard_schema)
    source_fields = list(source_file.schema)

    left_types = {f.name: f.dataType for f in standard_fields}
    right_types = {f.name: f.dataType for f in source_fields}
    left_fields = set((f.name, f.dataType, f.nullable) for f in standard_fields)
    right_fields = set((f.name, f.dataType, f.nullable) for f in source_fields)

    monitor.add(1)
    file_number = monitor.value

    # Walk the standard schema in declaration order so the messages come out
    # in a deterministic order (iterating a set difference does not).
    for field in standard_fields:
        l_name, l_type = field.name, field.dataType
        if (l_name, l_type, field.nullable) in right_fields:
            continue  # identical field already present in the source

        if l_name in right_types:
            r_type = right_types[l_name]
            if l_type != r_type:
                # Column exists but with another type: keep the data, cast it.
                source_file = source_file.withColumn(l_name, source_file[l_name].cast(l_type))
                print("For the File-->",file_number,",the type-mismatched columns w.r.t. standard_schema :",(l_name,r_type,l_type))
            # A nullable-only difference needs no data change. Assigning to
            # `source_file.schema[...].nullable` would only alter a Python-side
            # schema object that the next transformation discards.
        else:
            # Column is genuinely absent: add it as a typed null column.
            source_file = source_file.withColumn(l_name, lit(None).cast(l_type))
            print("For the File-->",file_number,",the missing columns w.r.t. standard_schema :",(l_name,l_type))

    # Keep only the standard columns, in standard order.
    source_file = source_file.select([f.name for f in standard_fields])

    # Report source columns that have no counterpart in the standard schema.
    for field in source_fields:
        if field.name not in left_types:
            print("For the File-->",file_number,",the extra columns w.r.t. standard_schema :",(field.name,field.dataType))

    if right_fields == left_fields:
        print("For the File-->",file_number,",the schema is exactly matched with standard_schema")
    else:
        # Non-conforming input: persist the harmonized rows for inspection.
        source_file.coalesce(1).write.mode('overwrite').option('header','true').csv('/content/Corrupted Files Storage/'+str(file_number)+'.csv')
    return source_file

# **Define the standard schema**

In [10]:
# The flattened schema of the third file is used as the reference schema.
standard_schema = df3.schema

# Harmonize each file to the standard schema and union the results

In [11]:
# Bring the first two files in line with the reference schema.
df1 = harmonize_schemas(standard_schema, df1)
df2 = harmonize_schemas(standard_schema, df2)

# Stack all three frames, matching columns by name rather than by position.
df1.unionByName(df2).unionByName(df3).show()

For the File--> 1 ,the missing columns w.r.t. standard_schema : ('C', LongType)
For the File--> 1 ,the missing columns w.r.t. standard_schema : ('A_D', LongType)
For the File--> 2 ,the missing columns w.r.t. standard_schema : ('A_D', LongType)
+---+----+----+
|A_B| A_D|   C|
+---+----+----+
|  1|null|null|
|  1|null|   2|
|  1|   3|   2|
+---+----+----+

