In [0]:
import requests

# Download a sample of drug-approval records from the openFDA API.
url = "https://api.fda.gov/drug/drugsfda.json?limit=100"
# A timeout keeps the cell from hanging indefinitely if the API is unreachable.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error rather than later, when an error payload
# would be parsed as if it were data.
response.raise_for_status()

In [0]:
# Let's look into the data: show only the first 1000 bytes of the raw payload
# so the cell output stays readable instead of dumping the whole response.
response.content[:1000]

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Schema of one entry in a product's active_ingredients array:
# two nullable string fields.
ingredient_schema = (
    StructType()
    .add("name", StringType(), True)
    .add("strength", StringType(), True)
)

# Schema of one entry in the products array. Every field is a nullable string
# except active_ingredients, which is an array of ingredient structs.
product_schema = (
    StructType()
    .add("product_number", StringType(), True)
    .add("reference_drug", StringType(), True)
    .add("brand_name", StringType(), True)
    .add("active_ingredients", ArrayType(ingredient_schema), True)
    .add("reference_standard", StringType(), True)
    .add("dosage_form", StringType(), True)
    .add("route", StringType(), True)
    .add("marketing_status", StringType(), True)
    .add("te_code", StringType(), True)
)

# Top-level schema of one record: two nullable string fields plus the
# nested array of product structs.
schema = (
    StructType()
    .add("application_number", StringType(), True)
    .add("sponsor_name", StringType(), True)
    .add("products", ArrayType(product_schema), True)
)

# Take the "results" list from the decoded JSON payload and build the
# DataFrame from it using the explicit schema.
drug_records = response.json()["results"]
df_drugs = spark.createDataFrame(drug_records, schema=schema)
display(df_drugs)


1. `pyspark.sql.types`: this module provides classes to define the schema of DataFrames and to specify the data types of columns. These types are used when creating DataFrames or defining a schema explicitly.
Atomic Types
These represent single-value data types:
1. StringType
2. IntegerType
3. LongType
4. FloatType
5. BooleanType
6. DateType
7. TimestampType
8. BinaryType

Complex Types

These represent more structured data:
1. ArrayType: Represents an array of elements of a specific type.
* Example: ArrayType(StringType()) for an array of strings.

2. MapType: Represents a map (key-value pairs) with specific key and value types.

* Example: MapType(StringType(), IntegerType()) for a map with string keys and integer values.

3. StructType: Represents a structure (like a row) with multiple fields.
* Example:
StructType([
  StructField("name",StringType(),True),
  StructField("age",IntegerType(),True)
])

4. Decimal and Numeric Types
* Example: DecimalType(10, 2) for a decimal with up to 10 digits in total, 2 of them after the decimal point.

5. NullType: Represents null values (rarely used explicitly).

To use these types, first import them:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType, DecimalType

In [0]:
# Pull the columns of interest out of the nested `products` array.
# Dotted access on an array of structs (e.g. products.brand_name) yields one
# array of values per row. `active_ingredients` is itself an array inside each
# product, and dotted field access does not reach through two array levels,
# so the ingredient fields are extracted product by product with the SQL
# higher-order function `transform`. Aliases keep the short column names.
df_selected = df_drugs.selectExpr(
    "application_number",
    "sponsor_name",
    "products.product_number AS product_number",
    "products.brand_name AS brand_name",
    "transform(products.active_ingredients, ingredients -> ingredients.name) AS name",
    "transform(products.active_ingredients, ingredients -> ingredients.strength) AS strength",
    "products.dosage_form AS dosage_form",
    "products.route AS route",
    "products.marketing_status AS marketing_status",
    "products.te_code AS te_code",
)

In [0]:
# Preview the first 5 rows. The row cap is applied with limit(), because the
# display helper is not given a row count; this also matches the
# display(<dataframe>) form used in the other cells.
display(df_selected.limit(5))

In [0]:
# Read the line-delimited JSON file (presumably one JSON object per line,
# going by the file name).
# The option key was previously misspelled "inferShema"; it is now spelled
# "inferSchema", consistent with the other reader cells.
# NOTE(review): the JSON reader appears to infer the schema by default when no
# schema is supplied, so this option is likely informational only. Confirm.
jsonline_df = (
    spark.read.format("json")
    .option("inferSchema", "true")
    .option("mode", "PERMISSIVE")
    .option("nullValue", "")
    .load("/Volumes/workspace/default/json/Untitledline_delimited.json")
)


display(jsonline_df)

In [0]:
# Discard every row that has a null in any column ("any" is the default rule).
jsonline_df = jsonline_df.dropna(how="any")

In [0]:
# Render jsonline_df again to inspect it after the preceding drop step.
display(jsonline_df)

In [0]:
# Read the single-file JSON input.
# The option key was previously misspelled "inferShema"; it is now spelled
# "inferSchema", consistent with the other reader cells.
# NOTE(review): the JSON reader appears to infer the schema by default when no
# schema is supplied, so this option is likely informational only. Confirm.
jsonSingleline_df = (
    spark.read.format("json")
    .option("inferSchema", "true")
    .option("mode", "PERMISSIVE")
    .load("/Volumes/workspace/default/json/single_file_json.json")
)

# Drop only the rows in which every column is null.
jsonSingleline_df = jsonSingleline_df.na.drop(how="all")
display(jsonSingleline_df)

In [0]:
# Load the JSON file named "corrupted.json" with the reader in PERMISSIVE mode.
# Alternative that was being tried: set the "mode" option to "DROPMALFORMED".
jsoncorruptedfile_df = (
    spark.read.format("json")
    .option("inferSchema", "true")
    .option("mode", "PERMISSIVE")
    .load("/Volumes/workspace/default/json/corrupted.json")
)

display(jsoncorruptedfile_df)

In [0]:
# Load a JSON file with the "multiline" option enabled.
# presumably each record spans several lines in this file; verify against the data
jsonmultiline_df = (
    spark.read.format("json")
    .option("inferSchema", "true")
    .option("mode", "PERMISSIVE")
    .option("multiline", "true")
    .load("/Volumes/workspace/default/json/Multi_line_correct.json")
)


display(jsonmultiline_df)