https://github.com/jdorfman/awesome-json-datasets

In [0]:
import requests

# Endpoint that returns the latest exchange rates with USD as the base currency.
repo_url = "https://api.exchangerate-api.com/v4/latest/USD"

# A timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON-decoding failure further down.
response = requests.get(repo_url, timeout=30)
response.raise_for_status()

# Decode the body once and reuse it for both the preview and the later cells.
json_data = response.json()

print(json_data)  # Raw payload; `rates` maps currency codes to exchange rates




In [0]:
# Imported in this cell so the schema definition does not depend on imports
# that only happen in cells further down the notebook.
from pyspark.sql.types import MapType, StringType, StructField, StructType

# Explicit schema for the exchange-rate payload; every field is nullable.
# NOTE(review): `time_last_updated` and the `rates` values are declared as
# strings even though the payload shows them as numbers; the rates are cast to
# double in a later cell. Presumably this sidesteps mixed int/float values
# failing schema verification - TODO confirm before tightening the types.
json_schema = StructType([
    StructField("provider", StringType(), True),
    StructField("terms", StringType(), True),
    StructField("base", StringType(), True),
    StructField("date", StringType(), True),
    StructField("time_last_updated", StringType(), True),
    StructField("rates", MapType(StringType(), StringType()), True)  # currency code -> rate (as string)
])

In [0]:
# The payload is a single JSON object, so it is wrapped in a list to become
# a one-row DataFrame shaped by `json_schema`.
df = spark.createDataFrame(data=[json_data], schema=json_schema)

df.show()

+--------------------+--------------------+----+----------+-----------------+--------------------+
|            provider|               terms|base|      date|time_last_updated|               rates|
+--------------------+--------------------+----+----------+-----------------+--------------------+
|https://www.excha...|https://www.excha...| USD|2025-04-05|       1743811201|{FJD -> 2.32, MXN...|
+--------------------+--------------------+----+----------+-----------------+--------------------+



In [0]:
from pyspark.sql.functions import col, explode
from pyspark.sql.types import DoubleType  # no longer needed by this cell; import kept from the original

# Convert the rate values from string to double once, on the map column itself.
df_corrected = df.withColumn("rates", col("rates").cast("map<string,double>"))
df_corrected.show()

# Explode the already-typed map: one row per (currency, rate) entry. Reading
# from `df_corrected` (not the untyped `df`) means `rate` is a double straight
# away, so no second cast is required after the explode.
df_exploded = df_corrected.select(explode(col("rates")).alias("currency", "rate"))
df_exploded.show()


+--------------------+--------------------+----+----------+-----------------+--------------------+
|            provider|               terms|base|      date|time_last_updated|               rates|
+--------------------+--------------------+----+----------+-----------------+--------------------+
|https://www.excha...|https://www.excha...| USD|2025-04-05|       1743811201|{FJD -> 2.32, MXN...|
+--------------------+--------------------+----+----------+-----------------+--------------------+

+--------+--------+
|currency|    rate|
+--------+--------+
|     FJD|    2.32|
|     MXN|   20.42|
|     TVD|    1.65|
|     SCR|   14.79|
|     CDF| 2874.56|
|     GTQ|     7.7|
|     BBD|     2.0|
|     CLP|  950.17|
|     UGX| 3641.41|
|     HNL|   25.56|
|     ZAR|   19.08|
|     TND|    3.07|
|     STN|    22.3|
|     SLE|   22.72|
|     SLL|22718.05|
|     BSD|     1.0|
|     SDG|  510.43|
|     IQD| 1309.89|
|     GMD|   72.64|
|     CUP|    24.0|
+--------+--------+
only showing top 20 rows

In [0]:
# Check the column types of the exploded frame (`rate` should be a double).
df_exploded.printSchema()

root
 |-- currency: string (nullable = false)
 |-- rate: double (nullable = true)



In [0]:
# Display the exploded currency/rate rows again (show() prints the first 20 rows).
df_exploded.show()

+--------+--------+
|currency|    rate|
+--------+--------+
|     FJD|    2.32|
|     MXN|   20.42|
|     TVD|    1.65|
|     SCR|   14.79|
|     CDF| 2874.56|
|     GTQ|     7.7|
|     BBD|     2.0|
|     CLP|  950.17|
|     UGX| 3641.41|
|     HNL|   25.56|
|     ZAR|   19.08|
|     TND|    3.07|
|     STN|    22.3|
|     SLE|   22.72|
|     SLL|22718.05|
|     BSD|     1.0|
|     SDG|  510.43|
|     IQD| 1309.89|
|     GMD|   72.64|
|     CUP|    24.0|
+--------+--------+
only showing top 20 rows



In [0]:
import requests

# Scottish Parliament open-data endpoint; returns a JSON list of department
# records with `Id`, `DirectorateID` and `Name` keys.
repo_url = "https://data.parliament.scot/api/departments"

# Fail fast on a stalled connection or an HTTP error instead of hanging or
# trying to decode an error page as JSON.
response = requests.get(repo_url, timeout=30)
response.raise_for_status()

# Decode the body once and reuse it for both the preview and the later cells.
req_data = response.json()

print(req_data)


[{'Id': 2, 'DirectorateID': 31, 'Name': 'MSP'}, {'Id': 3, 'DirectorateID': 32, 'Name': 'MSP'}, {'Id': 4, 'DirectorateID': 33, 'Name': 'MSP'}, {'Id': 5, 'DirectorateID': 34, 'Name': 'MSP'}, {'Id': 6, 'DirectorateID': 35, 'Name': 'MSP'}, {'Id': 7, 'DirectorateID': 2, 'Name': 'APS'}, {'Id': 8, 'DirectorateID': 52, 'Name': 'Business Information Technology Office (Programmes)'}, {'Id': 9, 'DirectorateID': 65, 'Name': 'Broadcasting'}, {'Id': 10, 'DirectorateID': 52, 'Name': "Digital Services Group (Head's Office)"}, {'Id': 11, 'DirectorateID': 52, 'Name': 'Business Information Technology Office (Admin. Support)'}, {'Id': 12, 'DirectorateID': 52, 'Name': 'Business Information Technology Office (Applications)'}, {'Id': 13, 'DirectorateID': 52, 'Name': 'Business Information Technology Office (Delivery)'}, {'Id': 14, 'DirectorateID': 52, 'Name': 'Business Information Technology Office (Development)'}, {'Id': 15, 'DirectorateID': 52, 'Name': 'Business Information Technology Office (Infrastructure

In [0]:
from pyspark.sql.functions import col
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    LongType,
    MapType,
    StringType,
    StructField,
    StructType,
)

# Schema for a single department record: two integer identifiers and a name,
# all declared nullable.
json_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Id", IntegerType()),
        ("DirectorateID", IntegerType()),
        ("Name", StringType()),
    )
])



In [0]:
# The payload is already a list of records, so it is passed as-is:
# each dict becomes one row shaped by `json_schema`.
df_tr = spark.createDataFrame(data=req_data, schema=json_schema)
df_tr.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- DirectorateID: integer (nullable = true)
 |-- Name: string (nullable = true)



In [0]:
# Preview the departments DataFrame.
df_tr.show()

+---+-------------+--------------------+
| Id|DirectorateID|                Name|
+---+-------------+--------------------+
|  2|           31|                 MSP|
|  3|           32|                 MSP|
|  4|           33|                 MSP|
|  5|           34|                 MSP|
|  6|           35|                 MSP|
|  7|            2|                 APS|
|  8|           52|Business Informat...|
|  9|           65|        Broadcasting|
| 10|           52|Digital Services ...|
| 11|           52|Business Informat...|
| 12|           52|Business Informat...|
| 13|           52|Business Informat...|
| 14|           52|Business Informat...|
| 15|           52|Business Informat...|
| 16|           52|Business Informat...|
| 17|           52|Business Informat...|
| 18|           52|Business Informat...|
| 19|           65|Chamber Office (B...|
| 20|           65|Chamber Office (C...|
| 21|           65|Chamber Office (D...|
+---+-------------+--------------------+
only showing top

In [0]:
import requests

# Endpoint returning a JSON object whose `states` key holds a list of
# {"state": {"state_id": ..., "state_name": ...}} records.
repo_url = "http://vocab.nic.in/rest.php/states/json"

# Fail fast on a stalled connection or an HTTP error instead of hanging or
# trying to decode an error page as JSON.
response = requests.get(repo_url, timeout=30)
response.raise_for_status()

# Decode the body once and reuse it for both the preview and the later cells.
req_data = response.json()

print(req_data)


{'states': [{'state': {'state_id': 'AN', 'state_name': 'Andaman and Nicobar Island (UT)'}}, {'state': {'state_id': 'AP', 'state_name': 'Andhra Pradesh'}}, {'state': {'state_id': 'AR', 'state_name': 'Arunachal Pradesh'}}, {'state': {'state_id': 'AS', 'state_name': 'Assam'}}, {'state': {'state_id': 'BR', 'state_name': 'Bihar'}}, {'state': {'state_id': 'CH', 'state_name': 'Chandigarh (UT)'}}, {'state': {'state_id': 'CG', 'state_name': 'Chhattisgarh'}}, {'state': {'state_id': 'DN', 'state_name': 'Dadra and Nagar Haveli (UT)'}}, {'state': {'state_id': 'DD', 'state_name': 'Daman and Diu (UT)'}}, {'state': {'state_id': 'DL', 'state_name': 'Delhi (NCT)'}}, {'state': {'state_id': 'GA', 'state_name': 'Goa'}}, {'state': {'state_id': 'GJ', 'state_name': 'Gujarat'}}, {'state': {'state_id': 'HR', 'state_name': 'Haryana'}}, {'state': {'state_id': 'HP', 'state_name': 'Himachal Pradesh'}}, {'state': {'state_id': 'JK', 'state_name': 'Jammu and Kashmir (UT)'}}, {'state': {'state_id': 'JH', 'state_name': 

In [0]:
# Build a one-row DataFrame from the raw payload without an explicit schema,
# leaving the types to inference; the nested state records come out as maps.
# NOTE(review): this rebinds `df`, which earlier cells use for the exchange-rate
# frame - re-running those cells after this one would read the wrong data.
df = spark.createDataFrame([req_data])
df.show()

+--------------------+
|              states|
+--------------------+
|[{state -> {state...|
+--------------------+



In [0]:
from pyspark.sql.types import ArrayType, StringType, StructField, StructType

# Innermost record: the identifier and display name of one state, both
# nullable strings.
state_schema = StructType(
    [StructField(leaf, StringType(), True) for leaf in ("state_id", "state_name")]
)

# Top level: a nullable `states` array whose elements are one-field structs
# wrapping the record above under the key `state`.
json_schema = StructType([
    StructField(
        "states",
        ArrayType(StructType([StructField("state", state_schema)])),
        True,
    )
])


In [0]:
# Same payload as the schema-less attempt, now shaped by the explicit nested
# schema so the state records are structs.
states = spark.createDataFrame(data=[req_data], schema=json_schema)
states.printSchema()

states.show()

root
 |-- states: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- state: struct (nullable = true)
 |    |    |    |-- state_id: string (nullable = true)
 |    |    |    |-- state_name: string (nullable = true)

+--------------------+
|              states|
+--------------------+
|[{{AN, Andaman an...|
+--------------------+



In [0]:
from pyspark.sql.functions import col, explode

# One output row per element of the `states` array. Each row is a struct
# whose single field, `state`, holds the actual record.
states_flattened = states.select(
    explode(col("states")).alias("state")
)
states_flattened.show()

# Lift the two leaf fields to top-level columns. The path reads
# <exploded column>.<wrapper field>.<leaf>, hence the repeated "state".
df_cleaned = states_flattened.select([
    col(f"state.state.{leaf}").alias(leaf)
    for leaf in ("state_id", "state_name")
])

df_cleaned.show()



+--------------------+
|               state|
+--------------------+
|{{AN, Andaman and...|
|{{AP, Andhra Prad...|
|{{AR, Arunachal P...|
|       {{AS, Assam}}|
|       {{BR, Bihar}}|
|{{CH, Chandigarh ...|
|{{CG, Chhattisgarh}}|
|{{DN, Dadra and N...|
|{{DD, Daman and D...|
| {{DL, Delhi (NCT)}}|
|         {{GA, Goa}}|
|     {{GJ, Gujarat}}|
|     {{HR, Haryana}}|
|{{HP, Himachal Pr...|
|{{JK, Jammu and K...|
|   {{JH, Jharkhand}}|
|   {{KA, Karnataka}}|
|      {{KL, Kerala}}|
|  {{LK, Ladakh(UT)}}|
|{{LD, Lakshadweep...|
+--------------------+
only showing top 20 rows

+--------+--------------------+
|state_id|          state_name|
+--------+--------------------+
|      AN|Andaman and Nicob...|
|      AP|      Andhra Pradesh|
|      AR|   Arunachal Pradesh|
|      AS|               Assam|
|      BR|               Bihar|
|      CH|     Chandigarh (UT)|
|      CG|        Chhattisgarh|
|      DN|Dadra and Nagar H...|
|      DD|  Daman and Diu (UT)|
|      DL|         Delhi (NCT)|
|     