In [36]:
!pip install pyspark requests google-cloud-storage google-cloud-bigquery




In [None]:
import requests
import json
from google.cloud import storage
from datetime import datetime,timedelta
# --- Extract: fetch yesterday's hourly weather and archive the raw JSON ---
# NOTE: "yesterday" comes from this machine's local clock, while the API is
# asked to report in `timezone`; near midnight the two may disagree.
yesterday = datetime.now() - timedelta(days=1)
yesterday_date = yesterday.strftime('%Y-%m-%d')

# Location and timezone sent to the API.
latitude = 19.07
longitude = 72.88
timezone = "Asia/Kolkata"
api_url = f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}&hourly=temperature_2m,precipitation&start_date={yesterday_date}&end_date={yesterday_date}&timezone={timezone}"

# Without a timeout, requests waits indefinitely if the server never answers.
result = requests.get(api_url, timeout=30)
if result.status_code == 200:
  data = result.json()

  # Local copy, named after the date the data describes (yesterday).
  # The file is closed before any network work starts.
  with open(f"weather_date-{yesterday_date}.json", "w") as f:
    json.dump(data, f, indent=2)

  # Raw copy in GCS. The blob is named after the extraction date (today),
  # not the date the data describes.
  client = storage.Client.from_service_account_json('you-key ')  # Replace with your local key path
  bucket = client.get_bucket('your-bucket')  # Replace with your bucket
  blob_name = f'weather_raw_{datetime.now().strftime("%Y-%m-%d")}.json'
  blob = bucket.blob(blob_name)
  # Label the object as JSON instead of the default text/plain.
  blob.upload_from_string(json.dumps(data), content_type="application/json")
  print(f"Extracted and saved to GCS: {blob_name}")
else:
  print("Failed to fetch data:", result.status_code)






Extracted and saved to GCS: weather_raw_2025-10-25.json


In [37]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from datetime import datetime,timedelta
# --- Transform: reduce yesterday's hourly readings to one daily summary row ---
spark = (
    SparkSession.builder.appName("Weather_ETL")
    .config("spark.driver.memory", "1g")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

# One "yesterday" drives both the input file name and the Date label, so the
# row can never be stamped with a date other than the file it was built from.
run_day = datetime.now() - timedelta(days=1)
yesterday_date = run_day.strftime('%d-%m-%Y')  # format stored in the Date column
input_path = f"weather_date-{run_day.strftime('%Y-%m-%d')}.json"  # weather_date-YYYY-MM-DD.json
city = "Mumbai"

df = spark.read.option("multiline", "True").json(input_path)

# hourly.temperature_2m and hourly.precipitation are parallel arrays:
# zip them element-wise, then explode into one row per element.
df_zip = df.select(F.arrays_zip('hourly.temperature_2m', 'hourly.precipitation').alias('zipped'))
df_explode = df_zip.select(F.explode('zipped').alias("row"))
df_clean = df_explode.select(F.col('row.temperature_2m'), F.col('row.precipitation'))

# Collapse all rows into a single row of aggregates, rounded to 2 decimals.
df_tranform = df_clean.agg(
    F.round(F.max('temperature_2m'), 2).alias('Max_temperature'),
    F.round(F.min('temperature_2m'), 2).alias('Min_temperature'),
    F.round(F.avg('temperature_2m'), 2).alias('Avg_temperature'),
    F.round(F.sum('precipitation'), 2).alias('Total_precipitation'),
    F.round(F.avg('precipitation'), 2).alias('Avg_precipitation')
)

# Add Date/City and append the unit text to every measure (the measures become
# strings), then fix the output column order.
df_final = df_tranform\
         .withColumn('Date', F.lit(yesterday_date))\
         .withColumn('City', F.lit(city))\
         .withColumn('Max_temperature', F.concat(F.col('Max_temperature'), F.lit('°C')))\
         .withColumn('Min_temperature', F.concat(F.col('Min_temperature'), F.lit('°C')))\
         .withColumn('Avg_temperature', F.concat(F.col('Avg_temperature'), F.lit('°C')))\
         .withColumn('Total_precipitation', F.concat(F.col('Total_precipitation'), F.lit('mm')))\
         .withColumn('Avg_precipitation', F.concat(F.col('Avg_precipitation'), F.lit('mm')))\
         .select('Date', 'City', 'Max_temperature', 'Min_temperature', 'Avg_temperature',
                 'Total_precipitation', 'Avg_precipitation')

# coalesce(1) yields a single part-*.csv; 'overwrite' replaces any earlier run.
df_final.coalesce(1).write.mode('overwrite').option('header', True).csv('Transformed_weather_data')
print("Transformed!")

Transformed!


In [35]:
from google.cloud import bigquery
import glob
# --- Load: append the transformed CSV row to a BigQuery table ---
from google.cloud.exceptions import NotFound

project_id = "your-project-id"
dataset_id = "weather_dataset"
table_id = "Mumbai_weather_analysis"
csv_directory = "Transformed_weather_data"
key_path = "your-key"

# Every measure is loaded as STRING; only Date and City are REQUIRED.
Data_schema = [
        bigquery.SchemaField("Date", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("City", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("Max_temperature", "STRING"),
        bigquery.SchemaField("Min_temperature", "STRING"),
        bigquery.SchemaField("Avg_temperature", "STRING"),
        bigquery.SchemaField("Total_precipitation", "STRING"),
        bigquery.SchemaField("Avg_precipitation", "STRING")
]

client = bigquery.Client.from_service_account_json(key_path)

# Qualify the dataset with project_id so it is looked up (and created) in the
# same project that table_ref points at, not the client's default project.
dataset_ref = f"{project_id}.{dataset_id}"
try:
  client.get_dataset(dataset_ref)
except NotFound:
  # Create only when the dataset is really missing; any other failure
  # (credentials, permissions, network) is left to propagate.
  client.create_dataset(dataset_ref)

table_ref = f"{project_id}.{dataset_id}.{table_id}"

# Locate the part file inside the CSV output directory.
csv_files = glob.glob(f"{csv_directory}/part-*.csv")
if not csv_files:
   raise FileNotFoundError(f"No CSV found in {csv_directory}")
csv_file = csv_files[0]

# skip_leading_rows=1 skips the header row; WRITE_APPEND adds to the table,
# so running this cell again appends the same row again.
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    schema=Data_schema,
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND
)
with open(csv_file, 'rb') as f:
   job = client.load_table_from_file(f, table_ref, job_config=job_config)

# Block until the load job finishes; raises if the job failed.
job.result()
print(f"Appended data to {table_ref}")

Appended data to subtle-seer-472708-q3.weather_dataset.Mumbai_weather_analysis
